From 7587a0732c3490401d0b1c7092ed65bab49b80d5 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Thu, 2 Nov 2023 19:56:35 -0500 Subject: [PATCH 01/49] add statistics to PartitionedFile --- .../core/src/datasource/file_format/mod.rs | 1 + .../core/src/datasource/listing/helpers.rs | 1 + datafusion/core/src/datasource/listing/mod.rs | 9 +++++++-- .../core/src/datasource/physical_plan/mod.rs | 1 + .../src/datasource/physical_plan/parquet/mod.rs | 3 +++ datafusion/core/src/test_util/parquet.rs | 1 + datafusion/core/tests/parquet/custom_reader.rs | 1 + datafusion/core/tests/parquet/page_pruning.rs | 1 + datafusion/proto/proto/datafusion.proto | 1 + datafusion/proto/src/generated/pbjson.rs | 17 +++++++++++++++++ datafusion/proto/src/generated/prost.rs | 2 ++ .../proto/src/physical_plan/from_proto.rs | 1 + datafusion/proto/src/physical_plan/to_proto.rs | 1 + .../substrait/src/physical_plan/consumer.rs | 1 + 14 files changed, 39 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index 72dc289d4b64..686e2174898d 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -150,6 +150,7 @@ pub(crate) mod test_util { object_meta: meta, partition_values: vec![], range: None, + statistics: None, extensions: None, }]]; diff --git a/datafusion/core/src/datasource/listing/helpers.rs b/datafusion/core/src/datasource/listing/helpers.rs index c53e8df35de8..0753c2134353 100644 --- a/datafusion/core/src/datasource/listing/helpers.rs +++ b/datafusion/core/src/datasource/listing/helpers.rs @@ -384,6 +384,7 @@ pub async fn pruned_partition_list<'a>( object_meta, partition_values: partition_values.clone(), range: None, + statistics: None, extensions: None, }) })); diff --git a/datafusion/core/src/datasource/listing/mod.rs b/datafusion/core/src/datasource/listing/mod.rs index b8c279c8a7f1..6728dc4079e4 100644 --- a/datafusion/core/src/datasource/listing/mod.rs +++ b/datafusion/core/src/datasource/listing/mod.rs @@ -24,7 +24,7 @@ mod url; use crate::error::Result; use chrono::TimeZone; -use datafusion_common::ScalarValue; +use datafusion_common::{ScalarValue, Statistics}; use futures::Stream; use object_store::{path::Path, ObjectMeta}; use std::pin::Pin; @@ -67,6 +67,8 @@ pub struct PartitionedFile { pub partition_values: Vec<ScalarValue>, /// An optional file range for a more fine-grained parallel execution pub range: Option<FileRange>, + /// Optional statistics that describe the data in this file + pub statistics: Option<Statistics>, /// An optional field for user defined per object metadata pub extensions: Option<Arc<dyn std::any::Any + Send + Sync>>, } @@ -83,6 +85,7 @@ impl PartitionedFile { }, partition_values: vec![], range: None, + statistics: None, extensions: None, } } @@ -98,7 +101,8 @@ impl PartitionedFile { version: None, }, partition_values: vec![], - range: None, + range: Some(FileRange { start, end }), + statistics: None, extensions: None, } .with_range(start, end) @@ -128,6 +132,7 @@ impl From<ObjectMeta> for PartitionedFile { object_meta, partition_values: vec![], range: None, + statistics: None, extensions: None, } } diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index ddb8d032f3d8..4b6c51da4729 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -861,6 +861,7 @@ mod tests { object_meta, partition_values: vec![], range: None, + statistics: None,
extensions: None, } } diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs index 2cfbb578da66..983d0495909f 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs @@ -1533,6 +1533,7 @@ mod tests { object_meta: meta.clone(), partition_values: vec![], range: Some(FileRange { start, end }), + statistics: None, extensions: None, } } @@ -1634,6 +1635,7 @@ mod tests { ), ], range: None, + statistics: None, extensions: None, }; @@ -1728,6 +1730,7 @@ mod tests { }, partition_values: vec![], range: None, + statistics: None, extensions: None, }; diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index 7a466a666d8d..a494efd68ddb 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -151,6 +151,7 @@ impl TestParquetFile { object_meta: self.object_meta.clone(), partition_values: vec![], range: None, + statistics: None, extensions: None, }]], statistics: Statistics::new_unknown(&self.schema), diff --git a/datafusion/core/tests/parquet/custom_reader.rs b/datafusion/core/tests/parquet/custom_reader.rs index 4bacc80579ed..e4f4d229c416 100644 --- a/datafusion/core/tests/parquet/custom_reader.rs +++ b/datafusion/core/tests/parquet/custom_reader.rs @@ -69,6 +69,7 @@ async fn route_data_access_ops_to_parquet_file_reader_factory() { object_meta: meta, partition_values: vec![], range: None, + statistics: None, extensions: Some(Arc::new(String::from(EXPECTED_USER_DEFINED_METADATA))), }) .collect(); diff --git a/datafusion/core/tests/parquet/page_pruning.rs b/datafusion/core/tests/parquet/page_pruning.rs index 3a43428f5bcf..ed83e53484bc 100644 --- a/datafusion/core/tests/parquet/page_pruning.rs +++ b/datafusion/core/tests/parquet/page_pruning.rs @@ -62,6 +62,7 @@ async fn get_parquet_exec(state: &SessionState, filter: Expr) -> ParquetExec { object_meta: meta, partition_values: vec![], range: None, + statistics: None, extensions: None, }; diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index bd4d8b45b152..961bc40fabd3 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -1832,6 +1832,7 @@ message PartitionedFile { uint64 last_modified_ns = 3; repeated ScalarValue partition_values = 4; FileRange range = 5; + Statistics statistics = 6; } message FileRange { diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index aaa0764b1e83..690d43e40832 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -17678,6 +17678,9 @@ impl serde::Serialize for PartitionedFile { if self.range.is_some() { len += 1; } + if self.statistics.is_some() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion.PartitionedFile", len)?; if !self.path.is_empty() { struct_ser.serialize_field("path", &self.path)?; @@ -17696,6 +17699,9 @@ impl serde::Serialize for PartitionedFile { if let Some(v) = self.range.as_ref() { struct_ser.serialize_field("range", v)?; } + if let Some(v) = self.statistics.as_ref() { + struct_ser.serialize_field("statistics", v)?; + } struct_ser.end() } } @@ -17713,6 +17719,7 @@ impl<'de> serde::Deserialize<'de> for PartitionedFile { "partition_values", "partitionValues", "range", + "statistics", ]; #[allow(clippy::enum_variant_names)] @@ -17722,6 +17729,7 @@ 
impl<'de> serde::Deserialize<'de> for PartitionedFile { LastModifiedNs, PartitionValues, Range, + Statistics, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error> @@ -17748,6 +17756,7 @@ impl<'de> serde::Deserialize<'de> for PartitionedFile { "lastModifiedNs" | "last_modified_ns" => Ok(GeneratedField::LastModifiedNs), "partitionValues" | "partition_values" => Ok(GeneratedField::PartitionValues), "range" => Ok(GeneratedField::Range), + "statistics" => Ok(GeneratedField::Statistics), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -17772,6 +17781,7 @@ impl<'de> serde::Deserialize<'de> for PartitionedFile { let mut last_modified_ns__ = None; let mut partition_values__ = None; let mut range__ = None; + let mut statistics__ = None; while let Some(k) = map_.next_key()? { match k { GeneratedField::Path => { @@ -17808,6 +17818,12 @@ impl<'de> serde::Deserialize<'de> for PartitionedFile { } range__ = map_.next_value()?; } + GeneratedField::Statistics => { + if statistics__.is_some() { + return Err(serde::de::Error::duplicate_field("statistics")); + } + statistics__ = map_.next_value()?; + } } } Ok(PartitionedFile { @@ -17816,6 +17832,7 @@ impl<'de> serde::Deserialize<'de> for PartitionedFile { last_modified_ns: last_modified_ns__.unwrap_or_default(), partition_values: partition_values__.unwrap_or_default(), range: range__, + statistics: statistics__, }) } } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 07a0f30a2f68..bfb1ca42607b 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2692,6 +2692,8 @@ pub struct PartitionedFile { pub partition_values: ::prost::alloc::vec::Vec<ScalarValue>, #[prost(message, optional, tag = "5")] pub range: ::core::option::Option<FileRange>, + #[prost(message, optional, tag = "6")] + pub statistics: ::core::option::Option<Statistics>, } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 16f0e94cad83..5a742d643ed8 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -576,6 +576,7 @@ impl TryFrom<&protobuf::PartitionedFile> for PartitionedFile { .map(|v| v.try_into()) .collect::<Result<Vec<_>, _>>()?, range: val.range.as_ref().map(|v| v.try_into()).transpose()?, + statistics: val.statistics.as_ref().map(|v| v.try_into()).transpose()?, extensions: None, }) } diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index bdb6cc668708..1b9f33ccf981 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -606,6 +606,7 @@ impl TryFrom<&PartitionedFile> for protobuf::PartitionedFile { .map(|v| v.try_into()) .collect::<Result<Vec<_>, _>>()?, range: pf.range.as_ref().map(|r| r.try_into()).transpose()?, + statistics: pf.statistics.as_ref().map(|s| s.into()), }) } } diff --git a/datafusion/substrait/src/physical_plan/consumer.rs b/datafusion/substrait/src/physical_plan/consumer.rs index 11ddb91ad391..50b08e7793f0 100644 --- a/datafusion/substrait/src/physical_plan/consumer.rs +++ b/datafusion/substrait/src/physical_plan/consumer.rs @@ -93,6 +93,7 @@ pub async fn from_substrait_rel( }, partition_values: vec![], range: None, + statistics: None, extensions: None, }; From 1e380b206dfb52f93481b3f1dd38a0e9d68fdf82 Mon Sep 17 00:00:00 2001
From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Thu, 2 Nov 2023 23:49:02 -0500 Subject: [PATCH 02/49] just dump work for now --- .../physical_plan/file_scan_config.rs | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 370ca91a0b0e..2cdbff09d7f9 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -194,6 +194,104 @@ impl FileScanConfig { .with_repartition_file_min_size(repartition_file_min_size) .repartition_file_groups(&file_groups) } + + #[allow(unused)] + fn sort_file_groups( + table_schema: &Schema, + file_groups: Vec<Vec<PartitionedFile>>, + target_partitions: usize, + sort_order: LexOrdering, + ) -> Option<Vec<Vec<PartitionedFile>>> { + use arrow::row::{RowConverter, SortField}; + let sort_columns = sort_order + .iter() + .map(|sort_expr| { + sort_expr + .expr + .as_any() + .downcast_ref::<Column>() + }) + .collect::<Option<Vec<_>>>()?; + let flattened_files = file_groups.iter().flatten().collect::<Vec<_>>(); + let statistics_and_partition_values = flattened_files + .iter() + .map(|file| { + (file + .statistics + .as_ref() + .zip(Some(file.partition_values.as_slice()))) + }) + .collect::<Option<Vec<_>>>()?; + + let min_values = + get_statistic(table_schema, &statistics_and_partition_values, |s| { + s.min_value.get_value().unwrap().clone() + }) + .ok()?; + + let max_values = + get_statistic(table_schema, &statistics_and_partition_values, |s| { + s.max_value.get_value().unwrap().clone() + }) + .ok()?; + + let min_values_batch = + RecordBatch::try_new(Arc::new(table_schema.to_owned()), min_values).ok()?; + + use arrow::compute::lexsort_to_indices; + let columns = sort_order + .iter() + .map(|expr| expr.evaluate_to_sort_column(&min_values_batch)) + .collect::<Result<Vec<_>>>() + .ok()?; + let sorted_idx = lexsort_to_indices(&columns, None).ok()?; + + let mut file_groups: Vec<Vec<PartitionedFile>> = vec![]; + + // first fit + + for &idx in sorted_idx.values() { + let file = flattened_files[idx as usize]; + for file_group in &mut file_groups { + match file_group.last() { + Some(file) => { + // if file.statistics.unwrap().col + } + None => { + file_group.push(file.clone()); + break; + } + } + } + + file_groups.push(vec![file.clone()]); + } + + Some(file_groups) + } +} + +fn get_statistic( + table_schema: &Schema, + statistics_and_partition_values: &[(&Statistics, &[ScalarValue])], + f: impl Fn(&ColumnStatistics) -> ScalarValue, +) -> Result<Vec<ArrayRef>> { + table_schema + .all_fields() + .into_iter() + .enumerate() + .map(|(i, _field)| { + ScalarValue::iter_to_array(statistics_and_partition_values.iter().map( + |(s, pv)| { + if i < s.column_statistics.len() { + f(&s.column_statistics[i]) + } else { + pv[i - s.column_statistics.len()].clone() + } + }, + )) + }) + .collect::<Result<Vec<_>>>() } /// A helper that projects partition columns into the file record batches.
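[Editor's note, not part of the patch series: the first-fit loop in the WIP commit above is still stubbed out. A minimal standalone sketch of the bin-packing it is driving at, using hypothetical plain (min, max) tuples in place of the Arrow-encoded statistics the later commits actually compare, might look like this:]

// Illustrative sketch only: pack files, given as (min, max) ranges of their
// sort key, into the fewest groups that stay ordered and non-overlapping.
fn first_fit(mut files: Vec<(i64, i64)>) -> Vec<Vec<(i64, i64)>> {
    // Sorting by min value first is what makes greedy first-fit optimal here.
    files.sort_unstable();
    let mut groups: Vec<Vec<(i64, i64)>> = vec![];
    for (min, max) in files {
        // Place the file in the first group whose last file ends at or before
        // this file's start; if none fits, open a new group.
        match groups
            .iter_mut()
            .find(|g| g.last().map_or(false, |&(_, last_max)| last_max <= min))
        {
            Some(group) => group.push((min, max)),
            None => groups.push(vec![(min, max)]),
        }
    }
    groups
}

fn main() {
    // Two disjoint files plus one spanning both end up in two groups,
    // matching the test case introduced in the next commit.
    let groups = first_fit(vec![(0, 49), (50, 100), (0, 100)]);
    assert_eq!(groups, vec![vec![(0, 49), (50, 100)], vec![(0, 100)]]);
}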
From 263453f4d59b8ac6453ae538e0e6fff55b213c5e Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Fri, 3 Nov 2023 12:58:03 -0500 Subject: [PATCH 03/49] working test case --- .../physical_plan/file_scan_config.rs | 237 ++++++++++++++---- 1 file changed, 192 insertions(+), 45 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 2cdbff09d7f9..e45719f0d8f7 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -26,9 +26,12 @@ use super::{get_projected_output_ordering, FileGroupPartitioner}; use crate::datasource::{listing::PartitionedFile, object_store::ObjectStoreUrl}; use crate::{error::Result, scalar::ScalarValue}; -use arrow::array::{ArrayData, BufferBuilder}; use arrow::buffer::Buffer; use arrow::datatypes::{ArrowNativeType, UInt16Type}; +use arrow::{ + array::{ArrayData, BufferBuilder}, + compute::concat_batches, +}; use arrow_array::{ArrayRef, DictionaryArray, RecordBatch, RecordBatchOptions}; use arrow_schema::{DataType, Field, Schema, SchemaRef}; use datafusion_common::stats::Precision; @@ -199,7 +202,6 @@ impl FileScanConfig { fn sort_file_groups( table_schema: &Schema, file_groups: Vec<Vec<PartitionedFile>>, - target_partitions: usize, sort_order: LexOrdering, ) -> Option<Vec<Vec<PartitionedFile>>> { use arrow::row::{RowConverter, SortField}; @@ -235,65 +237,80 @@ impl FileScanConfig { }) .ok()?; - let min_values_batch = - RecordBatch::try_new(Arc::new(table_schema.to_owned()), min_values).ok()?; + // first fit + use arrow::row::*; - use arrow::compute::lexsort_to_indices; - let columns = sort_order - .iter() - .map(|expr| expr.evaluate_to_sort_column(&min_values_batch)) - .collect::<Result<Vec<_>>>() - .ok()?; - let sorted_idx = lexsort_to_indices(&columns, None).ok()?; + let min_and_max_values = + concat_batches(&min_values.schema(), &[min_values, max_values]).ok()?; + let rows = { + let sorting_columns = sort_order + .iter() + .map(|expr| expr.evaluate_to_sort_column(&min_and_max_values)) + .collect::<Result<Vec<_>>>() + .ok()?; + let sort_fields = sorting_columns + .iter() + .map(|c| match c.options { + Some(opts) => { + SortField::new_with_options(c.values.data_type().clone(), opts) + } + None => SortField::new(c.values.data_type().clone()), + }) + .collect_vec(); + + let converter = RowConverter::new(sort_fields).ok()?; + converter + .convert_columns( + &sorting_columns.into_iter().map(|c| c.values).collect_vec(), + ) + .ok()?
+ }; - let mut file_groups: Vec<Vec<PartitionedFile>> = vec![]; + let indices_sorted_by_min = { + let mut sort: Vec<_> = + rows.iter().take(rows.num_rows() / 2).enumerate().collect(); + sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); + sort.iter().map(|(i, _)| *i).collect_vec() + }; - // first fit + let mut file_groups_indices: Vec<Vec<usize>> = vec![]; - for &idx in sorted_idx.values() { - let file = flattened_files[idx as usize]; - for file_group in &mut file_groups { + 'outer: for idx in indices_sorted_by_min { + let min = rows.row(idx); + for file_group in &mut file_groups_indices { match file_group.last() { - Some(file) => { - // if file.statistics.unwrap().col + Some(&idx_other) => { + let other_max = rows.row(flattened_files.len() + idx_other); + if min >= other_max { + file_group.push(idx); + continue 'outer; + } } None => { - file_group.push(file.clone()); - break; + file_group.push(idx); + continue 'outer; } + _ => {} } } - file_groups.push(vec![file.clone()]); + file_groups_indices.push(vec![idx]); } - Some(file_groups) + Some( + file_groups_indices + .into_iter() + .map(|file_group_indices| { + file_group_indices + .into_iter() + .map(|idx| flattened_files[idx].clone()) + .collect_vec() + }) + .collect_vec(), + ) } } -fn get_statistic( - table_schema: &Schema, - statistics_and_partition_values: &[(&Statistics, &[ScalarValue])], - f: impl Fn(&ColumnStatistics) -> ScalarValue, -) -> Result<Vec<ArrayRef>> { - table_schema - .all_fields() - .into_iter() - .enumerate() - .map(|(i, _field)| { - ScalarValue::iter_to_array(statistics_and_partition_values.iter().map( - |(s, pv)| { - if i < s.column_statistics.len() { - f(&s.column_statistics[i]) - } else { - pv[i - s.column_statistics.len()].clone() - } - }, - )) - }) - .collect::<Result<Vec<_>>>() -} - /// A helper that projects partition columns into the file record batches.
/// /// One interesting trick is the usage of a cache for the key buffers of the partition column @@ -534,9 +551,38 @@ fn create_output_array( val.to_array_of_size(len) } +fn get_statistic( + table_schema: &Schema, + statistics_and_partition_values: &[(&Statistics, &[ScalarValue])], + f: impl Fn(&ColumnStatistics) -> ScalarValue, +) -> Result<RecordBatch> { + RecordBatch::try_new( + Arc::new(table_schema.to_owned()), + table_schema + .all_fields() + .into_iter() + .enumerate() + .map(|(i, _field)| { + ScalarValue::iter_to_array(statistics_and_partition_values.iter().map( + |(s, pv)| { + if i < s.column_statistics.len() { + f(&s.column_statistics[i]) + } else { + pv[i - s.column_statistics.len()].clone() + } + }, + )) + }) + .collect::<Result<Vec<_>>>()?, + ) + .map_err(From::from) +} + #[cfg(test)] mod tests { use arrow_array::Int32Array; + use arrow_schema::SortOptions; + use datafusion_physical_expr::PhysicalSortExpr; use super::*; use crate::{test::columns, test_util::aggr_test_schema}; @@ -858,6 +904,107 @@ mod tests { // Assert projected file schema is equal to file schema assert_eq!(projection.fields(), schema.fields()); + fn test_sort_file_groups() { + use chrono::TimeZone; + use object_store::{path::Path, ObjectMeta}; + let file_groups = vec![ + vec![PartitionedFile { + object_meta: ObjectMeta { + location: Path::from("file1"), + last_modified: chrono::Utc.timestamp_nanos(0), + size: 0, + e_tag: None, + version: None, + }, + + partition_values: vec![ScalarValue::from("2023-01-01")], + range: None, + statistics: Some(Statistics { + num_rows: Precision::Absent, + total_byte_size: Precision::Absent, + column_statistics: vec![ColumnStatistics { + min_value: Precision::Exact(ScalarValue::from(0.0)), + max_value: Precision::Exact(ScalarValue::from(0.49)), + ..Default::default() + }], + }), + extensions: None, + }], + vec![PartitionedFile { + object_meta: ObjectMeta { + location: Path::from("file2"), + last_modified: chrono::Utc.timestamp_nanos(0), + size: 0, + e_tag: None, + version: None, + }, + partition_values: vec![ScalarValue::from("2023-01-01")], + range: None, + statistics: Some(Statistics { + num_rows: Precision::Absent, + total_byte_size: Precision::Absent, + column_statistics: vec![ColumnStatistics { + min_value: Precision::Exact(ScalarValue::from(0.5)), + max_value: Precision::Exact(ScalarValue::from(1.0)), + ..Default::default() + }], + }), + extensions: None, + }], + vec![PartitionedFile { + object_meta: ObjectMeta { + location: Path::from("file3"), + last_modified: chrono::Utc.timestamp_nanos(0), + size: 0, + e_tag: None, + version: None + }, + partition_values: vec![ScalarValue::from("2023-01-02")], + range: None, + statistics: Some(Statistics { + num_rows: Precision::Absent, + total_byte_size: Precision::Absent, + column_statistics: vec![ColumnStatistics { + min_value: Precision::Exact(ScalarValue::from(0.0)), + max_value: Precision::Exact(ScalarValue::from(1.0)), + ..Default::default() + }], + }), + extensions: None, + }], + ]; + + let table_schema = Schema::new(vec![ + Field::new("value".to_string(), DataType::Float64, false), + Field::new("date".to_string(), DataType::Utf8, false), + ]); + + use datafusion_physical_expr::expressions::Column; + let sort_order_by_value = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("value", 0)), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }]; + + let results = FileScanConfig::sort_file_groups( + &table_schema, + file_groups.clone(), + sort_order_by_value, + ) + .unwrap(); + + let paths = results + .iter() + .map(|file_group|
{ + file_group + .iter() + .map(|file| file.object_meta.location.as_ref()) + .collect_vec() + }) + .collect_vec(); + assert_eq!(paths, vec![vec!["file1", "file2"], vec!["file3"]]); } // sets default for configs that play no role in projections From 5634bd77bf4e7ca711a930a7b6bea80b13bff5cd Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Tue, 12 Mar 2024 22:54:35 -0500 Subject: [PATCH 04/49] fix jumbled rebase --- .../core/src/datasource/physical_plan/file_scan_config.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index e45719f0d8f7..cf6e2388ac10 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -38,6 +38,7 @@ use datafusion_common::stats::Precision; use datafusion_common::{exec_err, ColumnStatistics, Statistics}; use datafusion_physical_expr::LexOrdering; +use itertools::Itertools; use log::warn; /// Convert type to a type suitable for use as a [`ListingTable`] @@ -904,6 +905,8 @@ mod tests { // Assert projected file schema is equal to file schema assert_eq!(projection.fields(), schema.fields()); + } + fn test_sort_file_groups() { use chrono::TimeZone; use object_store::{path::Path, ObjectMeta}; @@ -957,7 +960,7 @@ mod tests { last_modified: chrono::Utc.timestamp_nanos(0), size: 0, e_tag: None, - version: None + version: None, }, partition_values: vec![ScalarValue::from("2023-01-02")], range: None, From 7428fe0a7540a215424663e4c0c0e0b4563890e8 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Tue, 12 Mar 2024 23:08:49 -0500 Subject: [PATCH 05/49] forgot to annotate #[test] --- datafusion/core/src/datasource/physical_plan/file_scan_config.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index cf6e2388ac10..79bea164a53b 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -907,6 +907,7 @@ mod tests { assert_eq!(projection.fields(), schema.fields()); } + #[test] fn test_sort_file_groups() { use chrono::TimeZone; use object_store::{path::Path, ObjectMeta}; From 481634390936d6333a1d53c738e1afd5f20009e4 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Wed, 13 Mar 2024 01:20:00 -0500 Subject: [PATCH 06/49] more refactoring --- .../physical_plan/file_scan_config.rs | 209 ++++++++++-------- 1 file changed, 117 insertions(+), 92 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 79bea164a53b..39450133e645 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -26,17 +26,14 @@ use super::{get_projected_output_ordering, FileGroupPartitioner}; use crate::datasource::{listing::PartitionedFile, object_store::ObjectStoreUrl}; use crate::{error::Result, scalar::ScalarValue}; +use arrow::array::{ArrayData, BufferBuilder}; use arrow::buffer::Buffer; use arrow::datatypes::{ArrowNativeType, UInt16Type}; -use arrow::{ - array::{ArrayData, BufferBuilder}, - compute::concat_batches, -}; use 
arrow_array::{ArrayRef, DictionaryArray, RecordBatch, RecordBatchOptions}; use arrow_schema::{DataType, Field, Schema, SchemaRef}; use datafusion_common::stats::Precision; use datafusion_common::{exec_err, ColumnStatistics, Statistics}; -use datafusion_physical_expr::LexOrdering; +use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr}; use itertools::Itertools; use log::warn; @@ -198,20 +198,10 @@ impl FileScanConfig { #[allow(unused)] fn sort_file_groups( - table_schema: &Schema, + table_schema: &SchemaRef, file_groups: Vec<Vec<PartitionedFile>>, sort_order: LexOrdering, ) -> Option<Vec<Vec<PartitionedFile>>> { - use arrow::row::{RowConverter, SortField}; - let sort_columns = sort_order - .iter() - .map(|sort_expr| { - sort_expr - .expr - .as_any() - .downcast_ref::<Column>() - }) - .collect::<Option<Vec<_>>>()?; let flattened_files = file_groups.iter().flatten().collect::<Vec<_>>(); let statistics_and_partition_values = flattened_files .iter() .map(|file| { (file .statistics .as_ref() .zip(Some(file.partition_values.as_slice()))) }) .collect::<Option<Vec<_>>>()?; - let min_values = - get_statistic(table_schema, &statistics_and_partition_values, |s| { - s.min_value.get_value().unwrap().clone() - }) - .ok()?; - - let max_values = - get_statistic(table_schema, &statistics_and_partition_values, |s| { - s.max_value.get_value().unwrap().clone() - }) - .ok()?; + let min_values = project_statistic( + table_schema, + &statistics_and_partition_values, + MinOrMax::Min, + ) + .ok()?; + let max_values = project_statistic( + table_schema, + &statistics_and_partition_values, + MinOrMax::Max, + ) + .ok()?; + + // First Fit: + // * Choose the first file group that a file can be placed into. + // * If it fits into no existing file groups, create a new one. + // + // By sorting files by min values and then applying first-fit bin packing, + // we can produce the smallest number of file groups such that + // files within a group are in order and non-overlapping. + // Source: Applied Combinatorics (Keller and Trotter), Chapter 6.8 + + let statistics = + MinMaxStatistics::new(&sort_order, table_schema, min_values, max_values) .ok()?; let indices_sorted_by_min = { - let mut sort: Vec<_> = - rows.iter().take(rows.num_rows() / 2).enumerate().collect(); + let mut sort: Vec<_> = statistics.min.iter().enumerate().collect(); sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); - sort.iter().map(|(i, _)| *i).collect_vec() + sort }; let mut file_groups_indices: Vec<Vec<usize>> = vec![]; - 'outer: for idx in indices_sorted_by_min { - let min = rows.row(idx); + 'outer: for (idx, min) in indices_sorted_by_min { for file_group in &mut file_groups_indices { match file_group.last() { - Some(&idx_other) => { - let other_max = rows.row(flattened_files.len() + idx_other); - if min >= other_max { + Some(&other) => { + // If our file is non-overlapping and comes _after_ the last file, + // it fits in this file group.
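// (both `min` and the stored max are arrow row-format rows, so this single comparison covers the entire multi-column sort key)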
+ if min >= statistics.max.row(other) { file_group.push(idx); continue 'outer; } } None => { file_group.push(idx); continue 'outer; } _ => {} } } file_groups_indices.push(vec![idx]); } + // Assemble indices back into groups of PartitionedFiles Some( file_groups_indices .into_iter() .map(|file_group_indices| { file_group_indices .into_iter() .map(|idx| flattened_files[idx].clone()) .collect_vec() }) .collect_vec(), ) } } +enum MinOrMax { + Min, + Max, +} + +// A helper to read statistics from PartitionedFiles into a RecordBatch for further processing. +fn project_statistic( + table_schema: &Schema, + statistics_and_partition_values: &[(&Statistics, &[ScalarValue])], + statistic: MinOrMax, +) -> Result<RecordBatch> { + RecordBatch::try_new( + Arc::new(table_schema.to_owned()), + table_schema + .all_fields() + .into_iter() + .enumerate() + .map(|(i, _field)| { + ScalarValue::iter_to_array(statistics_and_partition_values.iter().map( + |(s, pv)| { + if i < s.column_statistics.len() { + match statistic { + MinOrMax::Min => &s.column_statistics[i].min_value, + MinOrMax::Max => &s.column_statistics[i].max_value, + } + .get_value() + .cloned() + .unwrap_or(ScalarValue::Null) + } else { + pv[i - s.column_statistics.len()].clone() + } + }, + )) + }) + .collect::<Result<Vec<_>>>()?, + ) + .map_err(From::from) +} + +// A normalized representation of file min/max statistics that allows for efficient sorting & comparison. +struct MinMaxStatistics { + min: arrow::row::Rows, + max: arrow::row::Rows, +} + +impl MinMaxStatistics { + fn new( + sort_order: &[PhysicalSortExpr], + schema: &SchemaRef, + min_values: RecordBatch, + max_values: RecordBatch, + ) -> Result<Self> { + use arrow::row::*; + + let sort_fields = sort_order + .iter() + .map(|expr| { + expr.expr + .data_type(schema) + .map(|data_type| SortField::new_with_options(data_type, expr.options)) + }) + .collect::<Result<Vec<_>>>()?; + let converter = RowConverter::new(sort_fields)?; + + let [min, max] = [min_values, max_values].map(|values| { + let sorting_columns = sort_order + .iter() + .map(|expr| expr.evaluate_to_sort_column(&values)) + .collect::<Result<Vec<_>>>()?; + converter.convert_columns( + &sorting_columns.into_iter().map(|c| c.values).collect_vec(), + ) + }); + + Ok(Self { + min: min?, + max: max?, + }) + } +} + /// A helper that projects partition columns into the file record batches.
/// /// One interesting trick is the usage of a cache for the key buffers of the partition column @@ -552,33 +604,6 @@ fn create_output_array( val.to_array_of_size(len) } -fn get_statistic( - table_schema: &Schema, - statistics_and_partition_values: &[(&Statistics, &[ScalarValue])], - f: impl Fn(&ColumnStatistics) -> ScalarValue, -) -> Result<RecordBatch> { - RecordBatch::try_new( - Arc::new(table_schema.to_owned()), - table_schema - .all_fields() - .into_iter() - .enumerate() - .map(|(i, _field)| { - ScalarValue::iter_to_array(statistics_and_partition_values.iter().map( - |(s, pv)| { - if i < s.column_statistics.len() { - f(&s.column_statistics[i]) - } else { - pv[i - s.column_statistics.len()].clone() - } - }, - )) - }) - .collect::<Result<Vec<_>>>()?, - ) - .map_err(From::from) -} - #[cfg(test)] mod tests { use arrow_array::Int32Array; @@ -978,10 +1003,10 @@ mod tests { }], ]; - let table_schema = Schema::new(vec![ + let table_schema = Arc::new(Schema::new(vec![ Field::new("value".to_string(), DataType::Float64, false), Field::new("date".to_string(), DataType::Utf8, false), - ]); + ])); use datafusion_physical_expr::expressions::Column; let sort_order_by_value = vec![PhysicalSortExpr { From c7be9e0348108fc5e5dbcd37e3923462e72e8fb5 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Wed, 13 Mar 2024 01:20:51 -0500 Subject: [PATCH 07/49] add a link --- datafusion/core/src/datasource/physical_plan/file_scan_config.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 39450133e645..e0edeccb9adc 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -234,6 +234,7 @@ impl FileScanConfig { // we can produce the smallest number of file groups such that // files within a group are in order and non-overlapping.
// Source: Applied Combinatorics (Keller and Trotter), Chapter 6.8 + // https://www.appliedcombinatorics.org/book/s_posets_dilworth-intord.html From fc1a66890240ae4f937c6503a1ce8074ed5c01ef Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Wed, 13 Mar 2024 02:59:28 -0500 Subject: [PATCH 08/49] refactor again --- .../physical_plan/file_scan_config.rs | 178 +++++------------- .../core/src/datasource/physical_plan/mod.rs | 147 ++++++++++++++- 2 files changed, 185 insertions(+), 140 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index e0edeccb9adc..f96456325f7b 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -22,7 +22,7 @@ use std::{ borrow::Cow, collections::HashMap, fmt::Debug, marker::PhantomData, sync::Arc, vec, }; -use super::{get_projected_output_ordering, FileGroupPartitioner}; +use super::{get_projected_output_ordering, FileGroupPartitioner, MinMaxStatistics}; use crate::datasource::{listing::PartitionedFile, object_store::ObjectStoreUrl}; use crate::{error::Result, scalar::ScalarValue}; @@ -139,12 +139,21 @@ impl FileScanConfig { column_statistics: table_cols_stats, }; - let table_schema = Arc::new( + let projected_schema = Arc::new( Schema::new(table_fields).with_metadata(self.file_schema.metadata().clone()), ); + + let full_table_schema = { + let mut all_table_fields: Vec<_> = + self.file_schema.all_fields().into_iter().cloned().collect(); + all_table_fields.extend(self.table_partition_cols.clone()); + Arc::new(Schema::new(all_table_fields)) + }; + let projected_output_ordering = - get_projected_output_ordering(self, &table_schema); - (table_schema, table_stats, projected_output_ordering) + get_projected_output_ordering(self, &projected_schema, &full_table_schema); + + (projected_schema, table_stats, projected_output_ordering) } #[allow(unused)] // Only used by avro @@ -196,36 +205,14 @@ impl FileScanConfig { .repartition_file_groups(&file_groups) } - #[allow(unused)] - fn sort_file_groups( + // TODO: documentation + pub fn sort_file_groups( table_schema: &SchemaRef, - file_groups: Vec<Vec<PartitionedFile>>, - sort_order: LexOrdering, + projected_schema: &SchemaRef, + file_groups: &[Vec<PartitionedFile>], + sort_order: &[PhysicalSortExpr], ) -> Option<Vec<Vec<PartitionedFile>>> { let flattened_files = file_groups.iter().flatten().collect::<Vec<_>>(); - // First Fit: // * Choose the first file group that a file can be placed into. // * If it fits into no existing file groups, create a new one.
@@ -236,9 +223,13 @@ impl FileScanConfig { // Source: Applied Combinatorics (Keller and Trotter), Chapter 6.8 // https://www.appliedcombinatorics.org/book/s_posets_dilworth-intord.html - let statistics = - MinMaxStatistics::new(&sort_order, table_schema, min_values, max_values) - .ok()?; + let statistics = MinMaxStatistics::new_from_files( + sort_order, + table_schema, + projected_schema, + flattened_files.iter().copied(), + ) + .ok()?; let indices_sorted_by_min = { let mut sort: Vec<_> = statistics.min.iter().enumerate().collect(); @@ -248,28 +239,23 @@ impl FileScanConfig { let mut file_groups_indices: Vec> = vec![]; - 'outer: for (idx, min) in indices_sorted_by_min { - for file_group in &mut file_groups_indices { - match file_group.last() { - Some(&other) => { - // If our file is non-overlapping and comes _after_ the last file, - // it fits in this file group. - if min >= statistics.max.row(other) { - file_group.push(idx); - continue 'outer; - } - } - None => { - file_group.push(idx); - continue 'outer; - } - _ => {} - } + for (idx, min) in indices_sorted_by_min { + let file_group_to_insert = file_groups_indices.iter_mut().find(|group| { + // If our file is non-overlapping and comes _after_ the last file, + // it fits in this file group. + min > statistics.max.row( + *group + .last() + .expect("groups should be nonempty at construction"), + ) + }); + match file_group_to_insert { + Some(group) => group.push(idx), + None => file_groups_indices.push(vec![idx]), } - - file_groups_indices.push(vec![idx]); } + // Assemble indices back into groups of PartitionedFiles Some( file_groups_indices .into_iter() @@ -284,87 +270,6 @@ impl FileScanConfig { } } -enum MinOrMax { - Min, - Max, -} - -// A helper to read statistics from PartitionedFiles into a RecordBatch for further processing. -fn project_statistic( - table_schema: &Schema, - statistics_and_partition_values: &[(&Statistics, &[ScalarValue])], - statistic: MinOrMax, -) -> Result { - RecordBatch::try_new( - Arc::new(table_schema.to_owned()), - table_schema - .all_fields() - .into_iter() - .enumerate() - .map(|(i, _field)| { - ScalarValue::iter_to_array(statistics_and_partition_values.iter().map( - |(s, pv)| { - if i < s.column_statistics.len() { - match statistic { - MinOrMax::Min => &s.column_statistics[i].min_value, - MinOrMax::Max => &s.column_statistics[i].max_value, - } - .get_value() - .cloned() - .unwrap_or(ScalarValue::Null) - } else { - pv[i - s.column_statistics.len()].clone() - } - }, - )) - }) - .collect::>>()?, - ) - .map_err(From::from) -} - -// A normalized representation of file min/max statistics that allows for efficient sorting & comparison. 
-struct MinMaxStatistics { - min: arrow::row::Rows, - max: arrow::row::Rows, -} - -impl MinMaxStatistics { - fn new( - sort_order: &[PhysicalSortExpr], - schema: &SchemaRef, - min_values: RecordBatch, - max_values: RecordBatch, - ) -> Result<Self> { - use arrow::row::*; - - let sort_fields = sort_order - .iter() - .map(|expr| { - expr.expr - .data_type(schema) - .map(|data_type| SortField::new_with_options(data_type, expr.options)) - }) - .collect::<Result<Vec<_>>>()?; - let converter = RowConverter::new(sort_fields)?; - - let [min, max] = [min_values, max_values].map(|values| { - let sorting_columns = sort_order - .iter() - .map(|expr| expr.evaluate_to_sort_column(&values)) - .collect::<Result<Vec<_>>>()?; - converter.convert_columns( - &sorting_columns.into_iter().map(|c| c.values).collect_vec(), - ) - }); - - Ok(Self { - min: min?, - max: max?, - }) - } -} - /// A helper that projects partition columns into the file record batches. /// /// One interesting trick is the usage of a cache for the key buffers of the partition column @@ -1020,8 +925,9 @@ mod tests { let results = FileScanConfig::sort_file_groups( &table_schema, - file_groups.clone(), - sort_order_by_value, + &table_schema, + &file_groups, + &sort_order_by_value, ) .unwrap(); diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index 4b6c51da4729..9c646bd00994 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -66,7 +66,7 @@ use arrow::{ datatypes::{DataType, Schema, SchemaRef}, record_batch::{RecordBatch, RecordBatchOptions}, }; -use datafusion_common::plan_err; +use datafusion_common::{plan_err, DataFusionError}; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::PhysicalSortExpr; @@ -448,14 +448,36 @@ impl From for FileMeta { fn get_projected_output_ordering( base_config: &FileScanConfig, projected_schema: &SchemaRef, + table_schema: &SchemaRef, ) -> Vec<Vec<PhysicalSortExpr>> { let mut all_orderings = vec![]; for output_ordering in &base_config.output_ordering { - if base_config.file_groups.iter().any(|group| group.len() > 1) { - debug!("Skipping specified output ordering {:?}. Some file group had more than one file: {:?}", - base_config.output_ordering[0], base_config.file_groups); + // Check if all file groups are sorted + if base_config.file_groups.iter().all(|group| { + if group.len() <= 1 { + return true; + } + + let statistics = match MinMaxStatistics::new_from_files( + output_ordering, + table_schema, + projected_schema, + group, + ) { + Ok(statistics) => statistics, + Err(e) => { + log::debug!("Error fetching statistics for file group: {e}"); + return false; + } + }; + + statistics.is_sorted() + }) { + debug!("Skipping specified output ordering {:?}. Some file group couldn't be determined to be sorted: {:?}", + base_config.output_ordering[0], base_config.file_groups); return vec![]; } + let mut new_ordering = vec![]; for PhysicalSortExpr { expr, options } in output_ordering { if let Some(col) = expr.as_any().downcast_ref::<Column>() { @@ -482,6 +504,123 @@ fn get_projected_output_ordering( all_orderings } +// A normalized representation of file min/max statistics that allows for efficient sorting & comparison.
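+// The arrow row format encodes the sort columns so that byte-wise row comparison agrees with comparing the underlying column values, which keeps these min/max checks cheap.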
+pub(crate) struct MinMaxStatistics { + min: arrow::row::Rows, + max: arrow::row::Rows, +} + +impl MinMaxStatistics { + fn new_from_files<'a>( + sort_order: &[PhysicalSortExpr], + table_schema: &SchemaRef, + projected_schema: &SchemaRef, + files: impl IntoIterator<Item = &'a PartitionedFile>, + ) -> Result<Self> { + use datafusion_common::ScalarValue; + + let statistics_and_partition_values = files + .into_iter() + .map(|file| { + file.statistics + .as_ref() + .zip(Some(file.partition_values.as_slice())) + }) + .collect::<Option<Vec<_>>>() + .ok_or_else(|| { + DataFusionError::Plan("Parquet file missing statistics".to_string()) + })?; + + let get_min_max = |i: usize| -> (Vec<ScalarValue>, Vec<ScalarValue>) { + statistics_and_partition_values + .iter() + .map(|(s, pv)| { + if i < s.column_statistics.len() { + ( + s.column_statistics[i] + .min_value + .get_value() + .cloned() + .unwrap_or(ScalarValue::Null), + s.column_statistics[i] + .max_value + .get_value() + .cloned() + .unwrap_or(ScalarValue::Null), + ) + } else { + let partition_value = &pv[i - s.column_statistics.len()]; + (partition_value.clone(), partition_value.clone()) + } + }) + .unzip() + }; + + let (min_values, max_values): (Vec<_>, Vec<_>) = projected_schema + .fields() + .iter() + .map(|field| { + let (min, max) = get_min_max(table_schema.index_of(field.name())?); + Ok(( + (field.name(), ScalarValue::iter_to_array(min)?), + (field.name(), ScalarValue::iter_to_array(max)?), + )) + }) + .collect::<Result<Vec<_>>>()? + .into_iter() + .unzip(); + + Self::new( + sort_order, + RecordBatch::try_from_iter(min_values)?, + RecordBatch::try_from_iter(max_values)?, + ) + } + + fn new( + sort_order: &[PhysicalSortExpr], + min_values: RecordBatch, + max_values: RecordBatch, + ) -> Result<Self> { + use arrow::row::*; + + let sort_fields = sort_order + .iter() + .map(|expr| { + expr.expr + .data_type(&min_values.schema()) + .map(|data_type| SortField::new_with_options(data_type, expr.options)) + }) + .collect::<Result<Vec<_>>>()?; + let converter = RowConverter::new(sort_fields)?; + + let [min, max] = [min_values, max_values].map(|values| { + let sorting_columns = sort_order + .iter() + .map(|expr| expr.evaluate_to_sort_column(&values)) + .collect::<Result<Vec<_>>>()?; + converter.convert_columns( + &sorting_columns + .into_iter() + .map(|c| c.values) + .collect::<Vec<_>>(), + ) + }); + + Ok(Self { + min: min?, + max: max?, + }) + } + + fn is_sorted(&self) -> bool { + self.max + .iter() + .zip(self.min.iter().skip(1)) + .all(|(max, next_min)| max < next_min) + } +} + /// Represents the possible outcomes of a range calculation. /// /// This enum is used to encapsulate the result of calculating the range of From 1c42e0006d186e6abe7e6c270baf2b54ede91e38 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Wed, 13 Mar 2024 03:00:47 -0500 Subject: [PATCH 09/49] whitespace --- datafusion/core/src/datasource/physical_plan/file_scan_config.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index f96456325f7b..b3fff0d609c0 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -220,6 +220,7 @@ impl FileScanConfig { // By sorting files by min values and then applying first-fit bin packing, // we can produce the smallest number of file groups such that // files within a group are in order and non-overlapping.
+ // // Source: Applied Combinatorics (Keller and Trotter), Chapter 6.8 // https://www.appliedcombinatorics.org/book/s_posets_dilworth-intord.html From 3446fed671a0bdfe4298612fb053c517758f7959 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Wed, 13 Mar 2024 03:01:40 -0500 Subject: [PATCH 10/49] format debug log --- datafusion/core/src/datasource/physical_plan/mod.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index 9c646bd00994..baa74bb0dd9b 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -473,8 +473,11 @@ fn get_projected_output_ordering( statistics.is_sorted() }) { - debug!("Skipping specified output ordering {:?}. Some file group couldn't be determined to be sorted: {:?}", - base_config.output_ordering[0], base_config.file_groups); + debug!( + "Skipping specified output ordering {:?}. \ + Some file group couldn't be determined to be sorted: {:?}", + base_config.output_ordering[0], base_config.file_groups + ); return vec![]; } From 3fe85581c1fb1bb731dd87851780fa41420a0a03 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Wed, 13 Mar 2024 03:08:38 -0500 Subject: [PATCH 11/49] remove useless itertools --- .../core/src/datasource/physical_plan/file_scan_config.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index b3fff0d609c0..ba41b4431fbb 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -35,7 +35,6 @@ use datafusion_common::stats::Precision; use datafusion_common::{exec_err, ColumnStatistics, Statistics}; use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr}; -use itertools::Itertools; use log::warn; /// Convert type to a type suitable for use as a [`ListingTable`] @@ -264,9 +263,9 @@ impl FileScanConfig { file_group_indices .into_iter() .map(|idx| flattened_files[idx].clone()) - .collect_vec() + .collect() }) - .collect_vec(), + .collect(), ) } } From 8ba40018b174f486b651e129a50888ca17e65938 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Wed, 13 Mar 2024 10:59:45 -0500 Subject: [PATCH 12/49] refactor test --- .../physical_plan/file_scan_config.rs | 228 +++++++++--------- 1 file changed, 120 insertions(+), 108 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index ba41b4431fbb..3b7571dcac40 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -204,13 +204,15 @@ impl FileScanConfig { .repartition_file_groups(&file_groups) } - // TODO: documentation + /// Attempts to do a bin-packing on files into file groups, such that any two files + /// in a file group are ordered and non-overlapping with respect to their statistics. + /// It will produce the smallest number of file groups possible. 
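+    /// Returns an error if statistics are unavailable for any of the files.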
pub fn sort_file_groups( table_schema: &SchemaRef, projected_schema: &SchemaRef, file_groups: &[Vec<PartitionedFile>], sort_order: &[PhysicalSortExpr], - ) -> Option<Vec<Vec<PartitionedFile>>> { + ) -> Result<Vec<Vec<PartitionedFile>>> { let flattened_files = file_groups.iter().flatten().collect::<Vec<_>>(); // First Fit: // * Choose the first file group that a file can be placed into. // * If it fits into no existing file groups, create a new one. @@ -228,8 +230,7 @@ impl FileScanConfig { table_schema, projected_schema, flattened_files.iter().copied(), - )?; + )?; let indices_sorted_by_min = { let mut sort: Vec<_> = statistics.min.iter().enumerate().collect(); @@ -256,17 +257,15 @@ impl FileScanConfig { } // Assemble indices back into groups of PartitionedFiles - Some( - file_groups_indices - .into_iter() - .map(|file_group_indices| { - file_group_indices - .into_iter() - .map(|idx| flattened_files[idx].clone()) - .collect() - }) - .collect(), - ) + Ok(file_groups_indices + .into_iter() + .map(|file_group_indices| { + file_group_indices + .into_iter() + .map(|idx| flattened_files[idx].clone()) + .collect() + }) + .collect()) } } @@ -513,8 +512,6 @@ fn create_output_array( #[cfg(test)] mod tests { use arrow_array::Int32Array; - use arrow_schema::SortOptions; - use datafusion_physical_expr::PhysicalSortExpr; use super::*; use crate::{test::columns, test_util::aggr_test_schema}; @@ -839,108 +836,123 @@ } #[test] - fn test_sort_file_groups() { + fn test_sort_file_groups() -> Result<()> { use chrono::TimeZone; + use datafusion_common::DFSchema; + use datafusion_expr::execution_props::ExecutionProps; use object_store::{path::Path, ObjectMeta}; - let file_groups = vec![ - vec![PartitionedFile { - object_meta: ObjectMeta { - location: Path::from("file1"), - last_modified: chrono::Utc.timestamp_nanos(0), - size: 0, - e_tag: None, - version: None, - }, - - partition_values: vec![ScalarValue::from("2023-01-01")], - range: None, - statistics: Some(Statistics { - num_rows: Precision::Absent, - total_byte_size: Precision::Absent, - column_statistics: vec![ColumnStatistics { - min_value: Precision::Exact(ScalarValue::from(0.0)), - max_value: Precision::Exact(ScalarValue::from(0.49)), - ..Default::default() - }], - }), - extensions: None, - }], let table_schema = Arc::new(Schema::new(vec![ Field::new("value".to_string(), DataType::Float64, false), Field::new("date".to_string(), DataType::Utf8, false), ])); + struct File { + name: &'static str, + date: &'static str, + value: Option<(f64, f64)>, + } + impl File { + fn new( + name: &'static str, + date: &'static str, + value: Option<(f64, f64)>, + ) -> Self { + Self { name, date, value } + } + } + + struct TestCase { + files: Vec<File>, + expected_file_groups: Vec<Vec<usize>>, + sort: Vec<Expr>, + } + + use datafusion_expr::col; + let cases = vec![TestCase { + files: vec![ + File::new("0", "2023-01-01", Some((0.00, 0.49))), + File::new("1", "2023-01-01", Some((0.50, 1.00))), + File::new("2", "2023-01-02", Some((0.00, 1.00))), + ], + expected_file_groups: vec![vec![0, 1], vec![2]], + sort: vec![col("value").sort(true, false)], }]; + for case in cases { + let sort_order = case + .sort + .into_iter() + .map(|expr| { + crate::physical_planner::create_physical_sort_expr( + &expr, + &DFSchema::try_from(table_schema.as_ref().clone())?, + &ExecutionProps::default(), + ) + }) + .collect::<Result<Vec<_>>>()?; + + let partitioned_files = + case.files.into_iter().map(From::from).collect::<Vec<_>>(); - use datafusion_physical_expr::expressions::Column; - let sort_order_by_value =
vec![PhysicalSortExpr { - expr: Arc::new(Column::new("value", 0)), - options: SortOptions { - descending: false, - nulls_first: false, - }, }]; - let results = FileScanConfig::sort_file_groups( - &table_schema, - &table_schema, - &[partitioned_files.clone()], - &sort_order_by_value, - ) - .unwrap(); + let results = FileScanConfig::sort_file_groups( + &table_schema, + &table_schema, + &[partitioned_files.clone()], + &sort_order, + )?; + + let file_groups = results + .into_iter() + .map(|file_group| { + file_group + .iter() + .map(|file| { + partitioned_files + .iter() + .position(|f| f.object_meta == file.object_meta) + .unwrap() + }) + .collect::<Vec<_>>() + }) + .collect::<Vec<_>>(); - let paths = results - .iter() - .map(|file_group| { - file_group - .iter() - .map(|file| file.object_meta.location.as_ref()) - .collect_vec() - }) - .collect_vec(); - assert_eq!(paths, vec![vec!["file1", "file2"], vec!["file3"]]); + assert_eq!(file_groups, case.expected_file_groups); + } + + return Ok(()); + + impl From<File> for PartitionedFile { + fn from(file: File) -> Self { + PartitionedFile { + object_meta: ObjectMeta { + location: Path::from(format!( + "data/date={}/{}.parquet", + file.date, file.name + )), + last_modified: chrono::Utc.timestamp_nanos(0), + size: 0, + e_tag: None, + version: None, + }, + partition_values: vec![ScalarValue::from(file.date)], + range: None, + statistics: Some(Statistics { + num_rows: Precision::Absent, + total_byte_size: Precision::Absent, + column_statistics: vec![ColumnStatistics { + min_value: Precision::Exact(ScalarValue::from( + file.value.map(|v| v.0), + )), + max_value: Precision::Exact(ScalarValue::from( + file.value.map(|v| v.1), + )), + ..Default::default() + }], + }), + extensions: None, + } + } + } } // sets default for configs that play no role in projections From 9c8729a9f4d4f7a0365bb1135d860c360075a4cc Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Fri, 15 Mar 2024 00:31:37 -0500 Subject: [PATCH 13/49] fix bug --- datafusion/core/src/datasource/physical_plan/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index baa74bb0dd9b..ff9eaabe83e4 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -453,8 +453,8 @@ fn get_projected_output_ordering( let mut all_orderings = vec![]; for output_ordering in &base_config.output_ordering { // Check if all file groups are
sorted - if base_config.file_groups.iter().all(|group| { + if base_config.file_groups.iter().any(|group| { if group.len() <= 1 { - return true; + return false; } let statistics = match MinMaxStatistics::new_from_files( output_ordering, table_schema, projected_schema, group, ) { Ok(statistics) => statistics, Err(e) => { - log::debug!("Error fetching statistics for file group: {e}"); - return false; + log::trace!("Error fetching statistics for file group: {e}"); + return true; } }; - statistics.is_sorted() + !statistics.is_sorted() }) { debug!( "Skipping specified output ordering {:?}. \ Some file group couldn't be determined to be sorted: {:?}", base_config.output_ordering[0], base_config.file_groups ); return vec![]; } From 6df983288e4a015ec0a487bbc4a8b2edac388972 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Fri, 15 Mar 2024 00:43:29 -0500 Subject: [PATCH 14/49] use sort_file_groups in ListingTable --- .../core/src/datasource/listing/table.rs | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 2a2551236e1b..61f368acc799 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -628,16 +628,29 @@ impl TableProvider for ListingTable { filters: &[Expr], limit: Option<usize>, ) -> Result<Arc<dyn ExecutionPlan>> { - let (partitioned_file_lists, statistics) = + let (mut partitioned_file_lists, statistics) = self.list_files_for_scan(state, filters, limit).await?; + let projected_schema = project_schema(&self.schema(), projection)?; + // if no files need to be read, return an `EmptyExec` if partitioned_file_lists.is_empty() { - let schema = self.schema(); - let projected_schema = project_schema(&schema, projection)?; return Ok(Arc::new(EmptyExec::new(projected_schema))); } + let output_ordering = self.try_create_output_ordering()?; + if let Some(new_groups) = output_ordering.first().and_then(|output_ordering| { + FileScanConfig::sort_file_groups( + &self.table_schema, + &projected_schema, + &partitioned_file_lists, + output_ordering, + ) + .ok() + }) { + partitioned_file_lists = new_groups; + } + // extract types of partition columns let table_partition_cols = self .options @@ -661,6 +674,7 @@ impl TableProvider for ListingTable { } else { return Ok(Arc::new(EmptyExec::new(Arc::new(Schema::empty())))); }; + // create the execution plan self.options .format @@ -673,7 +687,7 @@ impl TableProvider for ListingTable { statistics, projection: projection.cloned(), limit, - output_ordering: self.try_create_output_ordering()?, + output_ordering, table_partition_cols, }, filters.as_ref(), From f855a8a7bac53f14761c874466d71fe3c618b8e4 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Fri, 15 Mar 2024 01:21:36 -0500 Subject: [PATCH 15/49] move check into a better place --- .../core/src/datasource/physical_plan/mod.rs | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index ff9eaabe83e4..417d7d127276 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -62,7 +62,7 @@ use crate::{ use arrow::{ array::new_null_array, - compute::{can_cast_types, cast}, + compute::{can_cast_types, cast, SortColumn}, datatypes::{DataType, Schema, SchemaRef}, record_batch::{RecordBatch, RecordBatchOptions}, }; @@ -600,7 +600,32 @@ impl MinMaxStatistics { let [min, max] = [min_values, max_values].map(|values| { let sorting_columns = sort_order .iter() - .map(|expr|
expr.evaluate_to_sort_column(&values)) + .map(|sort_expr| { + let column = sort_expr + .expr + .as_any() + .downcast_ref::() + .ok_or(DataFusionError::Plan( + "sort expression must be on column".to_string(), + ))?; + + let schema = values.schema(); + + let idx = schema.index_of(column.name())?; + let field = schema.field(idx); + + // check that sort columns are non-nullable + if field.is_nullable() { + return Err(DataFusionError::Plan( + "cannot sort by nullable column".to_string(), + )); + } + + Ok(SortColumn { + values: Arc::clone(values.column(idx)), + options: Some(sort_expr.options), + }) + }) .collect::>>()?; converter.convert_columns( &sorting_columns From 3e5263b6e5cc4218faf714a950bcf42940eb3887 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Fri, 15 Mar 2024 01:25:55 -0500 Subject: [PATCH 16/49] refactor test a bit --- .../datasource/physical_plan/file_scan_config.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 3b7571dcac40..0fbb6b5277f4 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -842,11 +842,6 @@ mod tests { use datafusion_expr::execution_props::ExecutionProps; use object_store::{path::Path, ObjectMeta}; - let table_schema = Arc::new(Schema::new(vec![ - Field::new("value".to_string(), DataType::Float64, false), - Field::new("date".to_string(), DataType::Utf8, false), - ])); - struct File { name: &'static str, date: &'static str, @@ -863,6 +858,7 @@ mod tests { } struct TestCase { + schema: SchemaRef, files: Vec, expected_file_groups: Vec>, sort: Vec, @@ -870,6 +866,10 @@ mod tests { use datafusion_expr::col; let cases = vec![TestCase { + schema: Arc::new(Schema::new(vec![ + Field::new("value".to_string(), DataType::Float64, false), + Field::new("date".to_string(), DataType::Utf8, false), + ])), files: vec![ File::new("0", "2023-01-01", Some((0.00, 0.49))), File::new("1", "2023-01-01", Some((0.50, 1.00))), @@ -886,7 +886,7 @@ mod tests { .map(|expr| { crate::physical_planner::create_physical_sort_expr( &expr, - &DFSchema::try_from(table_schema.as_ref().clone())?, + &DFSchema::try_from(case.schema.as_ref().clone())?, &ExecutionProps::default(), ) }) @@ -895,8 +895,8 @@ mod tests { let partitioned_files = case.files.into_iter().map(From::from).collect::>(); let results = FileScanConfig::sort_file_groups( - &table_schema, - &table_schema, + &case.schema, + &case.schema, &[partitioned_files.clone()], &sort_order, )?; From 5b7b3070ad97d927c55970b15ba8def61828d61b Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Fri, 15 Mar 2024 01:49:48 -0500 Subject: [PATCH 17/49] more testing --- .../physical_plan/file_scan_config.rs | 113 +++++++++++------- 1 file changed, 71 insertions(+), 42 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 0fbb6b5277f4..cc5d0f72b614 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -845,48 +845,67 @@ mod tests { struct File { name: &'static str, date: &'static str, - value: Option<(f64, f64)>, + statistics: Vec>, } impl File { fn new( name: &'static str, date: &'static str, - value: 
Option<(f64, f64)>, + statistics: Vec>, ) -> Self { - Self { name, date, value } + Self { + name, + date, + statistics, + } } } struct TestCase { - schema: SchemaRef, + #[allow(unused)] + file_schema: SchemaRef, files: Vec, - expected_file_groups: Vec>, + expected_result: Result>, String>, sort: Vec, } use datafusion_expr::col; let cases = vec![TestCase { - schema: Arc::new(Schema::new(vec![ - Field::new("value".to_string(), DataType::Float64, false), - Field::new("date".to_string(), DataType::Utf8, false), - ])), + file_schema: Arc::new(Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + false, + )])), files: vec![ - File::new("0", "2023-01-01", Some((0.00, 0.49))), - File::new("1", "2023-01-01", Some((0.50, 1.00))), - File::new("2", "2023-01-02", Some((0.00, 1.00))), + File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), + File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), ], - expected_file_groups: vec![vec![0, 1], vec![2]], + expected_result: Ok(vec![vec![0, 1], vec![2]]), sort: vec![col("value").sort(true, false)], }]; for case in cases { + let table_schema = Arc::new(Schema::new( + case.file_schema + .fields() + .clone() + .into_iter() + .cloned() + .chain(Some(Arc::new(Field::new( + "date".to_string(), + DataType::Utf8, + false, + )))) + .collect::>(), + )); let sort_order = case .sort .into_iter() .map(|expr| { crate::physical_planner::create_physical_sort_expr( &expr, - &DFSchema::try_from(case.schema.as_ref().clone())?, + &DFSchema::try_from(table_schema.as_ref().clone())?, &ExecutionProps::default(), ) }) @@ -895,28 +914,30 @@ mod tests { let partitioned_files = case.files.into_iter().map(From::from).collect::>(); let results = FileScanConfig::sort_file_groups( - &case.schema, - &case.schema, + &table_schema, + &table_schema, &[partitioned_files.clone()], &sort_order, - )?; - - let file_groups = results - .into_iter() - .map(|file_group| { - file_group - .iter() - .map(|file| { - partitioned_files - .iter() - .position(|f| f.object_meta == file.object_meta) - .unwrap() - }) - .collect::>() - }) - .collect::>(); + ) + .map(|file_groups| { + file_groups + .into_iter() + .map(|file_group| { + file_group + .iter() + .map(|file| { + partitioned_files + .iter() + .position(|f| f.object_meta == file.object_meta) + .unwrap() + }) + .collect::>() + }) + .collect::>() + }) + .map_err(|e| e.to_string()); - assert_eq!(file_groups, case.expected_file_groups); + assert_eq!(results, case.expected_result); } return Ok(()); @@ -939,15 +960,23 @@ mod tests { statistics: Some(Statistics { num_rows: Precision::Absent, total_byte_size: Precision::Absent, - column_statistics: vec![ColumnStatistics { - min_value: Precision::Exact(ScalarValue::from( - file.value.map(|v| v.0), - )), - max_value: Precision::Exact(ScalarValue::from( - file.value.map(|v| v.1), - )), - ..Default::default() - }], + column_statistics: file + .statistics + .into_iter() + .map(|stats| { + stats + .map(|(min, max)| ColumnStatistics { + min_value: Precision::Exact(ScalarValue::from( + min, + )), + max_value: Precision::Exact(ScalarValue::from( + max, + )), + ..Default::default() + }) + .unwrap_or_default() + }) + .collect::>(), }), extensions: None, } From 4761096e7395fcdad89aaacfacaf92ebbf9174fc Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Fri, 15 Mar 2024 01:53:33 -0500 Subject: [PATCH 18/49] more testing --- .../physical_plan/file_scan_config.rs | 50 ++++++++++++------- 1 file changed, 33 
insertions(+), 17 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index cc5d0f72b614..e42b00396dab 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -863,27 +863,43 @@ mod tests { struct TestCase { #[allow(unused)] - file_schema: SchemaRef, + file_schema: Schema, files: Vec, - expected_result: Result>, String>, sort: Vec, + expected_result: Result>, &'static str>, } use datafusion_expr::col; - let cases = vec![TestCase { - file_schema: Arc::new(Schema::new(vec![Field::new( - "value".to_string(), - DataType::Float64, - false, - )])), - files: vec![ - File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), - File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), - File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), - ], - expected_result: Ok(vec![vec![0, 1], vec![2]]), - sort: vec![col("value").sort(true, false)], - }]; + let cases = vec![ + TestCase { + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + false, + )]), + files: vec![ + File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), + File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), + ], + sort: vec![col("value").sort(true, false)], + expected_result: Ok(vec![vec![0, 1], vec![2]]), + }, + TestCase { + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + true, // should fail because nullable + )]), + files: vec![ + File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), + File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), + ], + sort: vec![col("value").sort(true, false)], + expected_result: Err(""), + }, + ]; for case in cases { let table_schema = Arc::new(Schema::new( @@ -935,7 +951,7 @@ mod tests { }) .collect::>() }) - .map_err(|e| e.to_string()); + .map_err(|e| e.to_string().leak() as &'static str); assert_eq!(results, case.expected_result); } From a95dffa2f0d81dbdee5343f30c171363d4afa21d Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Fri, 15 Mar 2024 02:03:36 -0500 Subject: [PATCH 19/49] better error message --- .../physical_plan/file_scan_config.rs | 4 ++- .../core/src/datasource/physical_plan/mod.rs | 30 +++++++++++-------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index e42b00396dab..5c0738f67c1f 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -897,7 +897,9 @@ mod tests { File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), ], sort: vec![col("value").sort(true, false)], - expected_result: Err(""), + expected_result: Err( + "Error during planning: cannot sort by nullable column", + ), }, ]; diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index 417d7d127276..891e12942d0f 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -453,8 +453,8 @@ fn get_projected_output_ordering( let mut all_orderings = vec![]; for output_ordering in &base_config.output_ordering { // Check if all file 
groups are sorted - if base_config.file_groups.iter().any(|group| { - if group.len() > 1 { + if base_config.file_groups.iter().all(|group| { + if group.len() <= 1 { return true; } @@ -471,7 +471,7 @@ fn get_projected_output_ordering( } }; - !statistics.is_sorted() + statistics.is_sorted() }) { debug!( "Skipping specified output ordering {:?}. \ @@ -565,8 +565,8 @@ impl MinMaxStatistics { .map(|field| { let (min, max) = get_min_max(table_schema.index_of(field.name())?); Ok(( - (field.name(), ScalarValue::iter_to_array(min)?), - (field.name(), ScalarValue::iter_to_array(max)?), + ScalarValue::iter_to_array(min)?, + ScalarValue::iter_to_array(max)?, )) }) .collect::>>()? @@ -575,8 +575,8 @@ impl MinMaxStatistics { Self::new( sort_order, - RecordBatch::try_from_iter(min_values)?, - RecordBatch::try_from_iter(max_values)?, + RecordBatch::try_new(Arc::clone(projected_schema), min_values)?, + RecordBatch::try_new(Arc::clone(projected_schema), max_values)?, ) } @@ -627,12 +627,16 @@ impl MinMaxStatistics { }) }) .collect::>>()?; - converter.convert_columns( - &sorting_columns - .into_iter() - .map(|c| c.values) - .collect::>(), - ) + converter + .convert_columns( + &sorting_columns + .into_iter() + .map(|c| c.values) + .collect::>(), + ) + .map_err(|e| { + DataFusionError::ArrowError(e, Some("convert columns".to_string())) + }) }); Ok(Self { From 1a6660486e141a992f76149eba6deb881658ec78 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Fri, 15 Mar 2024 02:04:34 -0500 Subject: [PATCH 20/49] fix log msg --- datafusion/core/src/datasource/physical_plan/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index 891e12942d0f..329823f38555 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -475,7 +475,7 @@ fn get_projected_output_ordering( }) { debug!( "Skipping specified output ordering {:?}. \ - Some file group couldn't be determined to be sorted: {:?}", + Some file groups couldn't be determined to be sorted: {:?}", base_config.output_ordering[0], base_config.file_groups ); return vec![]; From cca5f0fbc41b5fd58d7b9f2ce1cafccafa8599c9 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Fri, 15 Mar 2024 02:24:17 -0500 Subject: [PATCH 21/49] fix again --- datafusion/core/src/datasource/physical_plan/mod.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index 329823f38555..75afb4c79e83 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -452,10 +452,11 @@ fn get_projected_output_ordering( ) -> Vec> { let mut all_orderings = vec![]; for output_ordering in &base_config.output_ordering { - // Check if all file groups are sorted - if base_config.file_groups.iter().all(|group| { + // Check if any file groups are not sorted + if base_config.file_groups.iter().any(|group| { if group.len() <= 1 { - return true; + // File groups with <= 1 files are always sorted + return false; } let statistics = match MinMaxStatistics::new_from_files( @@ -471,7 +472,7 @@ fn get_projected_output_ordering( } }; - statistics.is_sorted() + !statistics.is_sorted() }) { debug!( "Skipping specified output ordering {:?}. 
\ From 8f7a2d7f010e6ac142231d90f98f600e720c442d Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Thu, 21 Mar 2024 01:52:36 -0500 Subject: [PATCH 22/49] add sqllogictest and fixes --- .../core/src/datasource/listing/table.rs | 14 +++--- .../physical_plan/file_scan_config.rs | 3 +- .../core/src/datasource/physical_plan/mod.rs | 37 ++++++++++++---- .../sqllogictest/test_files/parquet.slt | 44 ++++++++++++++++--- 4 files changed, 78 insertions(+), 20 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 5def685e1600..69191288b448 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -643,17 +643,20 @@ impl TableProvider for ListingTable { } let output_ordering = self.try_create_output_ordering()?; - if let Some(new_groups) = output_ordering.first().and_then(|output_ordering| { + match output_ordering.first().map(|output_ordering| { FileScanConfig::sort_file_groups( &self.table_schema, &projected_schema, &partitioned_file_lists, output_ordering, ) - .ok() }) { - partitioned_file_lists = new_groups; - } + Some(Err(e)) => log::debug!("failed to sort file groups: {e}"), + Some(Ok(new_groups)) => { + partitioned_file_lists = new_groups; + } + None => {} // no ordering required + }; // extract types of partition columns let table_partition_cols = self @@ -838,10 +841,11 @@ impl ListingTable { // collect the statistics if required by the config let files = file_list .map(|part_file| async { - let part_file = part_file?; + let mut part_file = part_file?; if self.options.collect_stat { let statistics = self.do_collect_statistics(ctx, &store, &part_file).await?; + part_file.statistics = Some(statistics.clone()); Ok((part_file, statistics)) as Result<(PartitionedFile, Statistics)> } else { Ok((part_file, Statistics::new_unknown(&self.file_schema))) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 5c0738f67c1f..fdc4702448b5 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -230,7 +230,8 @@ impl FileScanConfig { table_schema, projected_schema, flattened_files.iter().copied(), - )?; + ) + .map_err(|e| e.context("construct min/max statistics"))?; let indices_sorted_by_min = { let mut sort: Vec<_> = statistics.min.iter().enumerate().collect(); diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index 75afb4c79e83..f18abe74260d 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -564,25 +564,42 @@ impl MinMaxStatistics { .fields() .iter() .map(|field| { - let (min, max) = get_min_max(table_schema.index_of(field.name())?); + let (min, max) = + get_min_max(table_schema.index_of(field.name()).map_err(|e| { + DataFusionError::ArrowError( + e, + Some(format!("get min/max for field: '{}'", field.name())), + ) + })?); Ok(( ScalarValue::iter_to_array(min)?, ScalarValue::iter_to_array(max)?, )) }) - .collect::>>()? + .collect::>>() + .map_err(|e| e.context("collect min/max values"))? 
.into_iter() .unzip(); Self::new( sort_order, - RecordBatch::try_new(Arc::clone(projected_schema), min_values)?, - RecordBatch::try_new(Arc::clone(projected_schema), max_values)?, + table_schema, + RecordBatch::try_new(Arc::clone(projected_schema), min_values).map_err( + |e| { + DataFusionError::ArrowError(e, Some("\ncreate min batch".to_string())) + }, + )?, + RecordBatch::try_new(Arc::clone(projected_schema), max_values).map_err( + |e| { + DataFusionError::ArrowError(e, Some("\ncreate max batch".to_string())) + }, + )?, ) } fn new( sort_order: &[PhysicalSortExpr], + table_schema: &SchemaRef, min_values: RecordBatch, max_values: RecordBatch, ) -> Result { @@ -592,10 +609,11 @@ impl MinMaxStatistics { .iter() .map(|expr| { expr.expr - .data_type(&min_values.schema()) + .data_type(table_schema) .map(|data_type| SortField::new_with_options(data_type, expr.options)) }) - .collect::>>()?; + .collect::>>() + .map_err(|e| e.context("create sort fields"))?; let converter = RowConverter::new(sort_fields)?; let [min, max] = [min_values, max_values].map(|values| { @@ -627,7 +645,8 @@ impl MinMaxStatistics { options: Some(sort_expr.options), }) }) - .collect::>>()?; + .collect::>>() + .map_err(|e| e.context("create sorting columns"))?; converter .convert_columns( &sorting_columns @@ -641,8 +660,8 @@ impl MinMaxStatistics { }); Ok(Self { - min: min?, - max: max?, + min: min.map_err(|e| e.context("build min rows"))?, + max: max.map_err(|e| e.context("build max rows"))?, }) } diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index 3cc52666d533..23eb4dbd0029 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -21,6 +21,10 @@ statement ok set datafusion.execution.target_partitions = 2; +# Collect statistics -- used for sorting files +statement ok +set datafusion.execution.collect_statistics = true; + # Create a table as a data source statement ok CREATE TABLE src_table ( @@ -132,8 +136,8 @@ STORED AS PARQUET; ---- 3 -# Check output plan again, expect no "output_ordering" clause in the physical_plan -> ParquetExec, -# due to there being more files than partitions: +# Check output plan again, expect an "output_ordering" clause in the physical_plan -> ParquetExec: +# After https://github.com/apache/arrow-datafusion/pull/9593 this should not require a sort. 
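The ordering decision behind the plan checked next ultimately reduces to comparing per-file min/max values in arrow's row format, which encodes a multi-column sort key as bytes that compare with a single memcmp. A minimal standalone sketch of that comparison, with made-up values and no DataFusion types involved:

```rust
// Standalone sketch of the row-format comparison MinMaxStatistics relies on.
use std::sync::Arc;

use arrow::array::{ArrayRef, Float64Array};
use arrow::datatypes::DataType;
use arrow::row::{RowConverter, SortField};

fn main() -> Result<(), arrow::error::ArrowError> {
    // One SortField per sort-order column; default options mean ascending.
    let converter = RowConverter::new(vec![SortField::new(DataType::Float64)])?;

    // Hypothetical per-file minimums and maximums for a single sort column.
    let mins: Vec<ArrayRef> = vec![Arc::new(Float64Array::from(vec![0.00, 0.50]))];
    let maxes: Vec<ArrayRef> = vec![Arc::new(Float64Array::from(vec![0.49, 1.00]))];

    let min_rows = converter.convert_columns(&mins)?;
    let max_rows = converter.convert_columns(&maxes)?;

    // Two files can share a group when the first file's max sorts strictly
    // before the second file's min.
    assert!(max_rows.row(0) < min_rows.row(1));
    Ok(())
}
```

Comparing `Row`s produced by the same `RowConverter` is what `is_sorted` and the grouping logic rely on; the row encoding already accounts for sort options, so no per-column comparator is needed.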
query TT EXPLAIN SELECT int_col, string_col FROM test_table @@ -144,9 +148,7 @@ Sort: test_table.string_col ASC NULLS LAST, test_table.int_col ASC NULLS LAST --TableScan: test_table projection=[int_col, string_col] physical_plan SortPreservingMergeExec: [string_col@1 ASC NULLS LAST,int_col@0 ASC NULLS LAST] ---SortExec: expr=[string_col@1 ASC NULLS LAST,int_col@0 ASC NULLS LAST] -----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet]]}, projection=[int_col, string_col] - +--ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet]]}, projection=[int_col, string_col], output_ordering=[string_col@1 ASC NULLS LAST, int_col@0 ASC NULLS LAST] # Perform queries using MIN and MAX query I @@ -169,6 +171,38 @@ SELECT min(date_col) FROM test_table; ---- 1970-01-02 +# Clean up +statement ok +DROP TABLE test_table; + +# Do one more test, but order by numeric columns: +# This is to exercise file group sorting, which uses file-level statistics +# DataFusion doesn't currently support string column statistics +statement ok +CREATE EXTERNAL TABLE test_table ( + int_col INT NOT NULL, + string_col TEXT NOT NULL, + bigint_col BIGINT NOT NULL, + date_col DATE NOT NULL +) +STORED AS PARQUET +WITH HEADER ROW +WITH ORDER (int_col ASC NULLS LAST, bigint_col ASC NULLS LAST) +LOCATION 'test_files/scratch/parquet/test_table'; + +# Check output plan again, expect an "output_ordering" clause in the physical_plan -> ParquetExec: +# After https://github.com/apache/arrow-datafusion/pull/9593 this should not require a sort. 
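The single file group expected in the plan below comes from the first-fit placement in `sort_file_groups`: non-overlapping files chain into one group. A simplified standalone model of that placement, using plain `(min, max)` pairs in place of the arrow row encoding, with an overlapping file included for contrast:

```rust
// Simplified model of first-fit splitting: order files by min value, then
// place each file into the first group whose last file's max sorts strictly
// before this file's min.
fn split_by_ranges(mut files: Vec<(f64, f64)>) -> Vec<Vec<(f64, f64)>> {
    files.sort_by(|a, b| a.0.total_cmp(&b.0));
    let mut groups: Vec<Vec<(f64, f64)>> = vec![];
    for file in files {
        // Groups are only ever created non-empty, so `last()` cannot fail.
        match groups.iter_mut().find(|g| g.last().unwrap().1 < file.0) {
            Some(group) => group.push(file),
            None => groups.push(vec![file]),
        }
    }
    groups
}

fn main() {
    // Same shape as the unit test earlier in this series: [0.00, 0.49] and
    // [0.50, 1.00] chain into one group; [0.00, 1.00] overlaps both.
    let groups = split_by_ranges(vec![(0.00, 0.49), (0.50, 1.00), (0.00, 1.00)]);
    assert_eq!(
        groups,
        vec![vec![(0.00, 0.49), (0.50, 1.00)], vec![(0.00, 1.00)]]
    );
}
```

The strict inequality matters: a file whose minimum equals the previous file's maximum is treated as overlapping, mirroring the `min > statistics.max.row(...)` check in the patch.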
+query TT
+EXPLAIN SELECT int_col, bigint_col
+FROM test_table
+ORDER BY int_col, bigint_col;
+----
+logical_plan
+Sort: test_table.int_col ASC NULLS LAST, test_table.bigint_col ASC NULLS LAST
+--TableScan: test_table projection=[int_col, bigint_col]
+physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet]]}, projection=[int_col, bigint_col], output_ordering=[int_col@0 ASC NULLS LAST, bigint_col@1 ASC NULLS LAST]
+
+
 # Clean up
 statement ok
 DROP TABLE test_table;

From e9fad54257a13ab9630a0f038fa2b5a1b26aa006 Mon Sep 17 00:00:00 2001
From: suremarc <8771538+suremarc@users.noreply.github.com>
Date: Thu, 21 Mar 2024 01:57:41 -0500
Subject: [PATCH 23/49] fix test

---
 .../core/src/datasource/physical_plan/file_scan_config.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
index fdc4702448b5..9a952e884b30 100644
--- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
+++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
@@ -898,9 +898,7 @@ mod tests {
                 File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]),
             ],
             sort: vec![col("value").sort(true, false)],
-            expected_result: Err(
-                "Error during planning: cannot sort by nullable column",
-            ),
+            expected_result: Err("construct min/max statistics\ncaused by\nbuild min rows\ncaused by\ncreate sorting columns\ncaused by\nError during planning: cannot sort by nullable column"),
         },
     ];

From e982f0fb826b03c0a2cee11663fa4f2e6f7a6bac Mon Sep 17 00:00:00 2001
From: Matthew Cramerus <8771538+suremarc@users.noreply.github.com>
Date: Sat, 30 Mar 2024 18:30:03 -0500
Subject: [PATCH 24/49] Update datafusion/core/src/datasource/listing/mod.rs

Co-authored-by: Andrew Lamb

---
 datafusion/core/src/datasource/listing/mod.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/datafusion/core/src/datasource/listing/mod.rs b/datafusion/core/src/datasource/listing/mod.rs
index 6728dc4079e4..798e91dbb819 100644
--- a/datafusion/core/src/datasource/listing/mod.rs
+++ b/datafusion/core/src/datasource/listing/mod.rs
@@ -67,7 +67,10 @@ pub struct PartitionedFile {
     pub partition_values: Vec<ScalarValue>,
     /// An optional file range for a more fine-grained parallel execution
     pub range: Option<FileRange>,
-    /// Optional statistics that describe the data in this file
+    /// Optional statistics that describe the data in this file if known.
+    ///
+    /// DataFusion relies on these statistics for planning, so if they are incorrect,
+    /// incorrect answers may result.
pub statistics: Option, /// An optional field for user defined per object metadata pub extensions: Option>, From cc9f1442b25604fd2dcb1d14410f6f836465d885 Mon Sep 17 00:00:00 2001 From: Matthew Cramerus <8771538+suremarc@users.noreply.github.com> Date: Sat, 30 Mar 2024 18:30:47 -0500 Subject: [PATCH 25/49] Update datafusion/core/src/datasource/physical_plan/file_scan_config.rs Co-authored-by: Andrew Lamb --- .../core/src/datasource/physical_plan/file_scan_config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 9a952e884b30..3dcc6e571ca6 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -231,7 +231,7 @@ impl FileScanConfig { projected_schema, flattened_files.iter().copied(), ) - .map_err(|e| e.context("construct min/max statistics"))?; + .map_err(|e| e.context("construct min/max statistics for sort_file_groups"))?; let indices_sorted_by_min = { let mut sort: Vec<_> = statistics.min.iter().enumerate().collect(); From 95bb790ff2d1ded03c34a4a84b27ae164ba4e22e Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Sat, 30 Mar 2024 19:37:00 -0500 Subject: [PATCH 26/49] more unit tests --- .../physical_plan/file_scan_config.rs | 158 +++++++++++++++--- .../core/src/datasource/physical_plan/mod.rs | 42 +++-- 2 files changed, 159 insertions(+), 41 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 3dcc6e571ca6..ba0dbe841ebf 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -225,6 +225,10 @@ impl FileScanConfig { // Source: Applied Combinatorics (Keller and Trotter), Chapter 6.8 // https://www.appliedcombinatorics.org/book/s_posets_dilworth-intord.html + if flattened_files.is_empty() { + return Ok(vec![]); + } + let statistics = MinMaxStatistics::new_from_files( sort_order, table_schema, @@ -863,16 +867,17 @@ mod tests { } struct TestCase { - #[allow(unused)] + name: &'static str, file_schema: Schema, files: Vec, sort: Vec, - expected_result: Result>, &'static str>, + expected_result: Result>, &'static str>, } use datafusion_expr::col; let cases = vec![ TestCase { + name: "test sort", file_schema: Schema::new(vec![Field::new( "value".to_string(), DataType::Float64, @@ -884,9 +889,44 @@ mod tests { File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), ], sort: vec![col("value").sort(true, false)], - expected_result: Ok(vec![vec![0, 1], vec![2]]), + expected_result: Ok(vec![vec!["0", "1"], vec!["2"]]), + }, + // same input but file '2' is in the middle + // test that we still order correctly + TestCase { + name: "test sort with files ordered differently", + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + false, + )]), + files: vec![ + File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), + File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), + ], + sort: vec![col("value").sort(true, false)], + expected_result: Ok(vec![vec!["0", "1"], vec!["2"]]), + }, + // FIXME: this test is broken + TestCase { + name: "reverse sort", + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + 
DataType::Float64, + false, + )]), + files: vec![ + File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), + File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), + ], + sort: vec![col("value").sort(false, true)], + expected_result: Ok(vec![vec!["1", "0"], vec!["2"]]), }, + // reject nullable sort columns TestCase { + name: "no nullable sort columns", file_schema: Schema::new(vec![Field::new( "value".to_string(), DataType::Float64, @@ -900,6 +940,62 @@ mod tests { sort: vec![col("value").sort(true, false)], expected_result: Err("construct min/max statistics\ncaused by\nbuild min rows\ncaused by\ncreate sorting columns\ncaused by\nError during planning: cannot sort by nullable column"), }, + TestCase { + name: "all three non-overlapping", + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + false, + )]), + files: vec![ + File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("1", "2023-01-01", vec![Some((0.50, 0.99))]), + File::new("2", "2023-01-02", vec![Some((1.00, 1.49))]), + ], + sort: vec![col("value").sort(true, false)], + expected_result: Ok(vec![vec!["0", "1", "2"]]), + }, + TestCase { + name: "all three overlapping", + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + false, + )]), + files: vec![ + File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("1", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("2", "2023-01-02", vec![Some((0.00, 0.49))]), + ], + sort: vec![col("value").sort(true, false)], + expected_result: Ok(vec![vec!["0"], vec!["1"], vec!["2"]]), + }, + TestCase { + name: "empty input", + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + false, + )]), + files: vec![], + sort: vec![col("value").sort(true, false)], + expected_result: Ok(vec![]), + }, + TestCase { + name: "one file missing statistics", + file_schema: Schema::new(vec![Field::new( + "value".to_string(), + DataType::Float64, + false, + )]), + files: vec![ + File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("1", "2023-01-01", vec![Some((0.00, 0.49))]), + File::new("2", "2023-01-02", vec![None]), + ], + sort: vec![col("value").sort(true, false)], + expected_result: Err("construct min/max statistics\ncaused by\ncollect min/max values\ncaused by\nError during planning: statistics not found"), + }, ]; for case in cases { @@ -930,31 +1026,47 @@ mod tests { let partitioned_files = case.files.into_iter().map(From::from).collect::>(); - let results = FileScanConfig::sort_file_groups( + let result = FileScanConfig::sort_file_groups( &table_schema, &table_schema, &[partitioned_files.clone()], &sort_order, - ) - .map(|file_groups| { - file_groups - .into_iter() - .map(|file_group| { - file_group - .iter() - .map(|file| { - partitioned_files - .iter() - .position(|f| f.object_meta == file.object_meta) - .unwrap() - }) - .collect::>() - }) - .collect::>() - }) - .map_err(|e| e.to_string().leak() as &'static str); + ); + let results_by_name = result + .as_ref() + .map(|file_groups| { + file_groups + .iter() + .map(|file_group| { + file_group + .iter() + .map(|file| { + partitioned_files + .iter() + .find_map(|f| { + if f.object_meta == file.object_meta { + Some( + f.object_meta + .location + .as_ref() + .rsplit('/') + .next() + .unwrap() + .trim_end_matches(".parquet"), + ) + } else { + None + } + }) + .unwrap() + }) + .collect::>() + }) + .collect::>() + }) + .map_err(|e| e.to_string().leak() as 
&'static str); - assert_eq!(results, case.expected_result); + assert_eq!(results_by_name, case.expected_result, "{}", case.name); } return Ok(()); diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index f18abe74260d..ea37b7fe4689 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -508,13 +508,20 @@ fn get_projected_output_ordering( all_orderings } -// A normalized representation of file min/max statistics that allows for efficient sorting & comparison. +/// A normalized representation of file min/max statistics that allows for efficient sorting & comparison. +/// The min/max values are ordered by [`Self::sort_order`]. pub(crate) struct MinMaxStatistics { min: arrow::row::Rows, max: arrow::row::Rows, + sort_order: Vec, } impl MinMaxStatistics { + #[allow(unused)] + fn sort_order(&self) -> &[PhysicalSortExpr] { + &self.sort_order + } + fn new_from_files<'a>( sort_order: &[PhysicalSortExpr], table_schema: &SchemaRef, @@ -535,29 +542,27 @@ impl MinMaxStatistics { DataFusionError::Plan("Parquet file missing statistics".to_string()) })?; - let get_min_max = |i: usize| -> (Vec, Vec) { - statistics_and_partition_values + let get_min_max = |i: usize| -> Result<(Vec, Vec)> { + Ok(statistics_and_partition_values .iter() .map(|(s, pv)| { if i < s.column_statistics.len() { - ( - s.column_statistics[i] - .min_value - .get_value() - .cloned() - .unwrap_or(ScalarValue::Null), - s.column_statistics[i] - .max_value - .get_value() - .cloned() - .unwrap_or(ScalarValue::Null), - ) + s.column_statistics[i] + .min_value + .get_value() + .cloned() + .zip(s.column_statistics[i].max_value.get_value().cloned()) + .ok_or_else(|| { + DataFusionError::Plan("statistics not found".to_string()) + }) } else { let partition_value = &pv[i - s.column_statistics.len()]; - (partition_value.clone(), partition_value.clone()) + Ok((partition_value.clone(), partition_value.clone())) } }) - .unzip() + .collect::>>()? 
+ .into_iter() + .unzip()) }; let (min_values, max_values): (Vec<_>, Vec<_>) = projected_schema @@ -570,7 +575,7 @@ impl MinMaxStatistics { e, Some(format!("get min/max for field: '{}'", field.name())), ) - })?); + })?)?; Ok(( ScalarValue::iter_to_array(min)?, ScalarValue::iter_to_array(max)?, @@ -662,6 +667,7 @@ impl MinMaxStatistics { Ok(Self { min: min.map_err(|e| e.context("build min rows"))?, max: max.map_err(|e| e.context("build max rows"))?, + sort_order: sort_order.to_vec(), }) } From 0e6023062798f8f0a547fee2638f14d8aa2ca1fa Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Sat, 30 Mar 2024 19:38:09 -0500 Subject: [PATCH 27/49] rename to split_groups_by_statistics --- datafusion/core/src/datasource/listing/table.rs | 2 +- .../src/datasource/physical_plan/file_scan_config.rs | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 69191288b448..1c91de095a24 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -644,7 +644,7 @@ impl TableProvider for ListingTable { let output_ordering = self.try_create_output_ordering()?; match output_ordering.first().map(|output_ordering| { - FileScanConfig::sort_file_groups( + FileScanConfig::split_groups_by_statistics( &self.table_schema, &projected_schema, &partitioned_file_lists, diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index ba0dbe841ebf..adea7a160f8f 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -207,7 +207,7 @@ impl FileScanConfig { /// Attempts to do a bin-packing on files into file groups, such that any two files /// in a file group are ordered and non-overlapping with respect to their statistics. /// It will produce the smallest number of file groups possible. 
- pub fn sort_file_groups( + pub fn split_groups_by_statistics( table_schema: &SchemaRef, projected_schema: &SchemaRef, file_groups: &[Vec], @@ -235,7 +235,9 @@ impl FileScanConfig { projected_schema, flattened_files.iter().copied(), ) - .map_err(|e| e.context("construct min/max statistics for sort_file_groups"))?; + .map_err(|e| { + e.context("construct min/max statistics for split_groups_by_statistics") + })?; let indices_sorted_by_min = { let mut sort: Vec<_> = statistics.min.iter().enumerate().collect(); @@ -841,7 +843,7 @@ mod tests { } #[test] - fn test_sort_file_groups() -> Result<()> { + fn test_split_groups_by_statistics() -> Result<()> { use chrono::TimeZone; use datafusion_common::DFSchema; use datafusion_expr::execution_props::ExecutionProps; @@ -1026,7 +1028,7 @@ mod tests { let partitioned_files = case.files.into_iter().map(From::from).collect::>(); - let result = FileScanConfig::sort_file_groups( + let result = FileScanConfig::split_groups_by_statistics( &table_schema, &table_schema, &[partitioned_files.clone()], From 9f375e88bf0afa0820bf18237ad5847fcb5d3bbf Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Sat, 30 Mar 2024 19:43:24 -0500 Subject: [PATCH 28/49] only use groups if there's <= target_partitions --- datafusion/core/src/datasource/listing/table.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 1c91de095a24..7c2d22077c02 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -651,9 +651,13 @@ impl TableProvider for ListingTable { output_ordering, ) }) { - Some(Err(e)) => log::debug!("failed to sort file groups: {e}"), + Some(Err(e)) => log::debug!("failed to split file groups by statistics: {e}"), Some(Ok(new_groups)) => { - partitioned_file_lists = new_groups; + if new_groups.len() <= self.options.target_partitions { + partitioned_file_lists = new_groups; + } else { + log::debug!("attempted to split file groups by statistics, but there were more file groups than target_partitions; falling back to unordered") + } } None => {} // no ordering required }; From 3d9d2935e2a3f654713f3284e5793ba127c91b27 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Sat, 30 Mar 2024 20:03:09 -0500 Subject: [PATCH 29/49] refactor a bit, no need for projected_schema --- .../core/src/datasource/listing/table.rs | 1 - .../physical_plan/file_scan_config.rs | 7 +-- .../core/src/datasource/physical_plan/mod.rs | 43 +++++++++++++------ 3 files changed, 32 insertions(+), 19 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 7c2d22077c02..62b512fc7d9f 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -646,7 +646,6 @@ impl TableProvider for ListingTable { match output_ordering.first().map(|output_ordering| { FileScanConfig::split_groups_by_statistics( &self.table_schema, - &projected_schema, &partitioned_file_lists, output_ordering, ) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index adea7a160f8f..7254129f6cf2 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -209,7 +209,6 @@ impl 
FileScanConfig { /// It will produce the smallest number of file groups possible. pub fn split_groups_by_statistics( table_schema: &SchemaRef, - projected_schema: &SchemaRef, file_groups: &[Vec], sort_order: &[PhysicalSortExpr], ) -> Result>> { @@ -232,7 +231,6 @@ impl FileScanConfig { let statistics = MinMaxStatistics::new_from_files( sort_order, table_schema, - projected_schema, flattened_files.iter().copied(), ) .map_err(|e| { @@ -940,7 +938,7 @@ mod tests { File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), ], sort: vec![col("value").sort(true, false)], - expected_result: Err("construct min/max statistics\ncaused by\nbuild min rows\ncaused by\ncreate sorting columns\ncaused by\nError during planning: cannot sort by nullable column"), + expected_result: Err("construct min/max statistics for split_groups_by_statistics\ncaused by\nbuild min rows\ncaused by\ncreate sorting columns\ncaused by\nError during planning: cannot sort by nullable column") }, TestCase { name: "all three non-overlapping", @@ -996,7 +994,7 @@ mod tests { File::new("2", "2023-01-02", vec![None]), ], sort: vec![col("value").sort(true, false)], - expected_result: Err("construct min/max statistics\ncaused by\ncollect min/max values\ncaused by\nError during planning: statistics not found"), + expected_result: Err("construct min/max statistics for split_groups_by_statistics\ncaused by\ncollect min/max values\ncaused by\nError during planning: statistics not found"), }, ]; @@ -1029,7 +1027,6 @@ mod tests { let partitioned_files = case.files.into_iter().map(From::from).collect::>(); let result = FileScanConfig::split_groups_by_statistics( - &table_schema, &table_schema, &[partitioned_files.clone()], &sort_order, diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index ea37b7fe4689..1c18ddd51072 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -462,7 +462,6 @@ fn get_projected_output_ordering( let statistics = match MinMaxStatistics::new_from_files( output_ordering, table_schema, - projected_schema, group, ) { Ok(statistics) => statistics, @@ -525,7 +524,6 @@ impl MinMaxStatistics { fn new_from_files<'a>( sort_order: &[PhysicalSortExpr], table_schema: &SchemaRef, - projected_schema: &SchemaRef, files: impl IntoIterator, ) -> Result { use datafusion_common::ScalarValue; @@ -565,6 +563,15 @@ impl MinMaxStatistics { .unzip()) }; + let sort_columns = sort_columns_from_physical_sort_exprs(sort_order).ok_or( + DataFusionError::Plan("sort expression must be on column".to_string()), + )?; + + let projected_schema = Arc::new( + table_schema + .project(&(sort_columns.iter().map(|c| c.index()).collect::>()))?, + ); + let (min_values, max_values): (Vec<_>, Vec<_>) = projected_schema .fields() .iter() @@ -589,12 +596,12 @@ impl MinMaxStatistics { Self::new( sort_order, table_schema, - RecordBatch::try_new(Arc::clone(projected_schema), min_values).map_err( + RecordBatch::try_new(Arc::clone(&projected_schema), min_values).map_err( |e| { DataFusionError::ArrowError(e, Some("\ncreate min batch".to_string())) }, )?, - RecordBatch::try_new(Arc::clone(projected_schema), max_values).map_err( + RecordBatch::try_new(Arc::clone(&projected_schema), max_values).map_err( |e| { DataFusionError::ArrowError(e, Some("\ncreate max batch".to_string())) }, @@ -621,18 +628,15 @@ impl MinMaxStatistics { .map_err(|e| e.context("create sort fields"))?; let converter = RowConverter::new(sort_fields)?; + let 
sort_columns = sort_columns_from_physical_sort_exprs(sort_order).ok_or( + DataFusionError::Plan("sort expression must be on column".to_string()), + )?; + let [min, max] = [min_values, max_values].map(|values| { let sorting_columns = sort_order .iter() - .map(|sort_expr| { - let column = sort_expr - .expr - .as_any() - .downcast_ref::() - .ok_or(DataFusionError::Plan( - "sort expression must be on column".to_string(), - ))?; - + .zip(sort_columns.iter().copied()) + .map(|(sort_expr, column)| { let schema = values.schema(); let idx = schema.index_of(column.name())?; @@ -679,6 +683,19 @@ impl MinMaxStatistics { } } +fn sort_columns_from_physical_sort_exprs( + sort_order: &[PhysicalSortExpr], +) -> Option> { + sort_order + .iter() + .map(|expr| { + expr.expr + .as_any() + .downcast_ref::() + }) + .collect::>>() +} + /// Represents the possible outcomes of a range calculation. /// /// This enum is used to encapsulate the result of calculating the range of From 1366c997da61b61c18a7a98c67d2463540abf93a Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Sat, 30 Mar 2024 20:31:49 -0500 Subject: [PATCH 30/49] fix reverse order --- .../physical_plan/file_scan_config.rs | 6 +-- .../core/src/datasource/physical_plan/mod.rs | 44 ++++++++++++++++--- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 7254129f6cf2..f1e756dea404 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -238,7 +238,8 @@ impl FileScanConfig { })?; let indices_sorted_by_min = { - let mut sort: Vec<_> = statistics.min.iter().enumerate().collect(); + let mut sort: Vec<_> = + statistics.min_by_sort_order.iter().enumerate().collect(); sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); sort }; @@ -249,7 +250,7 @@ impl FileScanConfig { let file_group_to_insert = file_groups_indices.iter_mut().find(|group| { // If our file is non-overlapping and comes _after_ the last file, // it fits in this file group. - min > statistics.max.row( + min > statistics.max_by_sort_order.row( *group .last() .expect("groups should be nonempty at construction"), @@ -908,7 +909,6 @@ mod tests { sort: vec![col("value").sort(true, false)], expected_result: Ok(vec![vec!["0", "1"], vec!["2"]]), }, - // FIXME: this test is broken TestCase { name: "reverse sort", file_schema: Schema::new(vec![Field::new( diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index 1c18ddd51072..147b960102d0 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -509,9 +509,10 @@ fn get_projected_output_ordering( /// A normalized representation of file min/max statistics that allows for efficient sorting & comparison. /// The min/max values are ordered by [`Self::sort_order`]. +/// Furthermore, any columns that are reversed in the sort order have their min/max values swapped. 
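The swap described in the doc comment above exists because, for a descending sort column, a file's largest value is the one that appears first in sort order. A toy illustration of the invariant, using plain `f64` bounds rather than the row encoding used by `MinMaxStatistics`:

```rust
// For a column sorted descending, the value that sorts first in a file is its
// max, so the sort-order "min"/"max" are the swapped column bounds.
fn sort_order_bounds(col_min: f64, col_max: f64, descending: bool) -> (f64, f64) {
    if descending {
        (col_max, col_min)
    } else {
        (col_min, col_max)
    }
}

fn main() {
    // A file covering [0.0, 1.0] on a DESC column starts at 1.0 and ends at
    // 0.0 in sort order, which is why this commit swaps min/max for such
    // columns before building the row-format statistics.
    assert_eq!(sort_order_bounds(0.0, 1.0, true), (1.0, 0.0));
}
```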
pub(crate) struct MinMaxStatistics { - min: arrow::row::Rows, - max: arrow::row::Rows, + min_by_sort_order: arrow::row::Rows, + max_by_sort_order: arrow::row::Rows, sort_order: Vec, } @@ -632,7 +633,36 @@ impl MinMaxStatistics { DataFusionError::Plan("sort expression must be on column".to_string()), )?; - let [min, max] = [min_values, max_values].map(|values| { + // swap min/max if they're reversed in the ordering + let (new_min_cols, new_max_cols): (Vec<_>, Vec<_>) = sort_order + .iter() + .zip(sort_columns.iter().copied()) + .map(|(sort_expr, column)| { + if sort_expr.options.descending { + max_values + .column_by_name(column.name()) + .zip(min_values.column_by_name(column.name())) + } else { + min_values + .column_by_name(column.name()) + .zip(max_values.column_by_name(column.name())) + } + .ok_or_else(|| { + DataFusionError::Plan(format!( + "missing column in MinMaxStatistics::new: '{}'", + column.name() + )) + }) + }) + .collect::>>()? + .into_iter() + .unzip(); + + let [min, max] = [new_min_cols, new_max_cols].map(|cols| { + let values = RecordBatch::try_new( + min_values.schema(), + cols.into_iter().map(Arc::clone).collect(), + )?; let sorting_columns = sort_order .iter() .zip(sort_columns.iter().copied()) @@ -669,16 +699,16 @@ impl MinMaxStatistics { }); Ok(Self { - min: min.map_err(|e| e.context("build min rows"))?, - max: max.map_err(|e| e.context("build max rows"))?, + min_by_sort_order: min.map_err(|e| e.context("build min rows"))?, + max_by_sort_order: max.map_err(|e| e.context("build max rows"))?, sort_order: sort_order.to_vec(), }) } fn is_sorted(&self) -> bool { - self.max + self.max_by_sort_order .iter() - .zip(self.min.iter().skip(1)) + .zip(self.min_by_sort_order.iter().skip(1)) .all(|(max, next_min)| max < next_min) } } From 2ef800649762cf2fecded7a03e2c9305fa75f548 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Tue, 9 Apr 2024 09:59:51 -0500 Subject: [PATCH 31/49] save work for now --- .../sqllogictest/test_files/parquet.slt | 36 -- .../test_files/parquet_sorted_statistics.slt | 352 ++++++++++++++++++ 2 files changed, 352 insertions(+), 36 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index 23eb4dbd0029..0fab94fa4dba 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -21,10 +21,6 @@ statement ok set datafusion.execution.target_partitions = 2; -# Collect statistics -- used for sorting files -statement ok -set datafusion.execution.collect_statistics = true; - # Create a table as a data source statement ok CREATE TABLE src_table ( @@ -171,38 +167,6 @@ SELECT min(date_col) FROM test_table; ---- 1970-01-02 -# Clean up -statement ok -DROP TABLE test_table; - -# Do one more test, but order by numeric columns: -# This is to exercise file group sorting, which uses file-level statistics -# DataFusion doesn't currently support string column statistics -statement ok -CREATE EXTERNAL TABLE test_table ( - int_col INT NOT NULL, - string_col TEXT NOT NULL, - bigint_col BIGINT NOT NULL, - date_col DATE NOT NULL -) -STORED AS PARQUET -WITH HEADER ROW -WITH ORDER (int_col ASC NULLS LAST, bigint_col ASC NULLS LAST) -LOCATION 'test_files/scratch/parquet/test_table'; - -# Check output plan again, expect an "output_ordering" clause in the physical_plan -> ParquetExec: -# After 
https://github.com/apache/arrow-datafusion/pull/9593 this should not require a sort. -query TT -EXPLAIN SELECT int_col, bigint_col -FROM test_table -ORDER BY int_col, bigint_col; ----- -logical_plan -Sort: test_table.int_col ASC NULLS LAST, test_table.bigint_col ASC NULLS LAST ---TableScan: test_table projection=[int_col, bigint_col] -physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet]]}, projection=[int_col, bigint_col], output_ordering=[int_col@0 ASC NULLS LAST, bigint_col@1 ASC NULLS LAST] - - # Clean up statement ok DROP TABLE test_table; diff --git a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt new file mode 100644 index 000000000000..bd1d18f6588b --- /dev/null +++ b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt @@ -0,0 +1,352 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
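The settings this new test file establishes up front correspond to session configuration in the Rust API. A rough sketch of the equivalent setup, assuming the `tokio` runtime and method names as they existed around this patch series (they may differ in other DataFusion versions):

```rust
use datafusion::prelude::*;

#[tokio::main] // assumes the tokio crate as the async runtime
async fn main() -> datafusion::error::Result<()> {
    // Deterministic plans need a fixed partition count, and file groups can
    // only be ordered by min/max values if statistics are collected.
    let config = SessionConfig::new()
        .with_target_partitions(2)
        .with_collect_statistics(true);
    let ctx = SessionContext::new_with_config(config);

    // The DDL and EXPLAIN statements in the .slt file below can then be
    // issued verbatim through the SQL interface, as the test runner does.
    ctx.sql("SET datafusion.execution.listing_table_ignore_subdirectory = true")
        .await?;
    Ok(())
}
```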
+ +# TESTS FOR SORTED PARQUET FILES + +# Set 2 partitions for deterministic output plans +statement ok +set datafusion.execution.target_partitions = 2; + +# Collect statistics -- used for sorting files +statement ok +set datafusion.execution.collect_statistics = true; + +# Create a table as a data source +statement ok +CREATE TABLE src_table ( + int_col INT, + string_col TEXT, + bigint_col BIGINT, + date_col DATE +) AS VALUES +(1, 'aaa', 100, 1), +(2, 'bbb', 200, 2), +(3, 'ccc', 300, 3), +(4, 'ddd', 400, 4), +(5, 'eee', 500, 5), +(6, 'fff', 600, 6), +(7, 'ggg', 700, 7), +(8, 'hhh', 800, 8), +(9, 'iii', 900, 9); + +# Setup 2 files, i.e., as many as there are partitions: + +# File 1: +query ITID +COPY (SELECT * FROM src_table LIMIT 3) +TO 'test_files/scratch/parquet/test_table/0.parquet' +STORED AS PARQUET; +---- +3 + +# File 2: +query ITID +COPY (SELECT * FROM src_table WHERE int_col > 3 LIMIT 3) +TO 'test_files/scratch/parquet/test_table/1.parquet' +STORED AS PARQUET; +---- +3 + +# Create a table from generated parquet files, without ordering: +statement ok +CREATE EXTERNAL TABLE test_table ( + int_col INT, + string_col TEXT, + bigint_col BIGINT, + date_col DATE +) +STORED AS PARQUET +WITH HEADER ROW +LOCATION 'test_files/scratch/parquet/test_table'; + +# Basic query: +query ITID +SELECT * FROM test_table ORDER BY int_col; +---- +1 aaa 100 1970-01-02 +2 bbb 200 1970-01-03 +3 ccc 300 1970-01-04 +4 ddd 400 1970-01-05 +5 eee 500 1970-01-06 +6 fff 600 1970-01-07 + +# Check output plan, expect no "output_ordering" clause in the physical_plan -> ParquetExec: +query TT +EXPLAIN SELECT int_col, string_col +FROM test_table +ORDER BY string_col, int_col; +---- +logical_plan +Sort: test_table.string_col ASC NULLS LAST, test_table.int_col ASC NULLS LAST +--TableScan: test_table projection=[int_col, string_col] +physical_plan +SortPreservingMergeExec: [string_col@1 ASC NULLS LAST,int_col@0 ASC NULLS LAST] +--SortExec: expr=[string_col@1 ASC NULLS LAST,int_col@0 ASC NULLS LAST] +----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet]]}, projection=[int_col, string_col] + +# Tear down test_table: +statement ok +DROP TABLE test_table; + +# Create test_table again, but with ordering: +statement ok +CREATE EXTERNAL TABLE test_table ( + int_col INT, + string_col TEXT, + bigint_col BIGINT, + date_col DATE +) +STORED AS PARQUET +WITH HEADER ROW +WITH ORDER (string_col ASC NULLS LAST, int_col ASC NULLS LAST) +LOCATION 'test_files/scratch/parquet/test_table'; + +# Check output plan, expect an "output_ordering" clause in the physical_plan -> ParquetExec: +query TT +EXPLAIN SELECT int_col, string_col +FROM test_table +ORDER BY string_col, int_col; +---- +logical_plan +Sort: test_table.string_col ASC NULLS LAST, test_table.int_col ASC NULLS LAST +--TableScan: test_table projection=[int_col, string_col] +physical_plan +SortPreservingMergeExec: [string_col@1 ASC NULLS LAST,int_col@0 ASC NULLS LAST] +--ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet]]}, projection=[int_col, string_col], output_ordering=[string_col@1 ASC NULLS LAST, int_col@0 ASC NULLS LAST] + +# Add another file to the directory underlying test_table +query ITID +COPY (SELECT * FROM src_table WHERE int_col > 6 LIMIT 3) +TO 
'test_files/scratch/parquet/test_table/2.parquet' +STORED AS PARQUET; +---- +3 + +# Do one more test, but order by numeric columns: +# This is to exercise file group sorting, which uses file-level statistics +# DataFusion doesn't currently support string column statistics +statement ok +CREATE EXTERNAL TABLE test_table ( + int_col INT NOT NULL, + string_col TEXT NOT NULL, + bigint_col BIGINT NOT NULL, + date_col DATE NOT NULL +) +STORED AS PARQUET +WITH HEADER ROW +WITH ORDER (int_col ASC NULLS LAST, bigint_col ASC NULLS LAST) +LOCATION 'test_files/scratch/parquet/test_table'; + +# Check output plan again, expect an "output_ordering" clause in the physical_plan -> ParquetExec: +# After https://github.com/apache/arrow-datafusion/pull/9593 this should not require a sort. +query TT +EXPLAIN SELECT int_col, bigint_col +FROM test_table +ORDER BY int_col, bigint_col; +---- +logical_plan +Sort: test_table.int_col ASC NULLS LAST, test_table.bigint_col ASC NULLS LAST +--TableScan: test_table projection=[int_col, bigint_col] +physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet]]}, projection=[int_col, bigint_col], output_ordering=[int_col@0 ASC NULLS LAST, bigint_col@1 ASC NULLS LAST] + + +# Clean up +statement ok +DROP TABLE test_table; + +# Setup alltypes_plain table: +statement ok +CREATE EXTERNAL TABLE alltypes_plain ( + id INT NOT NULL, + bool_col BOOLEAN NOT NULL, + tinyint_col TINYINT NOT NULL, + smallint_col SMALLINT NOT NULL, + int_col INT NOT NULL, + bigint_col BIGINT NOT NULL, + float_col FLOAT NOT NULL, + double_col DOUBLE NOT NULL, + date_string_col BYTEA NOT NULL, + string_col VARCHAR NOT NULL, + timestamp_col TIMESTAMP NOT NULL, +) +STORED AS PARQUET +WITH HEADER ROW +LOCATION '../../parquet-testing/data/alltypes_plain.parquet' + +# Test a basic query with a CAST: +query IT +SELECT id, CAST(string_col AS varchar) FROM alltypes_plain +---- +4 0 +5 1 +6 0 +7 1 +2 0 +3 1 +0 0 +1 1 + +# Clean up +statement ok +DROP TABLE alltypes_plain; + +# Perform SELECT on table with fixed sized binary columns + +statement ok +CREATE EXTERNAL TABLE test_binary +STORED AS PARQUET +WITH HEADER ROW +LOCATION '../core/tests/data/test_binary.parquet'; + +# Check size of table: +query I +SELECT count(ids) FROM test_binary; +---- +466 + +# Do the SELECT query: +query ? 
+SELECT ids FROM test_binary ORDER BY ids LIMIT 10; +---- +008c7196f68089ab692e4739c5fd16b5 +00a51a7bc5ff8eb1627f8f3dc959dce8 +0166ce1d46129ad104fa4990c6057c91 +03a4893f3285b422820b4cd74c9b9786 +04999ac861e14682cd339eae2cc74359 +04b86bf8f228739fde391f850636a77d +050fb9cf722a709eb94b70b3ee7dc342 +052578a65e8e91b8526b182d40e846e8 +05408e6a403e4296526006e20cc4a45a +0592e6fb7d7169b888a4029b53abb701 + +# Clean up +statement ok +DROP TABLE test_binary; + +# Perform a query with a window function and timestamp data: + +statement ok +CREATE EXTERNAL TABLE timestamp_with_tz +STORED AS PARQUET +WITH HEADER ROW +LOCATION '../core/tests/data/timestamp_with_tz.parquet'; + +# Check size of table: +query I +SELECT COUNT(*) FROM timestamp_with_tz; +---- +131072 + +# Perform the query: +query IPT +SELECT + count, + LAG(timestamp, 1) OVER (ORDER BY timestamp), + arrow_typeof(LAG(timestamp, 1) OVER (ORDER BY timestamp)) +FROM timestamp_with_tz +LIMIT 10; +---- +0 NULL Timestamp(Millisecond, Some("UTC")) +0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +4 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +14 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) + +# Test config listing_table_ignore_subdirectory: + +query ITID +COPY (SELECT * FROM src_table WHERE int_col > 6 LIMIT 3) +TO 'test_files/scratch/parquet/test_table/subdir/3.parquet' +STORED AS PARQUET; +---- +3 + +statement ok +CREATE EXTERNAL TABLE listing_table +STORED AS PARQUET +WITH HEADER ROW +LOCATION 'test_files/scratch/parquet/test_table/*.parquet'; + +statement ok +set datafusion.execution.listing_table_ignore_subdirectory = true; + +# scan file: 0.parquet 1.parquet 2.parquet +query I +select count(*) from listing_table; +---- +9 + +statement ok +set datafusion.execution.listing_table_ignore_subdirectory = false; + +# scan file: 0.parquet 1.parquet 2.parquet 3.parquet +query I +select count(*) from listing_table; +---- +12 + +# Clean up +statement ok +DROP TABLE timestamp_with_tz; + +# Test a query from the single_nan data set: +statement ok +CREATE EXTERNAL TABLE single_nan +STORED AS PARQUET +WITH HEADER ROW +LOCATION '../../parquet-testing/data/single_nan.parquet'; + +# Check table size: +query I +SELECT COUNT(*) FROM single_nan; +---- +1 + +# Query for the single NULL: +query R +SELECT mycol FROM single_nan; +---- +NULL + +# Clean up +statement ok +DROP TABLE single_nan; + +statement ok +CREATE EXTERNAL TABLE list_columns +STORED AS PARQUET +WITH HEADER ROW +LOCATION '../../parquet-testing/data/list_columns.parquet'; + +query ?? 
+SELECT int64_list, utf8_list FROM list_columns
+----
+[1, 2, 3] [abc, efg, hij]
+[, 1] NULL
+[4] [efg, , hij, xyz]
+
+statement ok
+DROP TABLE list_columns;
+
+# Clean up
+statement ok
+DROP TABLE listing_table;

From 0153acfb060b9ee2d4c03c8d783e7a5df39eaf9b Mon Sep 17 00:00:00 2001
From: suremarc <8771538+suremarc@users.noreply.github.com>
Date: Thu, 25 Apr 2024 01:29:24 -0500
Subject: [PATCH 32/49] lots of test cases in new slt

---
 .../physical_plan/file_scan_config.rs         |  15 +-
 .../core/src/datasource/physical_plan/mod.rs  | 113 ++---
 .../test_files/parquet_sorted_statistics.slt  | 423 +++++++-----------
 3 files changed, 241 insertions(+), 310 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
index cb7c16e842d6..1a5055df793a 100644
--- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
+++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
@@ -142,15 +142,11 @@ impl FileScanConfig {
             Schema::new(table_fields).with_metadata(self.file_schema.metadata().clone()),
         );
 
-        let full_table_schema = {
-            let mut all_table_fields: Vec<_> =
-                self.file_schema.all_fields().into_iter().cloned().collect();
-            all_table_fields.extend(self.table_partition_cols.clone());
-            Arc::new(Schema::new(all_table_fields))
-        };
-
-        let projected_output_ordering =
-            get_projected_output_ordering(self, &projected_schema, &full_table_schema);
+        let projected_output_ordering = get_projected_output_ordering(
+            self,
+            &projected_schema,
+            self.projection.as_deref(),
+        );
 
         (projected_schema, table_stats, projected_output_ordering)
     }
@@ -231,6 +227,7 @@ impl FileScanConfig {
         let statistics = MinMaxStatistics::new_from_files(
             sort_order,
             table_schema,
+            None,
             flattened_files.iter().copied(),
         )
         .map_err(|e| {
diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs
index 147b960102d0..17ad3dfcdb91 100644
--- a/datafusion/core/src/datasource/physical_plan/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/mod.rs
@@ -448,10 +448,34 @@ impl From<ObjectMeta> for FileMeta {
 fn get_projected_output_ordering(
     base_config: &FileScanConfig,
     projected_schema: &SchemaRef,
-    table_schema: &SchemaRef,
+    projection: Option<&[usize]>,
 ) -> Vec<Vec<PhysicalSortExpr>> {
     let mut all_orderings = vec![];
-    for output_ordering in &base_config.output_ordering {
+    'orderings: for output_ordering in &base_config.output_ordering {
+        let mut new_ordering = vec![];
+        for PhysicalSortExpr { expr, options } in output_ordering {
+            if let Some(col) = expr.as_any().downcast_ref::<Column>() {
+                let name = col.name();
+                if let Some((idx, _)) = projected_schema.column_with_name(name) {
+                    // Compute the new sort expression (with correct index) after projection:
+                    new_ordering.push(PhysicalSortExpr {
+                        expr: Arc::new(Column::new(name, idx)),
+                        options: *options,
+                    });
+                    continue;
+                }
+            }
+            // Cannot find expression in the projected_schema, stop iterating
+            // since rest of the orderings are violated
+            continue 'orderings;
+        }
+
+        // do not push empty entries
+        // otherwise we may have `Some(vec![])` at the output ordering.
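+        // (guards against an empty `output_ordering` producing a trivial `Some(vec![])` entry)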
+        if new_ordering.is_empty() {
+            continue;
+        }
+
         // Check if any file groups are not sorted
         if base_config.file_groups.iter().any(|group| {
             if group.len() <= 1 {
@@ -460,8 +484,9 @@ fn get_projected_output_ordering(
             }
 
             let statistics = match MinMaxStatistics::new_from_files(
-                output_ordering,
-                table_schema,
+                &new_ordering,
+                projected_schema,
+                projection,
                 group,
             ) {
                 Ok(statistics) => statistics,
@@ -478,31 +503,10 @@ fn get_projected_output_ordering(
                 Some file groups couldn't be determined to be sorted: {:?}",
                 base_config.output_ordering[0], base_config.file_groups
             );
-            return vec![];
+            continue;
         }
 
-        let mut new_ordering = vec![];
-        for PhysicalSortExpr { expr, options } in output_ordering {
-            if let Some(col) = expr.as_any().downcast_ref::<Column>() {
-                let name = col.name();
-                if let Some((idx, _)) = projected_schema.column_with_name(name) {
-                    // Compute the new sort expression (with correct index) after projection:
-                    new_ordering.push(PhysicalSortExpr {
-                        expr: Arc::new(Column::new(name, idx)),
-                        options: *options,
-                    });
-                    continue;
-                }
-            }
-            // Cannot find expression in the projected_schema, stop iterating
-            // since rest of the orderings are violated
-            break;
-        }
-        // do not push empty entries
-        // otherwise we may have `Some(vec![])` at the output ordering.
-        if !new_ordering.is_empty() {
-            all_orderings.push(new_ordering);
-        }
+
+        all_orderings.push(new_ordering);
     }
     all_orderings
 }
@@ -523,8 +527,9 @@ impl MinMaxStatistics {
     }
 
     fn new_from_files<'a>(
-        sort_order: &[PhysicalSortExpr],
-        table_schema: &SchemaRef,
+        projected_sort_order: &[PhysicalSortExpr],
+        projected_schema: &SchemaRef,
+        projection: Option<&[usize]>,
         files: impl IntoIterator<Item = &'a PartitionedFile>,
     ) -> Result<Self> {
         use datafusion_common::ScalarValue;
@@ -541,6 +546,7 @@ impl MinMaxStatistics {
             DataFusionError::Plan("Parquet file missing statistics".to_string())
         })?;
 
+        // Helper function to get min/max statistics for a given column of projected_schema
         let get_min_max = |i: usize| -> Result<(Vec<ScalarValue>, Vec<ScalarValue>)> {
             Ok(statistics_and_partition_values
                 .iter()
@@ -564,26 +570,33 @@ impl MinMaxStatistics {
             .unzip())
         };
 
-        let sort_columns = sort_columns_from_physical_sort_exprs(sort_order).ok_or(
-            DataFusionError::Plan("sort expression must be on column".to_string()),
-        )?;
+        let sort_columns = sort_columns_from_physical_sort_exprs(projected_sort_order)
+            .ok_or(DataFusionError::Plan(
+                "sort expression must be on column".to_string(),
+            ))?;
 
-        let projected_schema = Arc::new(
-            table_schema
+        // Project the schema & sort order down to just the relevant columns
+        let min_max_schema = Arc::new(
+            projected_schema
                 .project(&(sort_columns.iter().map(|c| c.index()).collect::<Vec<_>>()))?,
         );
+        let min_max_sort_order = sort_columns
+            .iter()
+            .zip(projected_sort_order.iter())
+            .enumerate()
+            .map(|(i, (col, sort))| PhysicalSortExpr {
+                expr: Arc::new(Column::new(col.name(), i)),
+                options: sort.options,
+            })
+            .collect::<Vec<_>>();
 
-        let (min_values, max_values): (Vec<_>, Vec<_>) = projected_schema
-            .fields()
+        let (min_values, max_values): (Vec<_>, Vec<_>) = sort_columns
             .iter()
-            .map(|field| {
-                let (min, max) =
-                    get_min_max(table_schema.index_of(field.name()).map_err(|e| {
-                        DataFusionError::ArrowError(
-                            e,
-                            Some(format!("get min/max for field: '{}'", field.name())),
-                        )
-                    })?)?;
+            .map(|c| {
+                let i = projection.map(|p| p[c.index()]).unwrap_or(c.index());
+                let (min, max) = get_min_max(i).map_err(|e| {
+                    e.context(format!("get min/max for column: '{}'", c.name()))
+                })?;
                 Ok((
                     ScalarValue::iter_to_array(min)?,
                     ScalarValue::iter_to_array(max)?,
                 ))
             })
@@ -595,14 +608,14 @@ impl MinMaxStatistics {
             .unzip();
 
         Self::new(
-            sort_order,
-            table_schema,
-            RecordBatch::try_new(Arc::clone(&projected_schema), min_values).map_err(
+            &min_max_sort_order,
+            &min_max_schema,
+            RecordBatch::try_new(Arc::clone(&min_max_schema), min_values).map_err(
                 |e| {
                     DataFusionError::ArrowError(e, Some("\ncreate min batch".to_string()))
                 },
             )?,
-            RecordBatch::try_new(Arc::clone(&projected_schema), max_values).map_err(
+            RecordBatch::try_new(Arc::clone(&min_max_schema), max_values).map_err(
                 |e| {
                     DataFusionError::ArrowError(e, Some("\ncreate max batch".to_string()))
                 },
@@ -612,7 +625,7 @@ impl MinMaxStatistics {
 
     fn new(
         sort_order: &[PhysicalSortExpr],
-        table_schema: &SchemaRef,
+        schema: &SchemaRef,
         min_values: RecordBatch,
         max_values: RecordBatch,
     ) -> Result<Self> {
@@ -622,7 +635,7 @@ impl MinMaxStatistics {
             .iter()
             .map(|expr| {
                 expr.expr
-                    .data_type(table_schema)
+                    .data_type(schema)
                     .map(|data_type| SortField::new_with_options(data_type, expr.options))
             })
             .collect::<Result<Vec<_>>>()
diff --git a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt
index bd1d18f6588b..a4a98a61df02 100644
--- a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt
+++ b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt
@@ -29,324 +29,245 @@ set datafusion.execution.collect_statistics = true;
 statement ok
 CREATE TABLE src_table (
   int_col INT,
+  descending_col INT,
   string_col TEXT,
   bigint_col BIGINT,
-  date_col DATE
+  date_col DATE,
+  overlapping_col INT,
+  constant_col INT
 ) AS VALUES
-(1, 'aaa', 100, 1),
-(2, 'bbb', 200, 2),
-(3, 'ccc', 300, 3),
-(4, 'ddd', 400, 4),
-(5, 'eee', 500, 5),
-(6, 'fff', 600, 6),
-(7, 'ggg', 700, 7),
-(8, 'hhh', 800, 8),
-(9, 'iii', 900, 9);
-
-# Setup 2 files, i.e., as many as there are partitions:
+-- first file
+(1, 3, 'aaa', 100, 1, 0, 0),
+(2, 2, 'bbb', 200, 2, 1, 0),
+(3, 1, 'ccc', 300, 3, 2, 0),
+-- second file
+(4, 6, 'ddd', 400, 4, 0, 0),
+(5, 5, 'eee', 500, 5, 1, 0),
+(6, 4, 'fff', 600, 6, 2, 0),
+-- third file
+(7, 9, 'ggg', 700, 7, 3, 0),
+(8, 8, 'hhh', 800, 8, 4, 0),
+(9, 7, 'iii', 900, 9, 5, 0);
+
+# Setup 3 files, in particular more files than there are partitions
 
 # File 1:
-query ITID
-COPY (SELECT * FROM src_table LIMIT 3)
-TO 'test_files/scratch/parquet/test_table/0.parquet'
+query IITIDII
+COPY (SELECT * FROM src_table ORDER BY int_col LIMIT 3)
+TO 'test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet'
 STORED AS PARQUET;
 ----
 3
 
 # File 2:
-query ITID
-COPY (SELECT * FROM src_table WHERE int_col > 3 LIMIT 3)
-TO 'test_files/scratch/parquet/test_table/1.parquet'
+query IITIDII
+COPY (SELECT * FROM src_table WHERE int_col > 3 ORDER BY int_col LIMIT 3)
+TO 'test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet'
 STORED AS PARQUET;
 ----
 3
 
-# Create a table from generated parquet files, without ordering:
+# Add another file to the directory underlying test_table
+query IITIDII
+COPY (SELECT * FROM src_table WHERE int_col > 6 ORDER BY int_col LIMIT 3)
+TO 'test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet'
+STORED AS PARQUET;
+----
+3
+
+
+# Create a table from generated parquet files:
 statement ok
 CREATE EXTERNAL TABLE test_table (
-  int_col INT,
-  string_col TEXT,
-  bigint_col BIGINT,
-  date_col DATE
+  partition_col TEXT NOT NULL,
+  int_col INT NOT NULL,
+  descending_col INT NOT NULL,
+  string_col TEXT NOT NULL,
+  bigint_col BIGINT NOT NULL,
+  date_col DATE NOT NULL,
+  overlapping_col INT NOT 
NULL, + constant_col INT NOT NULL ) STORED AS PARQUET -WITH HEADER ROW -LOCATION 'test_files/scratch/parquet/test_table'; +PARTITIONED BY (partition_col) +WITH ORDER (int_col ASC NULLS LAST, bigint_col ASC NULLS LAST) +LOCATION 'test_files/scratch/parquet_sorted_statistics/test_table'; -# Basic query: -query ITID -SELECT * FROM test_table ORDER BY int_col; +# Order by numeric columns +# This is to exercise file group sorting, which uses file-level statistics +# DataFusion doesn't currently support string column statistics +# This should not require a sort. +query TT +EXPLAIN SELECT int_col, bigint_col +FROM test_table +ORDER BY int_col, bigint_col; ---- -1 aaa 100 1970-01-02 -2 bbb 200 1970-01-03 -3 ccc 300 1970-01-04 -4 ddd 400 1970-01-05 -5 eee 500 1970-01-06 -6 fff 600 1970-01-07 +logical_plan +01)Sort: test_table.int_col ASC NULLS LAST, test_table.bigint_col ASC NULLS LAST +02)--TableScan: test_table projection=[int_col, bigint_col] +physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[int_col, bigint_col], output_ordering=[int_col@0 ASC NULLS LAST, bigint_col@1 ASC NULLS LAST] -# Check output plan, expect no "output_ordering" clause in the physical_plan -> ParquetExec: +# Same query but check that the output is actually ordered: +query IITIDIIT +SELECT * FROM test_table ORDER BY int_col, bigint_col; +---- +1 3 aaa 100 1970-01-02 0 0 A +2 2 bbb 200 1970-01-03 1 0 A +3 1 ccc 300 1970-01-04 2 0 A +4 6 ddd 400 1970-01-05 0 0 B +5 5 eee 500 1970-01-06 1 0 B +6 4 fff 600 1970-01-07 2 0 B +7 9 ggg 700 1970-01-08 3 0 C +8 8 hhh 800 1970-01-09 4 0 C +9 7 iii 900 1970-01-10 5 0 C + + +# Another planning test, but project on a column with unsupported statistics +# We should be able to ignore this and look at only the relevant statistics query TT -EXPLAIN SELECT int_col, string_col +EXPLAIN SELECT string_col FROM test_table -ORDER BY string_col, int_col; +ORDER BY int_col, bigint_col; ---- logical_plan -Sort: test_table.string_col ASC NULLS LAST, test_table.int_col ASC NULLS LAST ---TableScan: test_table projection=[int_col, string_col] +01)Projection: test_table.string_col +02)--Sort: test_table.int_col ASC NULLS LAST, test_table.bigint_col ASC NULLS LAST +03)----Projection: test_table.string_col, test_table.int_col, test_table.bigint_col +04)------TableScan: test_table projection=[int_col, string_col, bigint_col] physical_plan -SortPreservingMergeExec: [string_col@1 ASC NULLS LAST,int_col@0 ASC NULLS LAST] ---SortExec: expr=[string_col@1 ASC NULLS LAST,int_col@0 ASC NULLS LAST] -----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet]]}, projection=[int_col, string_col] +01)ProjectionExec: expr=[string_col@1 as string_col] +02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet, 
WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[int_col, string_col, bigint_col], output_ordering=[int_col@0 ASC NULLS LAST, bigint_col@2 ASC NULLS LAST] -# Tear down test_table: +# Clean up & recreate but sort on descending column statement ok DROP TABLE test_table; -# Create test_table again, but with ordering: statement ok CREATE EXTERNAL TABLE test_table ( - int_col INT, - string_col TEXT, - bigint_col BIGINT, - date_col DATE + partition_col TEXT NOT NULL, + int_col INT NOT NULL, + descending_col INT NOT NULL, + string_col TEXT NOT NULL, + bigint_col BIGINT NOT NULL, + date_col DATE NOT NULL, + overlapping_col INT NOT NULL, + constant_col INT NOT NULL ) STORED AS PARQUET -WITH HEADER ROW -WITH ORDER (string_col ASC NULLS LAST, int_col ASC NULLS LAST) -LOCATION 'test_files/scratch/parquet/test_table'; +PARTITIONED BY (partition_col) +WITH ORDER (descending_col DESC NULLS LAST, bigint_col ASC NULLS LAST) +LOCATION 'test_files/scratch/parquet_sorted_statistics/test_table'; -# Check output plan, expect an "output_ordering" clause in the physical_plan -> ParquetExec: +# Query order by descending_col +# This should order the files like [C, B, A] query TT -EXPLAIN SELECT int_col, string_col +EXPLAIN SELECT descending_col, bigint_col FROM test_table -ORDER BY string_col, int_col; +ORDER BY descending_col DESC NULLS LAST, bigint_col ASC NULLS LAST; ---- logical_plan -Sort: test_table.string_col ASC NULLS LAST, test_table.int_col ASC NULLS LAST ---TableScan: test_table projection=[int_col, string_col] -physical_plan -SortPreservingMergeExec: [string_col@1 ASC NULLS LAST,int_col@0 ASC NULLS LAST] ---ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet]]}, projection=[int_col, string_col], output_ordering=[string_col@1 ASC NULLS LAST, int_col@0 ASC NULLS LAST] +01)Sort: test_table.descending_col DESC NULLS LAST, test_table.bigint_col ASC NULLS LAST +02)--TableScan: test_table projection=[descending_col, bigint_col] +physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet]]}, projection=[descending_col, bigint_col], output_ordering=[descending_col@0 DESC NULLS LAST, bigint_col@1 ASC NULLS LAST] -# Add another file to the directory underlying test_table -query ITID -COPY (SELECT * FROM src_table WHERE int_col > 6 LIMIT 3) -TO 'test_files/scratch/parquet/test_table/2.parquet' -STORED AS PARQUET; ----- -3 +# Clean up & re-create with partition columns in sort order +statement ok +DROP TABLE test_table; -# Do one more test, but order by numeric columns: -# This is to exercise file group sorting, which uses file-level statistics -# DataFusion doesn't currently support string column statistics statement ok CREATE EXTERNAL TABLE test_table ( + partition_col TEXT NOT NULL, int_col INT NOT NULL, + descending_col INT NOT NULL, string_col TEXT NOT NULL, bigint_col BIGINT NOT NULL, - date_col DATE NOT NULL + date_col DATE NOT NULL, + overlapping_col INT NOT NULL, + constant_col INT NOT NULL ) STORED AS PARQUET 
-WITH HEADER ROW -WITH ORDER (int_col ASC NULLS LAST, bigint_col ASC NULLS LAST) -LOCATION 'test_files/scratch/parquet/test_table'; - -# Check output plan again, expect an "output_ordering" clause in the physical_plan -> ParquetExec: -# After https://github.com/apache/arrow-datafusion/pull/9593 this should not require a sort. +PARTITIONED BY (partition_col) +WITH ORDER (partition_col ASC NULLS LAST, int_col ASC NULLS LAST, bigint_col ASC NULLS LAST) +LOCATION 'test_files/scratch/parquet_sorted_statistics/test_table'; + +# Order with partition column first +# In particular, the partition column is a string +# Even though statistics for string columns are not supported, +# string partition columns are common and we do support sorting file groups on them query TT -EXPLAIN SELECT int_col, bigint_col +EXPLAIN SELECT int_col, bigint_col, partition_col FROM test_table -ORDER BY int_col, bigint_col; +ORDER BY partition_col, int_col, bigint_col; ---- logical_plan -Sort: test_table.int_col ASC NULLS LAST, test_table.bigint_col ASC NULLS LAST ---TableScan: test_table projection=[int_col, bigint_col] -physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet]]}, projection=[int_col, bigint_col], output_ordering=[int_col@0 ASC NULLS LAST, bigint_col@1 ASC NULLS LAST] - +01)Sort: test_table.partition_col ASC NULLS LAST, test_table.int_col ASC NULLS LAST, test_table.bigint_col ASC NULLS LAST +02)--TableScan: test_table projection=[int_col, bigint_col, partition_col] +physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[int_col, bigint_col, partition_col], output_ordering=[partition_col@2 ASC NULLS LAST, int_col@0 ASC NULLS LAST, bigint_col@1 ASC NULLS LAST] -# Clean up +# Clean up & re-create with overlapping column in sort order +# This will test the ability to sort files with overlapping statistics statement ok DROP TABLE test_table; -# Setup alltypes_plain table: statement ok -CREATE EXTERNAL TABLE alltypes_plain ( - id INT NOT NULL, - bool_col BOOLEAN NOT NULL, - tinyint_col TINYINT NOT NULL, - smallint_col SMALLINT NOT NULL, +CREATE EXTERNAL TABLE test_table ( + partition_col TEXT NOT NULL, int_col INT NOT NULL, + descending_col INT NOT NULL, + string_col TEXT NOT NULL, bigint_col BIGINT NOT NULL, - float_col FLOAT NOT NULL, - double_col DOUBLE NOT NULL, - date_string_col BYTEA NOT NULL, - string_col VARCHAR NOT NULL, - timestamp_col TIMESTAMP NOT NULL, + date_col DATE NOT NULL, + overlapping_col INT NOT NULL, + constant_col INT NOT NULL ) STORED AS PARQUET -WITH HEADER ROW -LOCATION '../../parquet-testing/data/alltypes_plain.parquet' - -# Test a basic query with a CAST: -query IT -SELECT id, CAST(string_col AS varchar) FROM alltypes_plain ----- -4 0 -5 1 -6 0 -7 1 -2 0 -3 1 -0 0 -1 1 - -# Clean up -statement ok -DROP TABLE alltypes_plain; - -# Perform SELECT on table with fixed sized binary columns - -statement ok -CREATE EXTERNAL TABLE test_binary -STORED AS PARQUET 
-WITH HEADER ROW -LOCATION '../core/tests/data/test_binary.parquet'; - -# Check size of table: -query I -SELECT count(ids) FROM test_binary; ----- -466 - -# Do the SELECT query: -query ? -SELECT ids FROM test_binary ORDER BY ids LIMIT 10; ----- -008c7196f68089ab692e4739c5fd16b5 -00a51a7bc5ff8eb1627f8f3dc959dce8 -0166ce1d46129ad104fa4990c6057c91 -03a4893f3285b422820b4cd74c9b9786 -04999ac861e14682cd339eae2cc74359 -04b86bf8f228739fde391f850636a77d -050fb9cf722a709eb94b70b3ee7dc342 -052578a65e8e91b8526b182d40e846e8 -05408e6a403e4296526006e20cc4a45a -0592e6fb7d7169b888a4029b53abb701 - -# Clean up -statement ok -DROP TABLE test_binary; - -# Perform a query with a window function and timestamp data: - -statement ok -CREATE EXTERNAL TABLE timestamp_with_tz -STORED AS PARQUET -WITH HEADER ROW -LOCATION '../core/tests/data/timestamp_with_tz.parquet'; - -# Check size of table: -query I -SELECT COUNT(*) FROM timestamp_with_tz; ----- -131072 - -# Perform the query: -query IPT -SELECT - count, - LAG(timestamp, 1) OVER (ORDER BY timestamp), - arrow_typeof(LAG(timestamp, 1) OVER (ORDER BY timestamp)) -FROM timestamp_with_tz -LIMIT 10; ----- -0 NULL Timestamp(Millisecond, Some("UTC")) -0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -4 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -14 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) - -# Test config listing_table_ignore_subdirectory: - -query ITID -COPY (SELECT * FROM src_table WHERE int_col > 6 LIMIT 3) -TO 'test_files/scratch/parquet/test_table/subdir/3.parquet' -STORED AS PARQUET; ----- -3 - -statement ok -CREATE EXTERNAL TABLE listing_table -STORED AS PARQUET -WITH HEADER ROW -LOCATION 'test_files/scratch/parquet/test_table/*.parquet'; - -statement ok -set datafusion.execution.listing_table_ignore_subdirectory = true; - -# scan file: 0.parquet 1.parquet 2.parquet -query I -select count(*) from listing_table; ----- -9 - -statement ok -set datafusion.execution.listing_table_ignore_subdirectory = false; - -# scan file: 0.parquet 1.parquet 2.parquet 3.parquet -query I -select count(*) from listing_table; ----- -12 - -# Clean up -statement ok -DROP TABLE timestamp_with_tz; - -# Test a query from the single_nan data set: -statement ok -CREATE EXTERNAL TABLE single_nan -STORED AS PARQUET -WITH HEADER ROW -LOCATION '../../parquet-testing/data/single_nan.parquet'; +PARTITIONED BY (partition_col) +WITH ORDER (overlapping_col ASC NULLS LAST) +LOCATION 'test_files/scratch/parquet_sorted_statistics/test_table'; -# Check table size: -query I -SELECT COUNT(*) FROM single_nan; ----- -1 - -# Query for the single NULL: -query R -SELECT mycol FROM single_nan; +query TT +EXPLAIN SELECT int_col, bigint_col, overlapping_col +FROM test_table +ORDER BY overlapping_col; ---- -NULL +logical_plan +01)Sort: test_table.overlapping_col ASC NULLS LAST +02)--TableScan: test_table projection=[int_col, bigint_col, overlapping_col] +physical_plan +01)SortPreservingMergeExec: [overlapping_col@2 ASC NULLS LAST] +02)--ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, 
WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet]]}, projection=[int_col, bigint_col, overlapping_col], output_ordering=[overlapping_col@2 ASC NULLS LAST] -# Clean up +# Clean up & re-create with constant column in sort order +# This will require a sort because the # of required file groups (3) +# exceeds the # of target partitions (2) statement ok -DROP TABLE single_nan; +DROP TABLE test_table; statement ok -CREATE EXTERNAL TABLE list_columns +CREATE EXTERNAL TABLE test_table ( + partition_col TEXT NOT NULL, + int_col INT NOT NULL, + descending_col INT NOT NULL, + string_col TEXT NOT NULL, + bigint_col BIGINT NOT NULL, + date_col DATE NOT NULL, + overlapping_col INT NOT NULL, + constant_col INT NOT NULL +) STORED AS PARQUET -WITH HEADER ROW -LOCATION '../../parquet-testing/data/list_columns.parquet'; +PARTITIONED BY (partition_col) +WITH ORDER (constant_col ASC NULLS LAST) +LOCATION 'test_files/scratch/parquet_sorted_statistics/test_table'; -query ?? -SELECT int64_list, utf8_list FROM list_columns +query TT +EXPLAIN SELECT constant_col +FROM test_table +ORDER BY constant_col; ---- -[1, 2, 3] [abc, efg, hij] -[, 1] NULL -[4] [efg, , hij, xyz] - -statement ok -DROP TABLE list_columns; - -# Clean up -statement ok -DROP TABLE listing_table; +logical_plan +01)Sort: test_table.constant_col ASC NULLS LAST +02)--TableScan: test_table projection=[constant_col] +physical_plan +01)SortPreservingMergeExec: [constant_col@0 ASC NULLS LAST] +02)--SortExec: expr=[constant_col@0 ASC NULLS LAST] +03)----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[constant_col] From 4e0352870dccb1372f69000e591b943b60f6a427 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Thu, 25 Apr 2024 01:39:21 -0500 Subject: [PATCH 33/49] remove output check --- .../test_files/parquet_sorted_statistics.slt | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt index a4a98a61df02..93a3a4e56a22 100644 --- a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt +++ b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt @@ -107,21 +107,6 @@ logical_plan 02)--TableScan: test_table projection=[int_col, bigint_col] physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[int_col, bigint_col], output_ordering=[int_col@0 ASC NULLS LAST, bigint_col@1 ASC NULLS LAST] -# Same query but check that the output is actually ordered: -query IITIDIIT -SELECT * FROM test_table ORDER BY int_col, bigint_col; ----- -1 3 aaa 100 
1970-01-02 0 0 A
-2 2 bbb 200 1970-01-03 1 0 A
-3 1 ccc 300 1970-01-04 2 0 A
-4 6 ddd 400 1970-01-05 0 0 B
-5 5 eee 500 1970-01-06 1 0 B
-6 4 fff 600 1970-01-07 2 0 B
-7 9 ggg 700 1970-01-08 3 0 C
-8 8 hhh 800 1970-01-09 4 0 C
-9 7 iii 900 1970-01-10 5 0 C
-
-
 # Another planning test, but project on a column with unsupported statistics
 # We should be able to ignore this and look at only the relevant statistics
 query TT

From 695e6743f0e9ba74c8e8b7834a7cce19424201e2 Mon Sep 17 00:00:00 2001
From: suremarc <8771538+suremarc@users.noreply.github.com>
Date: Thu, 25 Apr 2024 02:10:36 -0500
Subject: [PATCH 34/49] fix

---
 datafusion/core/src/datasource/physical_plan/mod.rs | 4 ++--
 datafusion/sqllogictest/test_files/parquet.slt      | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs
index 17ad3dfcdb91..691b86836026 100644
--- a/datafusion/core/src/datasource/physical_plan/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/mod.rs
@@ -451,7 +451,7 @@ fn get_projected_output_ordering(
     let mut all_orderings = vec![];
-    'orderings: for output_ordering in &base_config.output_ordering {
+    for output_ordering in &base_config.output_ordering {
         let mut new_ordering = vec![];
         for PhysicalSortExpr { expr, options } in output_ordering {
             if let Some(col) = expr.as_any().downcast_ref::<Column>() {
@@ -467,7 +467,7 @@ fn get_projected_output_ordering(
             }
             // Cannot find expression in the projected_schema, stop iterating
             // since rest of the orderings are violated
-            continue 'orderings;
+            break;
         }
 
diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt
index 5f8cd531c675..a83fc0274a57 100644
--- a/datafusion/sqllogictest/test_files/parquet.slt
+++ b/datafusion/sqllogictest/test_files/parquet.slt
@@ -143,8 +143,8 @@ logical_plan
 01)Sort: test_table.string_col ASC NULLS LAST, test_table.int_col ASC NULLS LAST
 02)--TableScan: test_table projection=[int_col, string_col]
 physical_plan
-SortPreservingMergeExec: [string_col@1 ASC NULLS LAST,int_col@0 ASC NULLS LAST]
---ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet]]}, projection=[int_col, string_col], output_ordering=[string_col@1 ASC NULLS LAST, int_col@0 ASC NULLS LAST]
+01)SortPreservingMergeExec: [string_col@1 ASC NULLS LAST,int_col@0 ASC NULLS LAST]
+02)--ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet]]}, projection=[int_col, string_col], output_ordering=[string_col@1 ASC NULLS LAST, int_col@0 ASC NULLS LAST]

From f3a69e53ff771637092e1caddb16e3b4f127a561 Mon Sep 17 00:00:00 2001
From: suremarc <8771538+suremarc@users.noreply.github.com>
Date: Thu, 25 Apr 2024 02:26:13 -0500
Subject: [PATCH 35/49] fix last test

---
 .../core/src/datasource/physical_plan/file_scan_config.rs | 2 +-
 datafusion/core/src/datasource/physical_plan/mod.rs       | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
index 1a5055df793a..365e4658b0f1 100644
--- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
+++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
@@ -999,7 +999,7 @@ mod tests {
                 File::new("2", "2023-01-02", vec![None]),
             ],
             sort: vec![col("value").sort(true, false)],
-            expected_result: Err("construct min/max statistics for split_groups_by_statistics\ncaused by\ncollect min/max values\ncaused by\nError during planning: statistics not found"),
+            expected_result: Err("construct min/max statistics for split_groups_by_statistics\ncaused by\ncollect min/max values\ncaused by\nget min/max for column: 'value'\ncaused by\nError during planning: statistics not found"),
         },
     ];
 
diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs
index 691b86836026..f50493d8c856 100644
--- a/datafusion/core/src/datasource/physical_plan/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/mod.rs
@@ -593,7 +593,11 @@ impl MinMaxStatistics {
         let (min_values, max_values): (Vec<_>, Vec<_>) = sort_columns
             .iter()
             .map(|c| {
+                // Reverse the projection to get the index of the column in the full statistics
+                // The file statistics contain _every_ column, but the sort column's index()
+                // refers to the index in projected_schema
                 let i = projection.map(|p| p[c.index()]).unwrap_or(c.index());
+
                 let (min, max) = get_min_max(i).map_err(|e| {
                     e.context(format!("get min/max for column: '{}'", c.name()))
                 })?;

From 1030b30b24e17b8d0792208a8c150b2f14130c97 Mon Sep 17 00:00:00 2001
From: suremarc <8771538+suremarc@users.noreply.github.com>
Date: Thu, 25 Apr 2024 02:31:43 -0500
Subject: [PATCH 36/49] comment on params

---
 datafusion/core/src/datasource/physical_plan/mod.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs
index f50493d8c856..94e972773609 100644
--- a/datafusion/core/src/datasource/physical_plan/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/mod.rs
@@ -527,9 +527,9 @@ impl MinMaxStatistics {
     }
 
     fn new_from_files<'a>(
-        projected_sort_order: &[PhysicalSortExpr],
-        projected_schema: &SchemaRef,
-        projection: Option<&[usize]>,
+        projected_sort_order: &[PhysicalSortExpr], // Sort order with respect to projected schema
+        projected_schema: &SchemaRef,              // Projected schema
+        projection: Option<&[usize]>, // Indices of projection in full table schema (None = all columns)
         files: impl IntoIterator<Item = &'a PartitionedFile>,
     ) -> Result<Self> {
         use datafusion_common::ScalarValue;

From 2f34684e85df9ad1ec04a153a59cf572029db390 Mon Sep 17 00:00:00 2001
From: suremarc <8771538+suremarc@users.noreply.github.com>
Date: Thu, 25 Apr 2024 02:43:09 -0500
Subject: [PATCH 37/49] clippy

---
 datafusion/core/src/datasource/physical_plan/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs
index 94e972773609..32283ef3f281 100644
--- a/datafusion/core/src/datasource/physical_plan/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/mod.rs
@@ -678,7 +678,7 @@ impl MinMaxStatistics {
         let [min, max] = [new_min_cols, new_max_cols].map(|cols| {
             let values = RecordBatch::try_new(
                 min_values.schema(),
-                cols.into_iter().map(Arc::clone).collect(),
+                cols.into_iter().cloned().collect(),
             )?;
             let sorting_columns = sort_order
                 .iter()

From 24c0bc5c9dfb932ca424c4a695ceecda82b57466 Mon Sep 17 00:00:00 2001
From: suremarc <8771538+suremarc@users.noreply.github.com>
Date: Thu, 25 Apr 2024 02:52:10 -0500
Subject: [PATCH 38/49] revert parquet.slt

---
 datafusion/core/src/datasource/physical_plan/mod.rs | 3 ++-
 datafusion/sqllogictest/test_files/parquet.slt      | 8 +++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs
index 32283ef3f281..141d045b65a2 100644
--- a/datafusion/core/src/datasource/physical_plan/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/mod.rs
@@ -492,7 +492,8 @@ fn get_projected_output_ordering(
                 Ok(statistics) => statistics,
                 Err(e) => {
                     log::trace!("Error fetching statistics for file group: {e}");
-                    return false;
+                    // we can't prove that it's ordered, so we have to reject it
+                    return true;
                 }
             };
 
diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt
index a83fc0274a57..1b25406d8172 100644
--- a/datafusion/sqllogictest/test_files/parquet.slt
+++ b/datafusion/sqllogictest/test_files/parquet.slt
@@ -132,8 +132,8 @@ STORED AS PARQUET;
 ----
 3
 
-# Check output plan again, expect an "output_ordering" clause in the physical_plan -> ParquetExec:
-# After https://github.com/apache/arrow-datafusion/pull/9593 this should not require a sort.
+# Check output plan again, expect no "output_ordering" clause in the physical_plan -> ParquetExec,
+# due to there being more files than partitions:
 query TT
 EXPLAIN SELECT int_col, string_col
 FROM test_table
@@ -144,7 +144,9 @@ logical_plan
 02)--TableScan: test_table projection=[int_col, string_col]
 physical_plan
 01)SortPreservingMergeExec: [string_col@1 ASC NULLS LAST,int_col@0 ASC NULLS LAST]
-02)--ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet]]}, projection=[int_col, string_col], output_ordering=[string_col@1 ASC NULLS LAST, int_col@0 ASC NULLS LAST]
+02)--SortExec: expr=[string_col@1 ASC NULLS LAST,int_col@0 ASC NULLS LAST]
+03)----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet]]}, projection=[int_col, string_col]
+
 
 # Perform queries using MIN and MAX
 query I

From 61f883ff558fa26d4719d0e79fde268e8fc17492 Mon Sep 17 00:00:00 2001
From: suremarc <8771538+suremarc@users.noreply.github.com>
Date: Thu, 25 Apr 2024 23:29:49 -0500
Subject: [PATCH 39/49] no need to pass projection separately

---
 .../core/src/datasource/physical_plan/file_scan_config.rs | 7 ++-----
 datafusion/core/src/datasource/physical_plan/mod.rs       | 3 +--
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
index 365e4658b0f1..d19f240f21bd 100644
--- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
+++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
@@ -142,11 +142,8 @@ impl FileScanConfig {
             Schema::new(table_fields).with_metadata(self.file_schema.metadata().clone()),
         );
 
-        let projected_output_ordering = get_projected_output_ordering(
-            self,
-            &projected_schema,
-            self.projection.as_deref(),
-        );
+        let projected_output_ordering =
+            get_projected_output_ordering(self, &projected_schema);
 
         (projected_schema, table_stats, projected_output_ordering)
     }
diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs
index 141d045b65a2..5e6e02e2b290 100644
--- a/datafusion/core/src/datasource/physical_plan/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/mod.rs
@@ -448,7 +448,6 @@ impl From<ObjectMeta> for FileMeta {
 fn get_projected_output_ordering(
     base_config: &FileScanConfig,
     projected_schema: &SchemaRef,
-    projection: Option<&[usize]>,
 ) -> Vec<Vec<PhysicalSortExpr>> {
     let mut all_orderings = vec![];
     for output_ordering in &base_config.output_ordering {
@@ -486,7 +485,7 @@ fn get_projected_output_ordering(
             let statistics = match MinMaxStatistics::new_from_files(
                 &new_ordering,
                 projected_schema,
-                projection,
+                base_config.projection.as_deref(),
                 group,
             ) {

From 9bc29cf7b4e5838a084ba41b4cd766c77e6541a7 Mon Sep 17 00:00:00 2001
From: Matthew Cramerus <8771538+suremarc@users.noreply.github.com>
Date: Tue, 30 Apr 2024 15:28:14 -0500
Subject: [PATCH 40/49] Update datafusion/core/src/datasource/listing/mod.rs

Co-authored-by: Nga Tran
---
 datafusion/core/src/datasource/listing/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/core/src/datasource/listing/mod.rs b/datafusion/core/src/datasource/listing/mod.rs
index 798e91dbb819..73dc6ae0ba61 100644
--- a/datafusion/core/src/datasource/listing/mod.rs
+++ b/datafusion/core/src/datasource/listing/mod.rs
@@ -69,7 +69,7 @@ pub struct PartitionedFile {
     pub range: Option<FileRange>,
     /// Optional statistics that describe the data in this file if known.
     ///
-    /// DataFusion relies on these statistics for planning so if they are incorrect
+    /// DataFusion relies on these statistics for planning so if they are incorrect,
     /// incorrect answers may result.
     pub statistics: Option<Statistics>,
     /// An optional field for user defined per object metadata

From aa8943323618e5761cd0cd361839c007a4572a81 Mon Sep 17 00:00:00 2001
From: suremarc <8771538+suremarc@users.noreply.github.com>
Date: Tue, 30 Apr 2024 15:30:02 -0500
Subject: [PATCH 41/49] update comment on in

---
 datafusion/core/src/datasource/listing/mod.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datafusion/core/src/datasource/listing/mod.rs b/datafusion/core/src/datasource/listing/mod.rs
index 73dc6ae0ba61..d0361d7b32c1 100644
--- a/datafusion/core/src/datasource/listing/mod.rs
+++ b/datafusion/core/src/datasource/listing/mod.rs
@@ -69,8 +69,8 @@ pub struct PartitionedFile {
     pub range: Option<FileRange>,
     /// Optional statistics that describe the data in this file if known.
     ///
-    /// DataFusion relies on these statistics for planning so if they are incorrect,
-    /// incorrect answers may result.
+    /// DataFusion relies on these statistics for planning (in particular to sort file groups),
+    /// so if they are incorrect, incorrect answers may result.
     pub statistics: Option<Statistics>,
     /// An optional field for user defined per object metadata

From d7fc78a47bc503fe10ed114dc9f395f0a3f3273e Mon Sep 17 00:00:00 2001
From: suremarc <8771538+suremarc@users.noreply.github.com>
Date: Tue, 30 Apr 2024 15:50:11 -0500
Subject: [PATCH 42/49] fix test?
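The constant-column test's expected plan changes here: after rebasing, SortExec
in EXPLAIN output appears to also display a preserve_partitioning flag, so the
expected line gains preserve_partitioning=[true].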
--- .../sqllogictest/test_files/parquet_sorted_statistics.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt index 93a3a4e56a22..76a7e5bd0df8 100644 --- a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt +++ b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt @@ -254,5 +254,5 @@ logical_plan 02)--TableScan: test_table projection=[constant_col] physical_plan 01)SortPreservingMergeExec: [constant_col@0 ASC NULLS LAST] -02)--SortExec: expr=[constant_col@0 ASC NULLS LAST] +02)--SortExec: expr=[constant_col@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[constant_col] From f3a69e53ff771637092e1caddb16e3b4f127a561 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Tue, 30 Apr 2024 15:59:09 -0500 Subject: [PATCH 43/49] un-fix? --- .../sqllogictest/test_files/parquet_sorted_statistics.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt index 76a7e5bd0df8..93a3a4e56a22 100644 --- a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt +++ b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt @@ -254,5 +254,5 @@ logical_plan 02)--TableScan: test_table projection=[constant_col] physical_plan 01)SortPreservingMergeExec: [constant_col@0 ASC NULLS LAST] -02)--SortExec: expr=[constant_col@0 ASC NULLS LAST], preserve_partitioning=[true] +02)--SortExec: expr=[constant_col@0 ASC NULLS LAST] 03)----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[constant_col] From 1a010b768e6c9280e063d293a963fbc19ed882a0 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Tue, 30 Apr 2024 16:14:54 -0500 Subject: [PATCH 44/49] add fix back in? 
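Re-apply the preserve_partitioning=[true] expectation from two commits ago; the
intervening revert seems to have matched an older main that did not yet print
the flag.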
---
 .../sqllogictest/test_files/parquet_sorted_statistics.slt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt
index 93a3a4e56a22..76a7e5bd0df8 100644
--- a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt
+++ b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt
@@ -254,5 +254,5 @@ logical_plan
 02)--TableScan: test_table projection=[constant_col]
 physical_plan
 01)SortPreservingMergeExec: [constant_col@0 ASC NULLS LAST]
-02)--SortExec: expr=[constant_col@0 ASC NULLS LAST]
+02)--SortExec: expr=[constant_col@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[constant_col]

From f41d1c92b76a253246bd93e4f5193f289e835d1c Mon Sep 17 00:00:00 2001
From: suremarc <8771538+suremarc@users.noreply.github.com>
Date: Wed, 1 May 2024 01:24:46 -0500
Subject: [PATCH 45/49] move indices_sorted_by_min to MinMaxStatistics

---
 .../core/src/datasource/physical_plan/file_scan_config.rs | 8 +-------
 datafusion/core/src/datasource/physical_plan/mod.rs       | 8 ++++++++
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
index d19f240f21bd..f7170288af58 100644
--- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
+++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
@@ -231,13 +231,7 @@ impl FileScanConfig {
             e.context("construct min/max statistics for split_groups_by_statistics")
         })?;
 
-        let indices_sorted_by_min = {
-            let mut sort: Vec<_> =
-                statistics.min_by_sort_order.iter().enumerate().collect();
-            sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b));
-            sort
-        };
-
+        let indices_sorted_by_min = statistics.min_values_sorted();
         let mut file_groups_indices: Vec<Vec<usize>> = vec![];
 
         for (idx, min) in indices_sorted_by_min {
diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs
index 5e6e02e2b290..eae3fb091bc2 100644
--- a/datafusion/core/src/datasource/physical_plan/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/mod.rs
@@ -65,6 +65,7 @@ use arrow::{
     compute::{can_cast_types, cast, SortColumn},
     datatypes::{DataType, Schema, SchemaRef},
     record_batch::{RecordBatch, RecordBatchOptions},
+    row::Row,
 };
 use datafusion_common::{plan_err, DataFusionError};
 use datafusion_physical_expr::expressions::Column;
@@ -722,6 +723,13 @@ impl MinMaxStatistics {
         })
     }
 
+    // Return a sorted list of the min statistics together with the original indices
+    fn min_values_sorted(&self) -> Vec<(usize, Row<'_>)> {
+        let mut sort: Vec<_> = self.min_by_sort_order.iter().enumerate().collect();
+        sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b));
+        sort
+    }
+
     fn is_sorted(&self) -> bool {
         self.max_by_sort_order
             .iter()

From 15e1339e6d41236e27515a5e35554a6ae59e9f89 Mon Sep 17 00:00:00 2001
From: suremarc <8771538+suremarc@users.noreply.github.com>
Date: Wed, 1 May 2024 01:31:13 -0500
Subject: [PATCH 46/49] move MinMaxStatistics to its own module

---
 .../physical_plan/file_scan_config.rs         |   6 +-
 .../core/src/datasource/physical_plan/mod.rs  | 247 +---------------
 .../datasource/physical_plan/statistics.rs    | 273 ++++++++++++++++++
 3 files changed, 281 insertions(+), 245 deletions(-)
 create mode 100644 datafusion/core/src/datasource/physical_plan/statistics.rs

diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
index f7170288af58..4de7eb136f22 100644
--- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
+++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
@@ -22,7 +22,9 @@ use std::{
     borrow::Cow, collections::HashMap, fmt::Debug, marker::PhantomData, sync::Arc, vec,
 };
 
-use super::{get_projected_output_ordering, FileGroupPartitioner, MinMaxStatistics};
+use super::{
+    get_projected_output_ordering, statistics::MinMaxStatistics, FileGroupPartitioner,
+};
 use crate::datasource::{listing::PartitionedFile, object_store::ObjectStoreUrl};
 use crate::{error::Result, scalar::ScalarValue};
 
@@ -238,7 +240,7 @@ impl FileScanConfig {
             let file_group_to_insert = file_groups_indices.iter_mut().find(|group| {
                 // If our file is non-overlapping and comes _after_ the last file,
                 // it fits in this file group.
-                min > statistics.max_by_sort_order.row(
+                min > statistics.max(
                     *group
                         .last()
                         .expect("groups should be nonempty at construction"),
diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs
index eae3fb091bc2..c450774572db 100644
--- a/datafusion/core/src/datasource/physical_plan/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/mod.rs
@@ -26,6 +26,7 @@ mod file_stream;
 mod json;
 #[cfg(feature = "parquet")]
 pub mod parquet;
+mod statistics;
 
 pub(crate) use self::csv::plan_to_csv;
 pub(crate) use self::json::plan_to_json;
@@ -62,12 +63,11 @@ use crate::{
 
 use arrow::{
     array::new_null_array,
-    compute::{can_cast_types, cast, SortColumn},
+    compute::{can_cast_types, cast},
     datatypes::{DataType, Schema, SchemaRef},
     record_batch::{RecordBatch, RecordBatchOptions},
-    row::Row,
 };
-use datafusion_common::{plan_err, DataFusionError};
+use datafusion_common::plan_err;
 use datafusion_physical_expr::expressions::Column;
 use datafusion_physical_expr::PhysicalSortExpr;
@@ -483,7 +483,7 @@ fn get_projected_output_ordering(
             return false;
         }
 
-        let statistics = match MinMaxStatistics::new_from_files(
+        let statistics = match statistics::MinMaxStatistics::new_from_files(
             &new_ordering,
             projected_schema,
             base_config.projection.as_deref(),
@@ -512,245 +512,6 @@ fn get_projected_output_ordering(
     all_orderings
 }
 
-/// A normalized representation of file min/max statistics that allows for efficient sorting & comparison.
-/// The min/max values are ordered by [`Self::sort_order`].
-/// Furthermore, any columns that are reversed in the sort order have their min/max values swapped.
-pub(crate) struct MinMaxStatistics { - min_by_sort_order: arrow::row::Rows, - max_by_sort_order: arrow::row::Rows, - sort_order: Vec, -} - -impl MinMaxStatistics { - #[allow(unused)] - fn sort_order(&self) -> &[PhysicalSortExpr] { - &self.sort_order - } - - fn new_from_files<'a>( - projected_sort_order: &[PhysicalSortExpr], // Sort order with respect to projected schema - projected_schema: &SchemaRef, // Projected schema - projection: Option<&[usize]>, // Indices of projection in full table schema (None = all columns) - files: impl IntoIterator, - ) -> Result { - use datafusion_common::ScalarValue; - - let statistics_and_partition_values = files - .into_iter() - .map(|file| { - file.statistics - .as_ref() - .zip(Some(file.partition_values.as_slice())) - }) - .collect::>>() - .ok_or_else(|| { - DataFusionError::Plan("Parquet file missing statistics".to_string()) - })?; - - // Helper function to get min/max statistics for a given column of projected_schema - let get_min_max = |i: usize| -> Result<(Vec, Vec)> { - Ok(statistics_and_partition_values - .iter() - .map(|(s, pv)| { - if i < s.column_statistics.len() { - s.column_statistics[i] - .min_value - .get_value() - .cloned() - .zip(s.column_statistics[i].max_value.get_value().cloned()) - .ok_or_else(|| { - DataFusionError::Plan("statistics not found".to_string()) - }) - } else { - let partition_value = &pv[i - s.column_statistics.len()]; - Ok((partition_value.clone(), partition_value.clone())) - } - }) - .collect::>>()? - .into_iter() - .unzip()) - }; - - let sort_columns = sort_columns_from_physical_sort_exprs(projected_sort_order) - .ok_or(DataFusionError::Plan( - "sort expression must be on column".to_string(), - ))?; - - // Project the schema & sort order down to just the relevant columns - let min_max_schema = Arc::new( - projected_schema - .project(&(sort_columns.iter().map(|c| c.index()).collect::>()))?, - ); - let min_max_sort_order = sort_columns - .iter() - .zip(projected_sort_order.iter()) - .enumerate() - .map(|(i, (col, sort))| PhysicalSortExpr { - expr: Arc::new(Column::new(col.name(), i)), - options: sort.options, - }) - .collect::>(); - - let (min_values, max_values): (Vec<_>, Vec<_>) = sort_columns - .iter() - .map(|c| { - // Reverse the projection to get the index of the column in the full statistics - // The file statistics contains _every_ column , but the sort column's index() - // refers to the index in projected_schema - let i = projection.map(|p| p[c.index()]).unwrap_or(c.index()); - - let (min, max) = get_min_max(i).map_err(|e| { - e.context(format!("get min/max for column: '{}'", c.name())) - })?; - Ok(( - ScalarValue::iter_to_array(min)?, - ScalarValue::iter_to_array(max)?, - )) - }) - .collect::>>() - .map_err(|e| e.context("collect min/max values"))? 
-            .into_iter()
-            .unzip();
-
-        Self::new(
-            &min_max_sort_order,
-            &min_max_schema,
-            RecordBatch::try_new(Arc::clone(&min_max_schema), min_values).map_err(
-                |e| {
-                    DataFusionError::ArrowError(e, Some("\ncreate min batch".to_string()))
-                },
-            )?,
-            RecordBatch::try_new(Arc::clone(&min_max_schema), max_values).map_err(
-                |e| {
-                    DataFusionError::ArrowError(e, Some("\ncreate max batch".to_string()))
-                },
-            )?,
-        )
-    }
-
-    fn new(
-        sort_order: &[PhysicalSortExpr],
-        schema: &SchemaRef,
-        min_values: RecordBatch,
-        max_values: RecordBatch,
-    ) -> Result<Self> {
-        use arrow::row::*;
-
-        let sort_fields = sort_order
-            .iter()
-            .map(|expr| {
-                expr.expr
-                    .data_type(schema)
-                    .map(|data_type| SortField::new_with_options(data_type, expr.options))
-            })
-            .collect::<Result<Vec<_>>>()
-            .map_err(|e| e.context("create sort fields"))?;
-        let converter = RowConverter::new(sort_fields)?;
-
-        let sort_columns = sort_columns_from_physical_sort_exprs(sort_order).ok_or(
-            DataFusionError::Plan("sort expression must be on column".to_string()),
-        )?;
-
-        // swap min/max if they're reversed in the ordering
-        let (new_min_cols, new_max_cols): (Vec<_>, Vec<_>) = sort_order
-            .iter()
-            .zip(sort_columns.iter().copied())
-            .map(|(sort_expr, column)| {
-                if sort_expr.options.descending {
-                    max_values
-                        .column_by_name(column.name())
-                        .zip(min_values.column_by_name(column.name()))
-                } else {
-                    min_values
-                        .column_by_name(column.name())
-                        .zip(max_values.column_by_name(column.name()))
-                }
-                .ok_or_else(|| {
-                    DataFusionError::Plan(format!(
-                        "missing column in MinMaxStatistics::new: '{}'",
-                        column.name()
-                    ))
-                })
-            })
-            .collect::<Result<Vec<_>>>()?
-            .into_iter()
-            .unzip();
-
-        let [min, max] = [new_min_cols, new_max_cols].map(|cols| {
-            let values = RecordBatch::try_new(
-                min_values.schema(),
-                cols.into_iter().cloned().collect(),
-            )?;
-            let sorting_columns = sort_order
-                .iter()
-                .zip(sort_columns.iter().copied())
-                .map(|(sort_expr, column)| {
-                    let schema = values.schema();
-
-                    let idx = schema.index_of(column.name())?;
-                    let field = schema.field(idx);
-
-                    // check that sort columns are non-nullable
-                    if field.is_nullable() {
-                        return Err(DataFusionError::Plan(
-                            "cannot sort by nullable column".to_string(),
-                        ));
-                    }
-
-                    Ok(SortColumn {
-                        values: Arc::clone(values.column(idx)),
-                        options: Some(sort_expr.options),
-                    })
-                })
-                .collect::<Result<Vec<_>>>()
-                .map_err(|e| e.context("create sorting columns"))?;
-            converter
-                .convert_columns(
-                    &sorting_columns
-                        .into_iter()
-                        .map(|c| c.values)
-                        .collect::<Vec<_>>(),
-                )
-                .map_err(|e| {
-                    DataFusionError::ArrowError(e, Some("convert columns".to_string()))
-                })
-        });
-
-        Ok(Self {
-            min_by_sort_order: min.map_err(|e| e.context("build min rows"))?,
-            max_by_sort_order: max.map_err(|e| e.context("build max rows"))?,
-            sort_order: sort_order.to_vec(),
-        })
-    }
-
-    // Return a sorted list of the min statistics together with the original indices
-    fn min_values_sorted(&self) -> Vec<(usize, Row<'_>)> {
-        let mut sort: Vec<_> = self.min_by_sort_order.iter().enumerate().collect();
-        sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b));
-        sort
-    }
-
-    fn is_sorted(&self) -> bool {
-        self.max_by_sort_order
-            .iter()
-            .zip(self.min_by_sort_order.iter().skip(1))
-            .all(|(max, next_min)| max < next_min)
-    }
-}
-
-fn sort_columns_from_physical_sort_exprs(
-    sort_order: &[PhysicalSortExpr],
-) -> Option<Vec<&Column>> {
-    sort_order
-        .iter()
-        .map(|expr| {
-            expr.expr
-                .as_any()
-                .downcast_ref::<Column>()
-        })
-        .collect::<Option<Vec<_>>>()
-}
-
 /// Represents the possible outcomes of a range calculation.
 ///
 /// This enum is used to encapsulate the result of calculating the range of
diff --git a/datafusion/core/src/datasource/physical_plan/statistics.rs b/datafusion/core/src/datasource/physical_plan/statistics.rs
new file mode 100644
index 000000000000..159f9250a932
--- /dev/null
+++ b/datafusion/core/src/datasource/physical_plan/statistics.rs
@@ -0,0 +1,273 @@
+/*
+
+Use statistics to optimize physical planning.
+
+Currently, this module houses code to sort file groups if they are non-overlapping with
+respect to the required sort order. See [`MinMaxStatistics`]
+
+*/
+
+use std::sync::Arc;
+
+use arrow::{
+    compute::SortColumn,
+    row::{Row, Rows},
+};
+use arrow_array::RecordBatch;
+use arrow_schema::SchemaRef;
+use datafusion_common::{DataFusionError, Result};
+use datafusion_physical_expr::{expressions::Column, PhysicalSortExpr};
+
+use crate::datasource::listing::PartitionedFile;
+
+/// A normalized representation of file min/max statistics that allows for efficient sorting & comparison.
+/// The min/max values are ordered by [`Self::sort_order`].
+/// Furthermore, any columns that are reversed in the sort order have their min/max values swapped.
+pub(crate) struct MinMaxStatistics {
+    min_by_sort_order: Rows,
+    max_by_sort_order: Rows,
+    sort_order: Vec<PhysicalSortExpr>,
+}
+
+impl MinMaxStatistics {
+    /// Sort order used to sort the statistics
+    #[allow(unused)]
+    pub fn sort_order(&self) -> &[PhysicalSortExpr] {
+        &self.sort_order
+    }
+
+    /// Min value at index
+    #[allow(unused)]
+    pub fn min(&self, idx: usize) -> Row {
+        self.min_by_sort_order.row(idx)
+    }
+
+    /// Max value at index
+    pub fn max(&self, idx: usize) -> Row {
+        self.max_by_sort_order.row(idx)
+    }
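+
+    // Illustration only, not part of the patched file: these accessors are
+    // what file_scan_config.rs uses when packing file groups, e.g. a file
+    // fits at the end of an existing group when
+    //
+    //     min > statistics.max(*group.last().expect("groups should be nonempty at construction"))
+    //
+    // Both sides are byte-comparable arrow `Row`s, so `>` agrees with the
+    // declared sort order.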
+
+    pub fn new_from_files<'a>(
+        projected_sort_order: &[PhysicalSortExpr], // Sort order with respect to projected schema
+        projected_schema: &SchemaRef,              // Projected schema
+        projection: Option<&[usize]>, // Indices of projection in full table schema (None = all columns)
+        files: impl IntoIterator<Item = &'a PartitionedFile>,
+    ) -> Result<Self> {
+        use datafusion_common::ScalarValue;
+
+        let statistics_and_partition_values = files
+            .into_iter()
+            .map(|file| {
+                file.statistics
+                    .as_ref()
+                    .zip(Some(file.partition_values.as_slice()))
+            })
+            .collect::<Option<Vec<_>>>()
+            .ok_or_else(|| {
+                DataFusionError::Plan("Parquet file missing statistics".to_string())
+            })?;
+
+        // Helper function to get min/max statistics for a given column of projected_schema
+        let get_min_max = |i: usize| -> Result<(Vec<ScalarValue>, Vec<ScalarValue>)> {
+            Ok(statistics_and_partition_values
+                .iter()
+                .map(|(s, pv)| {
+                    if i < s.column_statistics.len() {
+                        s.column_statistics[i]
+                            .min_value
+                            .get_value()
+                            .cloned()
+                            .zip(s.column_statistics[i].max_value.get_value().cloned())
+                            .ok_or_else(|| {
+                                DataFusionError::Plan("statistics not found".to_string())
+                            })
+                    } else {
+                        let partition_value = &pv[i - s.column_statistics.len()];
+                        Ok((partition_value.clone(), partition_value.clone()))
+                    }
+                })
+                .collect::<Result<Vec<_>>>()?
+                .into_iter()
+                .unzip())
+        };
+
+        let sort_columns = sort_columns_from_physical_sort_exprs(projected_sort_order)
+            .ok_or(DataFusionError::Plan(
+                "sort expression must be on column".to_string(),
+            ))?;
+
+        // Project the schema & sort order down to just the relevant columns
+        let min_max_schema = Arc::new(
+            projected_schema
+                .project(&(sort_columns.iter().map(|c| c.index()).collect::<Vec<_>>()))?,
+        );
+        let min_max_sort_order = sort_columns
+            .iter()
+            .zip(projected_sort_order.iter())
+            .enumerate()
+            .map(|(i, (col, sort))| PhysicalSortExpr {
+                expr: Arc::new(Column::new(col.name(), i)),
+                options: sort.options,
+            })
+            .collect::<Vec<_>>();
+
+        let (min_values, max_values): (Vec<_>, Vec<_>) = sort_columns
+            .iter()
+            .map(|c| {
+                // Reverse the projection to get the index of the column in the full statistics
+                // The file statistics contain _every_ column, but the sort column's index()
+                // refers to the index in projected_schema
+                let i = projection.map(|p| p[c.index()]).unwrap_or(c.index());
+
+                let (min, max) = get_min_max(i).map_err(|e| {
+                    e.context(format!("get min/max for column: '{}'", c.name()))
+                })?;
+                Ok((
+                    ScalarValue::iter_to_array(min)?,
+                    ScalarValue::iter_to_array(max)?,
+                ))
+            })
+            .collect::<Result<Vec<_>>>()
+            .map_err(|e| e.context("collect min/max values"))?
+            .into_iter()
+            .unzip();
+
+        Self::new(
+            &min_max_sort_order,
+            &min_max_schema,
+            RecordBatch::try_new(Arc::clone(&min_max_schema), min_values).map_err(
+                |e| {
+                    DataFusionError::ArrowError(e, Some("\ncreate min batch".to_string()))
+                },
+            )?,
+            RecordBatch::try_new(Arc::clone(&min_max_schema), max_values).map_err(
+                |e| {
+                    DataFusionError::ArrowError(e, Some("\ncreate max batch".to_string()))
+                },
+            )?,
+        )
+    }
+
+    pub fn new(
+        sort_order: &[PhysicalSortExpr],
+        schema: &SchemaRef,
+        min_values: RecordBatch,
+        max_values: RecordBatch,
+    ) -> Result<Self> {
+        use arrow::row::*;
+
+        let sort_fields = sort_order
+            .iter()
+            .map(|expr| {
+                expr.expr
+                    .data_type(schema)
+                    .map(|data_type| SortField::new_with_options(data_type, expr.options))
+            })
+            .collect::<Result<Vec<_>>>()
+            .map_err(|e| e.context("create sort fields"))?;
+        let converter = RowConverter::new(sort_fields)?;
+
+        let sort_columns = sort_columns_from_physical_sort_exprs(sort_order).ok_or(
+            DataFusionError::Plan("sort expression must be on column".to_string()),
+        )?;
+
+        // swap min/max if they're reversed in the ordering
+        let (new_min_cols, new_max_cols): (Vec<_>, Vec<_>) = sort_order
+            .iter()
+            .zip(sort_columns.iter().copied())
+            .map(|(sort_expr, column)| {
+                if sort_expr.options.descending {
+                    max_values
+                        .column_by_name(column.name())
+                        .zip(min_values.column_by_name(column.name()))
+                } else {
+                    min_values
+                        .column_by_name(column.name())
+                        .zip(max_values.column_by_name(column.name()))
+                }
+                .ok_or_else(|| {
+                    DataFusionError::Plan(format!(
+                        "missing column in MinMaxStatistics::new: '{}'",
+                        column.name()
+                    ))
+                })
+            })
+            .collect::<Result<Vec<_>>>()?
+            .into_iter()
+            .unzip();
+
+        let [min, max] = [new_min_cols, new_max_cols].map(|cols| {
+            let values = RecordBatch::try_new(
+                min_values.schema(),
+                cols.into_iter().cloned().collect(),
+            )?;
+            let sorting_columns = sort_order
+                .iter()
+                .zip(sort_columns.iter().copied())
+                .map(|(sort_expr, column)| {
+                    let schema = values.schema();
+
+                    let idx = schema.index_of(column.name())?;
+                    let field = schema.field(idx);
+
+                    // check that sort columns are non-nullable
+                    if field.is_nullable() {
+                        return Err(DataFusionError::Plan(
+                            "cannot sort by nullable column".to_string(),
+                        ));
+                    }
+
+                    Ok(SortColumn {
+                        values: Arc::clone(values.column(idx)),
+                        options: Some(sort_expr.options),
+                    })
+                })
+                .collect::<Result<Vec<_>>>()
+                .map_err(|e| e.context("create sorting columns"))?;
+            converter
+                .convert_columns(
+                    &sorting_columns
+                        .into_iter()
+                        .map(|c| c.values)
+                        .collect::<Vec<_>>(),
+                )
+                .map_err(|e| {
+                    DataFusionError::ArrowError(e, Some("convert columns".to_string()))
+                })
+        });
+
+        Ok(Self {
+            min_by_sort_order: min.map_err(|e| e.context("build min rows"))?,
+            max_by_sort_order: max.map_err(|e| e.context("build max rows"))?,
+            sort_order: sort_order.to_vec(),
+        })
+    }
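+
+    // Worked illustration, not part of the patched file: with one sort key
+    // ordered DESC and files whose (min, max) are (1, 5) and (6, 9), the swap
+    // above stores the maxes in `min_by_sort_order` and the mins in
+    // `max_by_sort_order`, so under the descending Row encoding file (6, 9)
+    // still sorts strictly before file (1, 5) and the two do not overlap.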
+
+    /// Return a sorted list of the min statistics together with the original indices
+    pub fn min_values_sorted(&self) -> Vec<(usize, Row<'_>)> {
+        let mut sort: Vec<_> = self.min_by_sort_order.iter().enumerate().collect();
+        sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b));
+        sort
+    }
+
+    /// Check if the min/max statistics are in order and non-overlapping
+    pub fn is_sorted(&self) -> bool {
+        self.max_by_sort_order
+            .iter()
+            .zip(self.min_by_sort_order.iter().skip(1))
+            .all(|(max, next_min)| max < next_min)
+    }
+}
+
+fn sort_columns_from_physical_sort_exprs(
+    sort_order: &[PhysicalSortExpr],
+) -> Option<Vec<&Column>> {
+    sort_order
+        .iter()
+        .map(|expr| {
+            expr.expr
+                .as_any()
+                .downcast_ref::<Column>()
+        })
+        .collect::<Option<Vec<_>>>()
+}

From a2c9b4e44e3363eaf518c0c7c67e528866a34f6a Mon Sep 17 00:00:00 2001
From: suremarc <8771538+suremarc@users.noreply.github.com>
Date: Wed, 1 May 2024 01:40:24 -0500
Subject: [PATCH 47/49] fix license

---
 .../datasource/physical_plan/statistics.rs    | 29 +++++++++++++++----
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/statistics.rs b/datafusion/core/src/datasource/physical_plan/statistics.rs
index 159f9250a932..e1c61ec1a712 100644
--- a/datafusion/core/src/datasource/physical_plan/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/statistics.rs
@@ -1,10 +1,27 @@
-/*
-
-Use statistics to optimize physical planning.
-
-Currently, this module houses code to sort file groups if they are non-overlapping with
-respect to the required sort order. See [`MinMaxStatistics`]
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. +/*! + * + * Use statistics to optimize physical planning. + * + * Currently, this module houses code to sort file groups if they are non-overlapping with + * respect to the required sort order. See [`MinMaxStatistics`] + * */ use std::sync::Arc; From d7c9af6f06595eb7bbeadcfa91401540613f47c9 Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Wed, 1 May 2024 01:50:43 -0500 Subject: [PATCH 48/49] add feature flag --- datafusion/common/src/config.rs | 5 +++++ .../core/src/datasource/listing/table.rs | 22 +++++++++++++------ .../test_files/information_schema.slt | 2 ++ .../test_files/parquet_sorted_statistics.slt | 4 ++++ 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 30ab9a339b54..6481211d3f23 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -297,6 +297,11 @@ config_namespace! { /// Should DataFusion support recursive CTEs pub enable_recursive_ctes: bool, default = true + + /// Attempt to eliminate sorts by packing & sorting files with non-overlapping + /// statistics into the same file groups. + /// Currently experimental + pub split_file_groups_by_statistics: bool, default = false } } diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 9e4d96ca7180..4b1994a1797c 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -750,13 +750,21 @@ impl TableProvider for ListingTable { } let output_ordering = self.try_create_output_ordering()?; - match output_ordering.first().map(|output_ordering| { - FileScanConfig::split_groups_by_statistics( - &self.table_schema, - &partitioned_file_lists, - output_ordering, - ) - }) { + match state + .config_options() + .execution + .split_file_groups_by_statistics + .then(|| { + output_ordering.first().map(|output_ordering| { + FileScanConfig::split_groups_by_statistics( + &self.table_schema, + &partitioned_file_lists, + output_ordering, + ) + }) + }) + .flatten() + { Some(Err(e)) => log::debug!("failed to split file groups by statistics: {e}"), Some(Ok(new_groups)) => { if new_groups.len() <= self.options.target_partitions { diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 8f4b1a3816a3..4fb3d03fe57f 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -200,6 +200,7 @@ datafusion.execution.planning_concurrency 13 datafusion.execution.soft_max_rows_per_output_file 50000000 datafusion.execution.sort_in_place_threshold_bytes 1048576 datafusion.execution.sort_spill_reservation_bytes 10485760 +datafusion.execution.split_file_groups_by_statistics false datafusion.execution.target_partitions 7 datafusion.execution.time_zone +00:00 datafusion.explain.logical_plan_only false @@ -278,6 +279,7 @@ datafusion.execution.planning_concurrency 13 Fan-out during initial physical pla datafusion.execution.soft_max_rows_per_output_file 50000000 Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. 
There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max datafusion.execution.sort_in_place_threshold_bytes 1048576 When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. datafusion.execution.sort_spill_reservation_bytes 10485760 Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). +datafusion.execution.split_file_groups_by_statistics false Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental datafusion.execution.target_partitions 7 Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system datafusion.execution.time_zone +00:00 The default time zone Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime according to this time zone, and then extract the hour datafusion.explain.logical_plan_only false When set to true, the explain statement will only print logical plans diff --git a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt index 76a7e5bd0df8..f7a81f08456f 100644 --- a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt +++ b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt @@ -25,6 +25,10 @@ set datafusion.execution.target_partitions = 2; statement ok set datafusion.execution.collect_statistics = true; +# Enable split_file_groups_by_statistics since it's currently disabled by default +statement ok +set datafusion.execution.split_file_groups_by_statistics = true; + # Create a table as a data source statement ok CREATE TABLE src_table ( From 82166fdffb17cd115522b4cbf0ce8c158a122a7c Mon Sep 17 00:00:00 2001 From: suremarc <8771538+suremarc@users.noreply.github.com> Date: Wed, 1 May 2024 10:07:44 -0500 Subject: [PATCH 49/49] update config --- docs/source/user-guide/configs.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 3ee3778177c4..822a2891894d 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -84,6 +84,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.max_buffered_batches_per_output_file | 2 | This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption | | datafusion.execution.listing_table_ignore_subdirectory | true | Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`). 
| | datafusion.execution.enable_recursive_ctes | true | Should DataFusion support recursive CTEs | +| datafusion.execution.split_file_groups_by_statistics | false | Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental | | datafusion.optimizer.enable_distinct_aggregation_soft_limit | true | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. | | datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores | | datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible |
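
The option added in PATCH 48 can also be enabled programmatically rather than
via SQL `SET`, paired with `collect_statistics` so that listing tables have
file-level statistics to group by. A minimal sketch against the public
`SessionConfig`/`SessionContext` API (illustrative usage, not part of the
patch series):

use datafusion::prelude::*;

fn main() {
    // Gather per-file statistics when tables are registered...
    let config = SessionConfig::new()
        .set_bool("datafusion.execution.collect_statistics", true)
        // ...and opt in to the experimental grouping added above.
        .set_bool("datafusion.execution.split_file_groups_by_statistics", true);

    // Listing tables registered on this context may now pack files with
    // non-overlapping min/max statistics into the same sorted file group.
    let _ctx = SessionContext::new_with_config(config);
}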
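
For intuition, the grouping that the flag turns on
(`FileScanConfig::split_groups_by_statistics`) is a first-fit pass over files
ordered by their min statistic. The sketch below mirrors that shape with plain
i64 keys standing in for the arrow Row encoding; the function and variable
names are hypothetical, chosen only for illustration:

fn split_by_min_max(files: &[(i64, i64)]) -> Vec<Vec<usize>> {
    // Visit files in ascending order of their min value, analogous to
    // MinMaxStatistics::min_values_sorted.
    let mut by_min: Vec<usize> = (0..files.len()).collect();
    by_min.sort_unstable_by_key(|&i| files[i].0);

    let mut groups: Vec<Vec<usize>> = vec![];
    for i in by_min {
        let min = files[i].0;
        // First-fit: join the first group whose last file ends strictly before
        // this file begins, so each group stays sorted and non-overlapping.
        match groups
            .iter_mut()
            .find(|g| files[*g.last().expect("groups are nonempty")].1 < min)
        {
            Some(group) => group.push(i),
            None => groups.push(vec![i]),
        }
    }
    groups
}

With files = [(0, 10), (5, 15), (11, 20)] this returns [[0, 2], [1]]: files 0
and 2 share a group that can be scanned without a sort, while the overlapping
file 1 starts its own group. The real implementation compares arrow Rows built
by MinMaxStatistics, which is what lets multi-column and descending sort
orders reuse the same comparison.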