Fix compute_record_batch_statistics wrong with projection #8489
@@ -30,6 +30,7 @@ use crate::{ColumnStatistics, ExecutionPlan, Statistics};
 use arrow::datatypes::Schema;
 use arrow::ipc::writer::{FileWriter, IpcWriteOptions};
 use arrow::record_batch::RecordBatch;
+use arrow_array::Array;
 use datafusion_common::stats::Precision;
 use datafusion_common::{plan_err, DataFusionError, Result};
 use datafusion_execution::memory_pool::MemoryReservation;

@@ -139,17 +140,22 @@ pub fn compute_record_batch_statistics(
 ) -> Statistics {
     let nb_rows = batches.iter().flatten().map(RecordBatch::num_rows).sum();

-    let total_byte_size = batches
-        .iter()
-        .flatten()
-        .map(|b| b.get_array_memory_size())
-        .sum();
-
     let projection = match projection {
         Some(p) => p,
         None => (0..schema.fields().len()).collect(),
     };

+    let total_byte_size = batches
+        .iter()
+        .flatten()
+        .map(|b| {
+            projection
+                .iter()
+                .map(|index| b.column(*index).get_array_memory_size())
+                .sum::<usize>()
+        })
+        .sum();
+
     let mut column_statistics = vec![ColumnStatistics::new_unknown(); projection.len()];

     for partition in batches.iter() {

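To make the intent of the hunk above concrete, here is a small self-contained sketch of the projected byte-size accounting. It is an illustration, not the PR's code verbatim: the helper name `projected_byte_size` and the two-column example schema are made up for the example, but the iteration pattern mirrors the new `total_byte_size` computation.

use std::sync::Arc;

use arrow::array::{Array, Float64Array, Int32Array};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::ArrowError;
use arrow::record_batch::RecordBatch;

/// Sum the memory used by only the projected columns of every batch
/// (hypothetical helper mirroring the new `total_byte_size` logic).
fn projected_byte_size(batches: &[Vec<RecordBatch>], projection: &[usize]) -> usize {
    batches
        .iter()
        .flatten()
        .map(|b| {
            projection
                .iter()
                .map(|index| b.column(*index).get_array_memory_size())
                .sum::<usize>()
        })
        .sum()
}

fn main() -> Result<(), ArrowError> {
    let schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Int32, false),
        Field::new("b", DataType::Float64, false),
    ]));
    let batch = RecordBatch::try_new(
        Arc::clone(&schema),
        vec![
            Arc::new(Int32Array::from(vec![1, 2, 3])),
            Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])),
        ],
    )?;

    // Only column 0 ("a") is counted; column 1's buffers are excluded,
    // which is exactly why a projected Statistics reports a smaller size.
    let size = projected_byte_size(&[vec![batch]], &[0]);
    println!("projected bytes: {size}");
    Ok(())
}
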
@@ -388,6 +394,7 @@ mod tests {
         datatypes::{DataType, Field, Schema},
         record_batch::RecordBatch,
     };
+    use arrow_array::UInt64Array;
    use datafusion_expr::Operator;
    use datafusion_physical_expr::expressions::{col, Column};

@@ -685,20 +692,30 @@ mod tests {
         let schema = Arc::new(Schema::new(vec![
             Field::new("f32", DataType::Float32, false),
             Field::new("f64", DataType::Float64, false),
+            Field::new("u64", DataType::UInt64, false),
         ]));
         let batch = RecordBatch::try_new(
             Arc::clone(&schema),
             vec![
                 Arc::new(Float32Array::from(vec![1., 2., 3.])),
                 Arc::new(Float64Array::from(vec![9., 8., 7.])),
+                Arc::new(UInt64Array::from(vec![4, 5, 6])),
             ],
         )?;

+        // just select f32,f64
+        let select_projection = Some(vec![0, 1]);
+        let byte_size = batch
+            .project(&select_projection.clone().unwrap())
+            .unwrap()
+            .get_array_memory_size();
+
         let actual =
-            compute_record_batch_statistics(&[vec![batch]], &schema, Some(vec![0, 1]));
+            compute_record_batch_statistics(&[vec![batch]], &schema, select_projection);

-        let mut expected = Statistics {
+        let expected = Statistics {
             num_rows: Precision::Exact(3),
-            total_byte_size: Precision::Exact(464), // this might change a bit if the way we compute the size changes
+            total_byte_size: Precision::Exact(byte_size),

Review discussion on this line:

- "I'm not sure if this is appropriate; if you have any good suggestions, please leave a message."
- "I think this is ok and a nice way to make the code less brittle to future changes in arrow's layout."
- "I'm curious as to why the previous code was 464."
- "I think it happens to be the (current) size of the record batch in the test:"

      let batch = RecordBatch::try_new(
          Arc::clone(&schema),
          vec![
              Arc::new(Float32Array::from(vec![1., 2., 3.])),
              Arc::new(Float64Array::from(vec![9., 8., 7.])),
              Arc::new(UInt64Array::from(vec![4, 5, 6])),
          ],
      )?;

             column_statistics: vec![
                 ColumnStatistics {
                     distinct_count: Precision::Absent,
@@ -715,9 +732,6 @@ mod tests {
             ],
         };

-        // Prevent test flakiness due to undefined / changing implementation details
-        expected.total_byte_size = actual.total_byte_size.clone();
-
         assert_eq!(actual, expected);
         Ok(())
     }

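The review thread above asks where the hard-coded 464 came from; it was simply the then-current in-memory size of the three-column test batch. As a rough illustration (not part of the PR), the following standalone sketch shows the relationship the updated test relies on: `RecordBatch::get_array_memory_size` for the full batch versus for a projection of it. The exact numbers depend on arrow's buffer layout and may change between versions, which is why the test now computes the expected value instead of hard-coding it.

use std::sync::Arc;

use arrow::array::{Float32Array, Float64Array, UInt64Array};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::ArrowError;
use arrow::record_batch::RecordBatch;

fn main() -> Result<(), ArrowError> {
    let schema = Arc::new(Schema::new(vec![
        Field::new("f32", DataType::Float32, false),
        Field::new("f64", DataType::Float64, false),
        Field::new("u64", DataType::UInt64, false),
    ]));
    let batch = RecordBatch::try_new(
        Arc::clone(&schema),
        vec![
            Arc::new(Float32Array::from(vec![1., 2., 3.])),
            Arc::new(Float64Array::from(vec![9., 8., 7.])),
            Arc::new(UInt64Array::from(vec![4, 5, 6])),
        ],
    )?;

    // Memory accounted for by all three columns.
    let full = batch.get_array_memory_size();
    // Memory accounted for by only the projected columns (f32, f64),
    // i.e. the value the updated test uses as the expected total_byte_size.
    let projected = batch.project(&[0, 1])?.get_array_memory_size();

    // The projection excludes the u64 column's buffers, so it is smaller.
    assert!(projected < full);
    println!("full = {full} bytes, projected = {projected} bytes");
    Ok(())
}
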
@@ -2021,14 +2021,15 @@ SortPreservingMergeExec: [col0@0 ASC NULLS LAST]
 ----------RepartitionExec: partitioning=Hash([col0@0, col1@1, col2@2], 4), input_partitions=4
 ------------AggregateExec: mode=Partial, gby=[col0@0 as col0, col1@1 as col1, col2@2 as col2], aggr=[LAST_VALUE(r.col1)], ordering_mode=PartiallySorted([0])
 --------------SortExec: expr=[col0@3 ASC NULLS LAST]
-----------------CoalesceBatchesExec: target_batch_size=8192
-------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(col0@0, col0@0)]
---------------------CoalesceBatchesExec: target_batch_size=8192
-----------------------RepartitionExec: partitioning=Hash([col0@0], 4), input_partitions=1
-------------------------MemoryExec: partitions=1, partition_sizes=[3]
---------------------CoalesceBatchesExec: target_batch_size=8192
-----------------------RepartitionExec: partitioning=Hash([col0@0], 4), input_partitions=1
-------------------------MemoryExec: partitions=1, partition_sizes=[3]
+----------------ProjectionExec: expr=[col0@2 as col0, col1@3 as col1, col2@4 as col2, col0@0 as col0, col1@1 as col1]
+------------------CoalesceBatchesExec: target_batch_size=8192
+--------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(col0@0, col0@0)]
+----------------------CoalesceBatchesExec: target_batch_size=8192
+------------------------RepartitionExec: partitioning=Hash([col0@0], 4), input_partitions=1
+--------------------------MemoryExec: partitions=1, partition_sizes=[3]
+----------------------CoalesceBatchesExec: target_batch_size=8192
+------------------------RepartitionExec: partitioning=Hash([col0@0], 4), input_partitions=1
+--------------------------MemoryExec: partitions=1, partition_sizes=[3]

 # Columns in the table are a,b,c,d. Source is CsvExec which is ordered by
 # a,b,c column. Column a has cardinality 2, column b has cardinality 4.

Review comment on the added ProjectionExec line: "Looks to me like the change is due to the fact that the join inputs were reordered, and this projection puts the columns back in the expected order. The same applies to the projection below."

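To see concretely what "a projection puts the columns back in the expected order" means, here is a small arrow-only sketch (an illustration, not DataFusion's planner code; the column names and data are invented): if a join-like operator emits its right-side columns first, a ProjectionExec-style reordering on top can restore the column order downstream operators expect, using `RecordBatch::project`.

use std::sync::Arc;

use arrow::array::Int32Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::ArrowError;
use arrow::record_batch::RecordBatch;

fn main() -> Result<(), ArrowError> {
    // Pretend a join emitted its inputs in swapped order: right columns first.
    let schema = Arc::new(Schema::new(vec![
        Field::new("r_col0", DataType::Int32, false),
        Field::new("r_col1", DataType::Int32, false),
        Field::new("l_col0", DataType::Int32, false),
        Field::new("l_col1", DataType::Int32, false),
    ]));
    let joined = RecordBatch::try_new(
        Arc::clone(&schema),
        vec![
            Arc::new(Int32Array::from(vec![10, 20])),
            Arc::new(Int32Array::from(vec![11, 21])),
            Arc::new(Int32Array::from(vec![1, 2])),
            Arc::new(Int32Array::from(vec![3, 4])),
        ],
    )?;

    // A projection on top restores the expected order: left columns first.
    let reordered = joined.project(&[2, 3, 0, 1])?;
    assert_eq!(reordered.schema().field(0).name().as_str(), "l_col0");
    Ok(())
}
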
@@ -2709,9 +2710,9 @@ SortExec: expr=[sn@2 ASC NULLS LAST]
 --ProjectionExec: expr=[zip_code@1 as zip_code, country@2 as country, sn@0 as sn, ts@3 as ts, currency@4 as currency, LAST_VALUE(e.amount) ORDER BY [e.sn ASC NULLS LAST]@5 as last_rate]
 ----AggregateExec: mode=Single, gby=[sn@2 as sn, zip_code@0 as zip_code, country@1 as country, ts@3 as ts, currency@4 as currency], aggr=[LAST_VALUE(e.amount)]
 ------SortExec: expr=[sn@5 ASC NULLS LAST]
---------ProjectionExec: expr=[zip_code@0 as zip_code, country@1 as country, sn@2 as sn, ts@3 as ts, currency@4 as currency, sn@5 as sn, amount@8 as amount]
+--------ProjectionExec: expr=[zip_code@4 as zip_code, country@5 as country, sn@6 as sn, ts@7 as ts, currency@8 as currency, sn@0 as sn, amount@3 as amount]
 ----------CoalesceBatchesExec: target_batch_size=8192
------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(currency@4, currency@2)], filter=ts@0 >= ts@1
+------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(currency@2, currency@4)], filter=ts@0 >= ts@1
 --------------MemoryExec: partitions=1, partition_sizes=[1]
 --------------MemoryExec: partitions=1, partition_sizes=[1]

Review comment: 👍