From e6731b549f7649c52243ac23db744bbbe4c11bcb Mon Sep 17 00:00:00 2001 From: Michael Maletich Date: Mon, 25 Nov 2024 17:15:04 -0600 Subject: [PATCH 1/4] fix: Encoding of List offsets was incorrect when slice offsets begin with zero When encoding offsets the code had an optimization to reuse the offsets if the first offset was zero, assuming the slice already pointed to the first element. But the offset can also be zero if all previous lists were empty. When this occurred it would mark all lists in the slice as empty, even if they shouldn't be. --- arrow-ipc/src/writer.rs | 47 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index e6fc9d81df67..158349891836 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1429,8 +1429,8 @@ fn reencode_offsets( let start_offset = offset_slice.first().unwrap(); let end_offset = offset_slice.last().unwrap(); - let offsets = match start_offset.as_usize() { - 0 => offsets.clone(), + let offsets: Buffer = match start_offset.as_usize() { + 0 => offset_slice.iter().copied().collect(), _ => offset_slice.iter().map(|x| *x - *start_offset).collect(), }; @@ -2517,6 +2517,36 @@ mod tests { ls.finish() } + fn generate_nested_list_data_starting_at_zero() -> GenericListArray { + let mut ls = + GenericListBuilder::::new(GenericListBuilder::::new(UInt32Builder::new())); + + for _i in 0..999 { + ls.values().append(true); + ls.append(true); + } + + for j in 0..10 { + for value in [j, j, j, j] { + ls.values().values().append_value(value); + } + ls.values().append(true) + } + ls.append(true); + + for i in 0..9_000 { + for j in 0..10 { + for value in [i + j, i + j, i + j, i + j] { + ls.values().values().append_value(value); + } + ls.values().append(true) + } + ls.append(true); + } + + ls.finish() + } + fn generate_map_array_data() -> MapArray { let keys_builder = UInt32Builder::new(); let values_builder = UInt32Builder::new(); @@ -2608,6 
+2638,19 @@ mod tests { roundtrip_ensure_sliced_smaller(in_batch, 1000); } + #[test] + fn encode_nested_lists_starting_at_zero() { + let inner_int = Arc::new(Field::new("item", DataType::UInt32, true)); + let inner_list_field = Arc::new(Field::new("item", DataType::List(inner_int), true)); + let list_field = Field::new("val", DataType::List(inner_list_field), true); + let schema = Arc::new(Schema::new(vec![list_field])); + + let values = Arc::new(generate_nested_list_data_starting_at_zero::()); + + let in_batch = RecordBatch::try_new(schema, vec![values]).unwrap(); + roundtrip_ensure_sliced_smaller(in_batch, 1); + } + #[test] fn encode_map_array() { let keys = Arc::new(Field::new("keys", DataType::UInt32, false)); From 7959fac720e926680490f1f72f18feba2ed2cf4a Mon Sep 17 00:00:00 2001 From: Michael Maletich Date: Tue, 26 Nov 2024 20:43:19 -0600 Subject: [PATCH 2/4] Use Buffer::from_slice_ref which will be faster as it doesn't iterate through the slice. --- arrow-ipc/src/writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 158349891836..a7ea2225323b 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1430,7 +1430,7 @@ fn reencode_offsets( let end_offset = offset_slice.last().unwrap(); let offsets: Buffer = match start_offset.as_usize() { - 0 => offset_slice.iter().copied().collect(), + 0 => Buffer::from_slice_ref(offset_slice), _ => offset_slice.iter().map(|x| *x - *start_offset).collect(), }; From 88da5f1a4b1def6a24bc8df86003462f7290eae1 Mon Sep 17 00:00:00 2001 From: Michael Maletich Date: Wed, 27 Nov 2024 07:29:48 -0600 Subject: [PATCH 3/4] Avoid copying --- arrow-ipc/src/writer.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index a7ea2225323b..9649851f2793 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1429,8 +1429,14 @@ fn reencode_offsets( let start_offset = 
offset_slice.first().unwrap(); let end_offset = offset_slice.last().unwrap(); - let offsets: Buffer = match start_offset.as_usize() { - 0 => Buffer::from_slice_ref(offset_slice), + let offsets = match start_offset.as_usize() { + 0 => { + let size = size_of::(); + offsets.slice_with_length( + data.offset() * size, + (data.offset() + data.len() + 1) * size, + ) + } _ => offset_slice.iter().map(|x| *x - *start_offset).collect(), }; From 2381a598959e4f9197155fd599f632ba7803be8c Mon Sep 17 00:00:00 2001 From: Michael Maletich Date: Fri, 29 Nov 2024 08:48:07 -0600 Subject: [PATCH 4/4] Explicitly reference std::mem::size_of --- arrow-ipc/src/writer.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 9649851f2793..05a60664201f 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -23,6 +23,7 @@ use std::cmp::min; use std::collections::HashMap; use std::io::{BufWriter, Write}; +use std::mem::size_of; use std::sync::Arc; use flatbuffers::FlatBufferBuilder;