Skip to content

Commit

Permalink
fix: Encoding of List offsets was incorrect when slice offsets begin …
Browse files Browse the repository at this point in the history
…with zero (#6805) (#6943)

* fix: Encoding of List offsets was incorrect when slice offsets begin with zero

When encoding offsets the code had an optimization to reuse the offsets if the first offset was zero assuming the slice already pointed
 to first element. But the offset can also be zero if all previous lists were empty. When this occured it mold make all lists in the
slice as empty, even if they shouldn't be.

* Use Buffer::from_slice_ref which will be faster as it doesn't iterate through the slice.

* Avoid copying

* Explicitly reference std::mem::size_of

Co-authored-by: Michael Maletich <[email protected]>
  • Loading branch information
alamb and HawaiianSpork authored Jan 8, 2025
1 parent 3a93166 commit 955180b
Showing 1 changed file with 51 additions and 1 deletion.
52 changes: 51 additions & 1 deletion arrow-ipc/src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
use std::cmp::min;
use std::collections::HashMap;
use std::io::{BufWriter, Write};
use std::mem::size_of;
use std::sync::Arc;

use flatbuffers::FlatBufferBuilder;
Expand Down Expand Up @@ -1430,7 +1431,13 @@ fn reencode_offsets<O: OffsetSizeTrait>(
let end_offset = offset_slice.last().unwrap();

let offsets = match start_offset.as_usize() {
0 => offsets.clone(),
0 => {
let size = size_of::<O>();
offsets.slice_with_length(
data.offset() * size,
(data.offset() + data.len() + 1) * size,
)
}
_ => offset_slice.iter().map(|x| *x - *start_offset).collect(),
};

Expand Down Expand Up @@ -2517,6 +2524,36 @@ mod tests {
ls.finish()
}

fn generate_nested_list_data_starting_at_zero<O: OffsetSizeTrait>() -> GenericListArray<O> {
let mut ls =
GenericListBuilder::<O, _>::new(GenericListBuilder::<O, _>::new(UInt32Builder::new()));

for _i in 0..999 {
ls.values().append(true);
ls.append(true);
}

for j in 0..10 {
for value in [j, j, j, j] {
ls.values().values().append_value(value);
}
ls.values().append(true)
}
ls.append(true);

for i in 0..9_000 {
for j in 0..10 {
for value in [i + j, i + j, i + j, i + j] {
ls.values().values().append_value(value);
}
ls.values().append(true)
}
ls.append(true);
}

ls.finish()
}

fn generate_map_array_data() -> MapArray {
let keys_builder = UInt32Builder::new();
let values_builder = UInt32Builder::new();
Expand Down Expand Up @@ -2608,6 +2645,19 @@ mod tests {
roundtrip_ensure_sliced_smaller(in_batch, 1000);
}

#[test]
fn encode_nested_lists_starting_at_zero() {
let inner_int = Arc::new(Field::new("item", DataType::UInt32, true));
let inner_list_field = Arc::new(Field::new("item", DataType::List(inner_int), true));
let list_field = Field::new("val", DataType::List(inner_list_field), true);
let schema = Arc::new(Schema::new(vec![list_field]));

let values = Arc::new(generate_nested_list_data_starting_at_zero::<i32>());

let in_batch = RecordBatch::try_new(schema, vec![values]).unwrap();
roundtrip_ensure_sliced_smaller(in_batch, 1);
}

#[test]
fn encode_map_array() {
let keys = Arc::new(Field::new("keys", DataType::UInt32, false));
Expand Down

0 comments on commit 955180b

Please sign in to comment.