From 64d81811d4007779955de12a7fc6fa34ea19e2f6 Mon Sep 17 00:00:00 2001 From: Michael Maletich Date: Mon, 25 Nov 2024 23:31:53 -0600 Subject: [PATCH] fix: Encoding of List offsets was incorrect when slice offsets begin with zero When encoding offsets the code had an optimization to reuse the offsets if the first offset was zero assuming the slice already pointed to the first element. But the offset can also be zero if all previous lists were empty. When this occurred it would mark all lists in the slice as empty, even if they shouldn't be. --- arrow-ipc/src/writer.rs | 47 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index d0a78ca2702e..d414e08b5b6c 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1309,7 +1309,7 @@ fn reencode_offsets( let end_offset = offset_slice.last().unwrap(); let offsets = match start_offset.as_usize() { - 0 => offsets.clone(), + 0 => offset_slice.iter().copied().collect(), _ => offset_slice.iter().map(|x| *x - *start_offset).collect(), }; @@ -2396,6 +2396,38 @@ mod tests { ls.finish() } + fn generate_nested_list_data_starting_at_zero() -> GenericListArray { + let mut ls = + GenericListBuilder::::new(GenericListBuilder::::new(UInt32Builder::new())); + + + for _i in 0..999 { + ls.values().append(true); + ls.append(true); + } + + for j in 0..10 { + for value in [j, j, j, j] { + ls.values().values().append_value(value); + } + ls.values().append(true) + } + ls.append(true); + + + for i in 0..9_000 { + for j in 0..10 { + for value in [i+j, i+j, i+j, i+j] { + ls.values().values().append_value(value); + } + ls.values().append(true) + } + ls.append(true); + } + + ls.finish() + } + fn generate_map_array_data() -> MapArray { let keys_builder = UInt32Builder::new(); let values_builder = UInt32Builder::new(); @@ -2487,6 +2519,19 @@ mod tests { roundtrip_ensure_sliced_smaller(in_batch, 1000); } + #[test] + fn 
encode_nested_lists_starting_at_zero() { + let inner_int = Arc::new(Field::new("item", DataType::UInt32, true)); + let inner_list_field = Arc::new(Field::new("item", DataType::List(inner_int), true)); + let list_field = Field::new("val", DataType::List(inner_list_field), true); + let schema = Arc::new(Schema::new(vec![list_field])); + + let values = Arc::new(generate_nested_list_data_starting_at_zero::()); + + let in_batch = RecordBatch::try_new(schema, vec![values]).unwrap(); + roundtrip_ensure_sliced_smaller(in_batch, 1000); + } + #[test] fn encode_map_array() { let keys = Arc::new(Field::new("keys", DataType::UInt32, false));