From d0f2159091f9a465aceece203d6470ebe71a8edf Mon Sep 17 00:00:00 2001 From: slo Date: Mon, 9 Jan 2023 23:08:52 +0000 Subject: [PATCH 1/4] Add a function to get memory size of array slice --- arrow-data/src/data.rs | 89 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 918ecae847a9..aa4c18bb8953 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -463,6 +463,59 @@ impl ArrayData { size } + pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> { + let mut result: usize = 0; + let layout = layout(&self.data_type); + + for spec in layout.buffers.iter() { + match spec { + BufferSpec::FixedWidth { byte_width } => { + let buffer_size = self + .len + .checked_mul(*byte_width) + .expect("integer overflow computing buffer size"); + result += buffer_size; + } + BufferSpec::VariableWidth => { + let buffer_len: usize; + match self.data_type { + DataType::Utf8 | DataType::Binary => { + let offsets = self.typed_offsets::<i32>()?; + buffer_len = (offsets[self.len] - offsets[0] ) as usize; + } + DataType::LargeUtf8 | DataType::LargeBinary => { + let offsets = self.typed_offsets::<i64>()?; + buffer_len = (offsets[self.len] - offsets[0]) as usize; + } + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid data type for VariableWidth buffer. Expected Utg8, LargeUtf8, Binary or LargeBinary. Got {}", + self.data_type + ))) + } + }; + result += buffer_len; + } + BufferSpec::BitMap => { + let buffer_size = bit_util::ceil(self.len, 8); + result += buffer_size; + } + BufferSpec::AlwaysNull => { + // Nothing to do + } + } + } + + if self.null_bitmap().is_some() { + result += bit_util::ceil(self.len, 8); + } + + for child in &self.child_data { + result += child.get_slice_memory_size()?; + } + Ok(result) + } + /// Returns the total number of bytes of memory occupied physically by this [ArrayData]. 
pub fn get_array_memory_size(&self) -> usize { let mut size = mem::size_of_val(self); @@ -1838,6 +1891,42 @@ mod tests { assert!(!string_data_slice.ptr_eq(&string_data)) } + #[test] + fn test_slice_memory_size() { + let mut bit_v: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut bit_v, 0); + bit_util::set_bit(&mut bit_v, 3); + bit_util::set_bit(&mut bit_v, 10); + let data = ArrayData::builder(DataType::Int32) + .len(16) + .add_buffer(make_i32_buffer(16)) + .null_bit_buffer(Some(Buffer::from(bit_v))) + .build() + .unwrap(); + let new_data = data.slice(1, 14); + assert_eq!( + data.get_slice_memory_size().unwrap() - 8, + new_data.get_slice_memory_size().unwrap() + ); + let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]); + let string_data = ArrayData::try_new( + DataType::Utf8, + 3, + Some(Buffer::from_iter(vec![true, false, true])), + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + let string_data_slice = string_data.slice(1, 2); + //4 bytes of offset and 2 bytes of data reduced by slicing. + assert_eq!( + string_data.get_slice_memory_size().unwrap() - 6, + string_data_slice.get_slice_memory_size().unwrap() + ); + } + #[test] fn test_count_nulls() { let null_buffer = Some(Buffer::from(vec![0b00010110, 0b10011111])); From 9f6dae149c021ff7bac19ede9be0cdbe5404d55e Mon Sep 17 00:00:00 2001 From: slo Date: Mon, 9 Jan 2023 23:16:41 +0000 Subject: [PATCH 2/4] typo fix --- arrow-data/src/data.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index aa4c18bb8953..768fd46ed43b 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -489,7 +489,7 @@ impl ArrayData { } _ => { return Err(ArrowError::InvalidArgumentError(format!( - "Invalid data type for VariableWidth buffer. Expected Utg8, LargeUtf8, Binary or LargeBinary. Got {}", + "Invalid data type for VariableWidth buffer. 
Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}", self.data_type ))) } From 9d53dab62dd8715fd5b2f2d35c116ce86dc97ed1 Mon Sep 17 00:00:00 2001 From: slo Date: Mon, 9 Jan 2023 23:55:58 +0000 Subject: [PATCH 3/4] PR comments --- arrow-data/src/data.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 768fd46ed43b..a5ea1253a55f 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -463,6 +463,7 @@ impl ArrayData { size } + /// Returns the total number of the bytes of memory occupied by the buffers by this slice of [ArrayData] pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> { let mut result: usize = 0; let layout = layout(&self.data_type); @@ -470,10 +471,12 @@ impl ArrayData { for spec in layout.buffers.iter() { match spec { BufferSpec::FixedWidth { byte_width } => { - let buffer_size = self - .len - .checked_mul(*byte_width) - .expect("integer overflow computing buffer size"); + let buffer_size = + self.len.checked_mul(*byte_width).ok_or_else(|| { + ArrowError::InvalidArgumentError( + "Integer overflow computing buffer size".to_string(), + ) + })?; result += buffer_size; } BufferSpec::VariableWidth => { From a7f8dbd3e442d1de1654a8a3142b218ce6860b31 Mon Sep 17 00:00:00 2001 From: slo Date: Tue, 10 Jan 2023 00:00:16 +0000 Subject: [PATCH 4/4] Fix error types --- arrow-data/src/data.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index a5ea1253a55f..31dad5e82668 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -473,7 +473,7 @@ impl ArrayData { BufferSpec::FixedWidth { byte_width } => { let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| { - ArrowError::InvalidArgumentError( + ArrowError::ComputeError( "Integer overflow computing buffer size".to_string(), ) })?; @@ -491,7 +491,7 @@ impl ArrayData { buffer_len = (offsets[self.len] - offsets[0]) as usize; } _ => { - 
return Err(ArrowError::InvalidArgumentError(format!( + return Err(ArrowError::NotYetImplemented(format!( "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}", self.data_type )))