From 71be2e7378f55a6e85889957554801bac981b61c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Thu, 23 Mar 2023 22:53:19 +0000 Subject: [PATCH] Deprecate Array::data_ref (#3880) --- arrow-array/src/array/binary_array.rs | 7 +- .../src/array/fixed_size_binary_array.rs | 8 +- arrow-array/src/array/mod.rs | 27 +++--- arrow-cast/src/cast.rs | 24 ++---- arrow-ord/src/comparison.rs | 83 +++---------------- arrow-select/src/take.rs | 26 +++--- arrow-select/src/window.rs | 2 +- arrow/tests/array_validation.rs | 29 +------ 8 files changed, 62 insertions(+), 144 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index b965279fb796..e63819505c16 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -47,13 +47,14 @@ impl GenericBinaryArray { } fn from_list(v: GenericListArray) -> Self { + let v = v.into_data(); assert_eq!( - v.data_ref().child_data().len(), + v.child_data().len(), 1, "BinaryArray can only be created from list array of u8 values \ (i.e. List>)." ); - let child_data = &v.data_ref().child_data()[0]; + let child_data = &v.child_data()[0]; assert_eq!( child_data.child_data().len(), @@ -75,7 +76,7 @@ impl GenericBinaryArray { let builder = ArrayData::builder(Self::DATA_TYPE) .len(v.len()) .offset(v.offset()) - .add_buffer(v.data_ref().buffers()[0].clone()) + .add_buffer(v.buffers()[0].clone()) .add_buffer(child_data.buffers()[0].slice(child_data.offset())) .nulls(v.nulls().cloned()); diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index af51ff787722..fd243396facf 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -396,13 +396,15 @@ impl From for ArrayData { /// Creates a `FixedSizeBinaryArray` from `FixedSizeList` array impl From for FixedSizeBinaryArray { fn from(v: FixedSizeListArray) -> Self { + let value_len = v.value_length(); + let v = v.into_data(); assert_eq!( - v.data_ref().child_data().len(), + v.child_data().len(), 1, "FixedSizeBinaryArray can only be created from list array of u8 values \ (i.e. FixedSizeList>)." ); - let child_data = &v.data_ref().child_data()[0]; + let child_data = &v.child_data()[0]; assert_eq!( child_data.child_data().len(), @@ -421,7 +423,7 @@ impl From for FixedSizeBinaryArray { "The child array cannot contain null values." ); - let builder = ArrayData::builder(DataType::FixedSizeBinary(v.value_length())) + let builder = ArrayData::builder(DataType::FixedSizeBinary(value_len)) .len(v.len()) .offset(v.offset()) .add_buffer(child_data.buffers()[0].slice(child_data.offset())) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 9afefc07f8d4..aa98e4b5ace6 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -110,6 +110,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// Returns a reference-counted pointer to the underlying data of this array. /// /// This will be deprecated in a future release [(#3880)](https://github.com/apache/arrow-rs/issues/3880) + #[deprecated(note = "Use Array::to_data or Array::into_data")] fn data_ref(&self) -> &ArrayData { self.data() } @@ -126,6 +127,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// /// assert_eq!(*array.data_type(), DataType::Int32); /// ``` + #[allow(deprecated)] // (#3880) fn data_type(&self) -> &DataType { self.data_ref().data_type() } @@ -156,6 +158,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// /// assert_eq!(array.len(), 5); /// ``` + #[allow(deprecated)] // (#3880) fn len(&self) -> usize { self.data_ref().len() } @@ -171,6 +174,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// /// assert_eq!(array.is_empty(), false); /// ``` + #[allow(deprecated)] // (#3880) fn is_empty(&self) -> bool { self.data_ref().is_empty() } @@ -191,6 +195,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// assert_eq!(array.offset(), 0); /// assert_eq!(array_slice.offset(), 1); /// ``` + #[allow(deprecated)] // (#3880) fn offset(&self) -> usize { self.data_ref().offset() } @@ -250,6 +255,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the total number of bytes of memory pointed to by this array. /// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map. + #[allow(deprecated)] // (#3880) fn get_buffer_memory_size(&self) -> usize { self.data_ref().get_buffer_memory_size() } @@ -257,6 +263,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the total number of bytes of memory occupied physically by this array. /// This value will always be greater than returned by `get_buffer_memory_size()` and /// includes the overhead of the data structures that contain the pointers to the various buffers. + #[allow(deprecated)] // (#3880) fn get_array_memory_size(&self) -> usize { // both data.get_array_memory_size and size_of_val(self) include ArrayData fields, // to only count additional fields of this array substract size_of(ArrayData) @@ -286,6 +293,7 @@ impl Array for ArrayRef { self.data().clone() } + #[allow(deprecated)] fn data_ref(&self) -> &ArrayData { self.as_ref().data_ref() } @@ -352,6 +360,7 @@ impl<'a, T: Array> Array for &'a T { self.data().clone() } + #[allow(deprecated)] fn data_ref(&self) -> &ArrayData { T::data_ref(self) } @@ -997,19 +1006,17 @@ mod tests { (0..256).map(|i| (i % values.len()) as i16), ); - let dict_data = ArrayData::builder(DataType::Dictionary( + let dict_data_type = DataType::Dictionary( Box::new(keys.data_type().clone()), Box::new(values.data_type().clone()), - )) - .len(keys.len()) - .buffers(keys.data_ref().buffers().to_vec()) - .child_data(vec![ArrayData::builder(DataType::Int64) - .len(values.len()) - .buffers(values.data_ref().buffers().to_vec()) + ); + let dict_data = keys + .into_data() + .into_builder() + .data_type(dict_data_type) + .child_data(vec![values.into_data()]) .build() - .unwrap()]) - .build() - .unwrap(); + .unwrap(); let empty_data = ArrayData::new_empty(&DataType::Dictionary( Box::new(DataType::Int16), diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index ba909649da3a..806ff8771573 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -3448,9 +3448,9 @@ where OffsetSizeFrom: OffsetSizeTrait + ToPrimitive, OffsetSizeTo: OffsetSizeTrait + NumCast, { - let data = array.data_ref(); + let list = array.as_list::(); // the value data stored by the list - let value_data = data.child_data()[0].clone(); + let values = list.values(); let out_dtype = match array.data_type() { DataType::List(value_type) => { @@ -3473,7 +3473,7 @@ where std::mem::size_of::(), std::mem::size_of::() ); - if value_data.len() > i32::MAX as usize { + if values.len() > i32::MAX as usize { return Err(ArrowError::ComputeError( "LargeList too large to cast to List".into(), )); @@ -3484,14 +3484,7 @@ where _ => unreachable!(), }; - // Safety: - // The first buffer is the offsets and they are aligned to OffSetSizeFrom: (i64 or i32) - // Justification: - // The safe variant data.buffer:: take the offset into account and we - // cannot create a list array with offsets starting at non zero. - let offsets = unsafe { data.buffers()[0].as_slice().align_to::() }.1; - - let iter = offsets.iter().map(|idx| { + let iter = list.value_offsets().iter().map(|idx| { let idx: OffsetSizeTo = NumCast::from(*idx).unwrap(); idx }); @@ -3502,14 +3495,13 @@ where // wrap up let builder = ArrayData::builder(out_dtype) - .offset(array.offset()) - .len(array.len()) + .len(list.len()) .add_buffer(offset_buffer) - .add_child_data(value_data) - .nulls(data.nulls().cloned()); + .add_child_data(values.to_data()) + .nulls(list.nulls().cloned()); let array_data = unsafe { builder.build_unchecked() }; - Ok(make_array(array_data)) + Ok(Arc::new(GenericListArray::::from(array_data))) } #[cfg(test)] diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 0f9414378c4a..7cf41c2f5cab 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -26,8 +26,7 @@ use arrow_array::cast::*; use arrow_array::types::*; use arrow_array::*; -use arrow_buffer::{bit_util, Buffer, MutableBuffer}; -use arrow_data::bit_mask::combine_option_bitmap; +use arrow_buffer::{bit_util, BooleanBuffer, Buffer, MutableBuffer, NullBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; use arrow_select::take::take; @@ -1214,8 +1213,7 @@ where )); } - let null_bit_buffer = - combine_option_bitmap(&[left.data_ref(), right.data_ref()], len); + let nulls = NullBuffer::union(left.nulls(), right.nulls()); // we process the data in chunks so that each iteration results in one u64 of comparison result bits const CHUNK_SIZE: usize = 64; @@ -1276,18 +1274,8 @@ where result_remainder.copy_from_slice(remainder_mask_as_bytes); } - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - len, - None, - null_bit_buffer, - 0, - vec![result.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) + let values = BooleanBuffer::new(result.into(), 0, len); + Ok(BooleanArray::new(values, nulls)) } /// Helper function to perform boolean lambda function on values from an array and a scalar value using @@ -2709,19 +2697,13 @@ where let num_bytes = bit_util::ceil(left_len, 8); - let not_both_null_bit_buffer = - match combine_option_bitmap(&[left.data_ref(), right.data_ref()], left_len) { - Some(buff) => buff, - None => new_all_set_buffer(num_bytes), - }; - let not_both_null_bitmap = not_both_null_bit_buffer.as_slice(); - + let nulls = NullBuffer::union(left.nulls(), right.nulls()); let mut bool_buf = MutableBuffer::from_len_zeroed(num_bytes); let bool_slice = bool_buf.as_slice_mut(); // if both array slots are valid, check if list contains primitive for i in 0..left_len { - if bit_util::get_bit(not_both_null_bitmap, i) { + if nulls.as_ref().map(|n| n.is_valid(i)).unwrap_or(true) { let list = right.value(i); let list = list.as_any().downcast_ref::>().unwrap(); @@ -2734,18 +2716,8 @@ where } } - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - None, - 0, - vec![bool_buf.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) + let values = BooleanBuffer::new(bool_buf.into(), 0, left_len); + Ok(BooleanArray::new(values, None)) } /// Checks if a [`GenericListArray`] contains a value in the [`GenericStringArray`] @@ -2766,24 +2738,15 @@ where let num_bytes = bit_util::ceil(left_len, 8); - let not_both_null_bit_buffer = - match combine_option_bitmap(&[left.data_ref(), right.data_ref()], left_len) { - Some(buff) => buff, - None => new_all_set_buffer(num_bytes), - }; - let not_both_null_bitmap = not_both_null_bit_buffer.as_slice(); - + let nulls = NullBuffer::union(left.nulls(), right.nulls()); let mut bool_buf = MutableBuffer::from_len_zeroed(num_bytes); let bool_slice = &mut bool_buf; for i in 0..left_len { // contains(null, null) = false - if bit_util::get_bit(not_both_null_bitmap, i) { + if nulls.as_ref().map(|n| n.is_valid(i)).unwrap_or(true) { let list = right.value(i); - let list = list - .as_any() - .downcast_ref::>() - .unwrap(); + let list = list.as_string::(); for j in 0..list.len() { if list.is_valid(j) && (left.value(i) == list.value(j)) { @@ -2793,28 +2756,8 @@ where } } } - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - None, - 0, - vec![bool_buf.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) -} - -// create a buffer and fill it with valid bits -#[inline] -fn new_all_set_buffer(len: usize) -> Buffer { - let buffer = MutableBuffer::new(len); - let buffer = buffer.with_bitset(len, true); - - buffer.into() + let values = BooleanBuffer::new(bool_buf.into(), 0, left_len); + Ok(BooleanArray::new(values, None)) } // disable wrapping inside literal vectors used for test data and assertions diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 83b58519fdb8..0f1b5202adfa 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -735,13 +735,13 @@ where IndexType: ArrowPrimitiveType, IndexType::Native: ToPrimitive, { - let data_ref = values.data_ref(); + let nulls = values.nulls(); let array_iter = indices .values() .iter() .map(|idx| { let idx = maybe_usize::(*idx)?; - if data_ref.is_valid(idx) { + if nulls.map(|n| n.is_valid(idx)).unwrap_or(true) { Ok(Some(values.value(idx))) } else { Ok(None) @@ -768,20 +768,14 @@ where I::Native: ToPrimitive, { let new_keys = take_primitive::(values.keys(), indices)?; - let new_keys_data = new_keys.data_ref(); - - let data = unsafe { - ArrayData::new_unchecked( - values.data_type().clone(), - new_keys.len(), - Some(new_keys_data.null_count()), - new_keys_data.nulls().map(|b| b.inner().sliced()), - 0, - new_keys_data.buffers().to_vec(), - values.data().child_data().to_vec(), - ) - }; - + let builder = new_keys + .into_data() + .into_builder() + .data_type(values.data_type().clone()) + .child_data(vec![values.values().to_data()]); + + // Safety: Indices were valid before + let data = unsafe { builder.build_unchecked() }; Ok(DictionaryArray::::from(data)) } diff --git a/arrow-select/src/window.rs b/arrow-select/src/window.rs index 70ac86857db2..2ad51561c69b 100644 --- a/arrow-select/src/window.rs +++ b/arrow-select/src/window.rs @@ -55,7 +55,7 @@ use num::abs; pub fn shift(array: &dyn Array, offset: i64) -> Result { let value_len = array.len() as i64; if offset == 0 { - Ok(make_array(array.data_ref().clone())) + Ok(make_array(array.to_data())) } else if offset == i64::MIN || abs(offset) >= value_len { Ok(new_null_array(array.data_type(), array.len())) } else { diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index 7e45ee7afcda..c17817945dd3 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -16,9 +16,8 @@ // under the License. use arrow::array::{ - make_array, Array, BooleanBuilder, Decimal128Builder, FixedSizeListBuilder, - Int32Array, Int32Builder, Int64Array, StringArray, StructBuilder, UInt64Array, - UInt8Builder, + make_array, Array, BooleanBuilder, Decimal128Builder, Int32Array, Int32Builder, + Int64Array, StringArray, StructBuilder, UInt64Array, }; use arrow_array::Decimal128Array; use arrow_buffer::{ArrowNativeType, Buffer}; @@ -996,28 +995,8 @@ fn test_string_data_from_foreign() { #[test] fn test_decimal_full_validation() { - let values_builder = UInt8Builder::with_capacity(10); - let byte_width = 16; - let mut fixed_size_builder = FixedSizeListBuilder::new(values_builder, byte_width); - let value_as_bytes = 123456_i128.to_le_bytes(); - fixed_size_builder - .values() - .append_slice(value_as_bytes.as_slice()); - fixed_size_builder.append(true); - let fixed_size_array = fixed_size_builder.finish(); - - // Build ArrayData for Decimal - let builder = ArrayData::builder(DataType::Decimal128(5, 3)) - .len(fixed_size_array.len()) - .add_buffer(fixed_size_array.data_ref().child_data()[0].buffers()[0].clone()); - let array_data = unsafe { builder.build_unchecked() }; - array_data.validate_full().unwrap(); - - let array = Decimal128Array::from(array_data); - let error = array - .validate_decimal_precision(array.precision()) - .unwrap_err(); - + let array = Decimal128Array::from(vec![123456_i128]); + let error = array.validate_decimal_precision(5).unwrap_err(); assert_eq!( "Invalid argument error: 123456 is too large to store in a Decimal128 of precision 5. Max is 99999", error.to_string()