diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 4b0422cb8ff..be7e5f86a04 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -264,8 +264,12 @@ impl ArrayData { offset, buffers, child_data, + align_buffers: false, + // SAFETY: caller responsible for ensuring data is valid + skip_validation: true, } - .build_unchecked() + .build() + .unwrap() } /// Create a new ArrayData, validating that the provided buffers form a valid @@ -1775,7 +1779,7 @@ impl PartialEq for ArrayData { } } -/// Builder for `ArrayData` type +/// Builder for [`ArrayData`] type #[derive(Debug)] pub struct ArrayDataBuilder { data_type: DataType, @@ -1786,6 +1790,20 @@ pub struct ArrayDataBuilder { offset: usize, buffers: Vec, child_data: Vec, + /// Should buffers be realigned (copying if necessary)? + /// + /// Defaults to false. + align_buffers: bool, + /// Should data validation be skipped for this [`ArrayData`]? + /// + /// Defaults to false. + /// + /// # Safety + /// + /// This flag can only be set to true using `unsafe` APIs. However, once true + /// subsequent calls to `build()` may result in undefined behavior if the data + /// is not valid. + skip_validation: bool, } impl ArrayDataBuilder { @@ -1801,6 +1819,8 @@ impl ArrayDataBuilder { offset: 0, buffers: vec![], child_data: vec![], + align_buffers: false, + skip_validation: false, } } @@ -1877,66 +1897,79 @@ impl ArrayDataBuilder { /// Creates an array data, without any validation /// + /// Note: This is shorthand for `self.with_skip_validation(true).build()` + /// /// # Safety /// /// The same caveats as [`ArrayData::new_unchecked`] /// apply. - #[allow(clippy::let_and_return)] pub unsafe fn build_unchecked(self) -> ArrayData { - let data = self.build_impl(); - // Provide a force_validate mode - #[cfg(feature = "force_validate")] - data.validate_data().unwrap(); - data + self.skip_validation(true).build().unwrap() } - /// Creates an array data, without any validation, - /// but aligning buffers. + /// Creates an `ArrayData`, consuming `self` /// /// # Safety /// - /// The same caveats as [`ArrayData::new_unchecked`] apply. - pub unsafe fn build_aligned_unchecked(self) -> ArrayData { - let mut data = self.build_impl(); - data.align_buffers(); - // Provide a force_validate mode - #[cfg(feature = "force_validate")] - data.validate_data().unwrap(); - data - } + /// By default the underlying buffers are checked to ensure they are valid + /// Arrow data. However, if the [`Self::skip_validation`] flag has been set + /// to true (by the `unsafe` API) this validation is skipped. If the data is + /// not valid, undefined behavior will result. + pub fn build(self) -> Result { + let Self { + data_type, + len, + null_count, + null_bit_buffer, + nulls, + offset, + buffers, + child_data, + align_buffers, + skip_validation, + } = self; - /// Same as [`Self::build_unchecked`] but ignoring `force_validate` feature flag - unsafe fn build_impl(self) -> ArrayData { - let nulls = self - .nulls + let nulls = nulls .or_else(|| { - let buffer = self.null_bit_buffer?; - let buffer = BooleanBuffer::new(buffer, self.offset, self.len); - Some(match self.null_count { - Some(n) => NullBuffer::new_unchecked(buffer, n), + let buffer = null_bit_buffer?; + let buffer = BooleanBuffer::new(buffer, offset, len); + Some(match null_count { + Some(n) => { + // SAFETY: call to `data.validate_data()` below validates the null buffer is valid + unsafe { NullBuffer::new_unchecked(buffer, n) } + } None => NullBuffer::new(buffer), }) }) .filter(|b| b.null_count() != 0); - ArrayData { - data_type: self.data_type, - len: self.len, - offset: self.offset, - buffers: self.buffers, - child_data: self.child_data, + let mut data = ArrayData { + data_type, + len, + offset, + buffers, + child_data, nulls, + }; + + if align_buffers { + data.align_buffers(); } - } - /// Creates an array data, validating all inputs - pub fn build(self) -> Result { - let data = unsafe { self.build_impl() }; - data.validate_data()?; + // SAFETY: `skip_validation` is only set to true using `unsafe` APIs + if !skip_validation || cfg!(feature = "force_validate") { + data.validate_data()?; + } Ok(data) } /// Creates an array data, validating all inputs, and aligning any buffers + #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")] + pub fn build_aligned(self) -> Result { + self.align_buffers(true).build() + } + + /// Ensure that all buffers are aligned, copying data if necessary /// /// Rust requires that arrays are aligned to their corresponding primitive, /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`]. @@ -1945,17 +1978,33 @@ impl ArrayDataBuilder { /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`]. /// /// As this alignment is architecture specific, and not guaranteed by all arrow implementations, - /// this method is provided to automatically copy buffers to a new correctly aligned allocation + /// this flag is provided to automatically copy buffers to a new correctly aligned allocation /// when necessary, making it useful when interacting with buffers produced by other systems, /// e.g. IPC or FFI. /// - /// This is unlike `[Self::build`] which will instead return an error on encountering + /// If this flag is not enabled, `[Self::build`] return an error on encountering /// insufficiently aligned buffers. - pub fn build_aligned(self) -> Result { - let mut data = unsafe { self.build_impl() }; - data.align_buffers(); - data.validate_data()?; - Ok(data) + pub fn align_buffers(mut self, align_buffers: bool) -> Self { + self.align_buffers = align_buffers; + self + } + + /// Skips validation of the data. + /// + /// If this flag is enabled, `[Self::build`] will skip validation of the + /// data + /// + /// If this flag is not enabled, `[Self::build`] will validate that all + /// buffers are valid and will return an error if any data is invalid. + /// Validation can be expensive. + /// + /// # Safety + /// + /// If validation is skipped, the buffers must form a valid Arrow array, + /// otherwise undefined behavior will result + pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self { + self.skip_validation = skip_validation; + self } } @@ -1970,6 +2019,8 @@ impl From for ArrayDataBuilder { nulls: d.nulls, null_bit_buffer: None, null_count: None, + align_buffers: false, + skip_validation: false, } } } diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 4705db7759c..62eaab3ebde 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -240,7 +240,7 @@ unsafe fn create_array_internal( let values = create_array_helper(reader, values_field, variadic_counts)?; let run_array_length = run_node.length() as usize; - let builder = ArrayData::builder(data_type.clone()) + let array_data = ArrayData::builder(data_type.clone()) .len(run_array_length) .offset(0) .add_child_data(run_ends.into_data()) @@ -321,7 +321,7 @@ unsafe fn create_array_internal( ))); } - let builder = ArrayData::builder(data_type.clone()) + let array_data = ArrayData::builder(data_type.clone()) .len(length as usize) .offset(0); @@ -539,7 +539,7 @@ unsafe fn create_dictionary_array_internal( ) -> Result { if let Dictionary(_, _) = *data_type { let null_buffer = (field_node.null_count() > 0).then_some(buffers[0].clone()); - let builder = ArrayData::builder(data_type.clone()) + let array_data = ArrayData::builder(data_type.clone()) .len(field_node.length() as usize) .add_buffer(buffers[1].clone()) .add_child_data(value_array.into_data())