diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index e23079ef9be9..12f9aab674e8 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -21,10 +21,10 @@ use crate::iterator::ArrayIter; use crate::types::bytes::ByteArrayNativeType; use crate::types::ByteArrayType; use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait}; -use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; use arrow_buffer::{NullBuffer, OffsetBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::DataType; +use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; @@ -60,6 +60,87 @@ impl GenericByteArray { /// Data type of the array. pub const DATA_TYPE: DataType = T::DATA_TYPE; + /// Create a new [`GenericByteArray`] from the provided parts, panicking on failure + /// + /// # Panics + /// + /// Panics if [`GenericByteArray::try_new`] returns an error + pub fn new( + offsets: OffsetBuffer, + values: Buffer, + nulls: Option, + ) -> Self { + Self::try_new(offsets, values, nulls).unwrap() + } + + /// Create a new [`GenericByteArray`] from the provided parts, returning an error on failure + /// + /// # Errors + /// + /// * `offsets.len() - 1 != nulls.len()` + /// * Any consecutive pair of `offsets` does not denote a valid slice of `values` + pub fn try_new( + offsets: OffsetBuffer, + values: Buffer, + nulls: Option, + ) -> Result { + let len = offsets.len() - 1; + + // Verify that each pair of offsets is a valid slices of values + T::validate(&offsets, &values)?; + + if let Some(n) = nulls.as_ref() { + if n.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect length of null buffer for {}{}Array, expected {len} got {}", + T::Offset::PREFIX, + T::PREFIX, + n.len(), + ))); + } + } + + Ok(Self { + data_type: T::DATA_TYPE, + value_offsets: offsets, + value_data: values, + nulls, + }) + } + + /// Create a new [`GenericByteArray`] from the provided parts, without validation + /// + /// # Safety + /// + /// Safe if [`Self::try_new`] would not error + pub fn new_unchecked( + offsets: OffsetBuffer, + values: Buffer, + nulls: Option, + ) -> Self { + Self { + data_type: T::DATA_TYPE, + value_offsets: offsets, + value_data: values, + nulls, + } + } + + /// Create a new [`GenericByteArray`] of length `len` where all values are null + pub fn new_null(len: usize) -> Self { + Self { + data_type: T::DATA_TYPE, + value_offsets: OffsetBuffer::new_zeroed(len), + value_data: MutableBuffer::new(0).into(), + nulls: Some(NullBuffer::new_null(len)), + } + } + + /// Deconstruct this array into its constituent parts + pub fn into_parts(self) -> (OffsetBuffer, Buffer, Option) { + (self.value_offsets, self.value_data, self.nulls) + } + /// Returns the length for value at index `i`. /// # Panics /// Panics if index `i` is out of bounds. @@ -374,3 +455,60 @@ impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray { ArrayIter::new(self) } } + +#[cfg(test)] +mod tests { + use crate::{BinaryArray, StringArray}; + use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer}; + + #[test] + fn try_new() { + let data = Buffer::from_slice_ref("helloworld"); + let offsets = OffsetBuffer::new(vec![0, 5, 10].into()); + StringArray::new(offsets.clone(), data.clone(), None); + + let nulls = NullBuffer::new_null(3); + let err = + StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone())) + .unwrap_err(); + assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3"); + + let err = + BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err(); + assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3"); + + let non_utf8_data = Buffer::from_slice_ref(b"he\xFFloworld"); + let err = StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None) + .unwrap_err(); + assert_eq!(err.to_string(), "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2"); + + BinaryArray::new(offsets, non_utf8_data, None); + + let offsets = OffsetBuffer::new(vec![0, 5, 11].into()); + let err = StringArray::try_new(offsets.clone(), data.clone(), None).unwrap_err(); + assert_eq!( + err.to_string(), + "Invalid argument error: Offset of 11 exceeds length of values 10" + ); + + let err = BinaryArray::try_new(offsets.clone(), data, None).unwrap_err(); + assert_eq!( + err.to_string(), + "Invalid argument error: Maximum offset of 11 is larger than values of length 10" + ); + + let non_ascii_data = Buffer::from_slice_ref("heìloworld"); + StringArray::new(offsets.clone(), non_ascii_data.clone(), None); + BinaryArray::new(offsets, non_ascii_data.clone(), None); + + let offsets = OffsetBuffer::new(vec![0, 3, 10].into()); + let err = StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None) + .unwrap_err(); + assert_eq!( + err.to_string(), + "Invalid argument error: Split UTF-8 codepoint at offset 3" + ); + + BinaryArray::new(offsets, non_ascii_data, None); + } +} diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 7f6f54e4ccc3..f4e5b4b79c77 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -109,7 +109,7 @@ impl GenericListArray { if let Some(n) = nulls.as_ref() { if n.len() != len { return Err(ArrowError::InvalidArgumentError(format!( - "Incorrect number of nulls for {}ListArray, expected {len} got {}", + "Incorrect length of null buffer for {}ListArray, expected {len} got {}", OffsetSize::PREFIX, n.len(), ))); @@ -1137,7 +1137,7 @@ mod tests { assert_eq!( err.to_string(), - "Invalid argument error: Incorrect number of nulls for LargeListArray, expected 4 got 3" + "Invalid argument error: Incorrect length of null buffer for LargeListArray, expected 4 got 3" ); let field = Arc::new(Field::new("element", DataType::Int64, false)); diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index e042f29c22d1..7c4a375299db 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -16,9 +16,7 @@ // under the License. use crate::types::GenericStringType; -use crate::{ - Array, GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait, -}; +use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait}; use arrow_buffer::{bit_util, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; @@ -105,27 +103,8 @@ impl GenericStringArray { pub fn try_from_binary( v: GenericBinaryArray, ) -> Result { - let offsets = v.value_offsets(); - let values = v.value_data(); - - // We only need to validate that all values are valid UTF-8 - let validated = std::str::from_utf8(values).map_err(|e| { - ArrowError::CastError(format!("Encountered non UTF-8 data: {e}")) - })?; - - for offset in offsets.iter() { - let o = offset.as_usize(); - if !validated.is_char_boundary(o) { - return Err(ArrowError::CastError(format!( - "Split UTF-8 codepoint at offset {o}" - ))); - } - } - - let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE); - // SAFETY: - // Validated UTF-8 above - Ok(Self::from(unsafe { builder.build_unchecked() })) + let (offsets, values, nulls) = v.into_parts(); + Self::try_new(offsets, values, nulls) } } @@ -261,6 +240,7 @@ mod tests { use super::*; use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder}; use crate::types::UInt8Type; + use crate::Array; use arrow_buffer::Buffer; use arrow_schema::Field; use std::sync::Arc; diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index cec78db9fc70..b50018ca9751 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -19,7 +19,7 @@ use crate::delta::shift_months; use crate::{ArrowNativeTypeOp, OffsetSizeTrait}; -use arrow_buffer::i256; +use arrow_buffer::{i256, Buffer, OffsetBuffer}; use arrow_data::decimal::{validate_decimal256_precision, validate_decimal_precision}; use arrow_schema::{ ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, @@ -1526,10 +1526,18 @@ pub trait ByteArrayType: 'static + Send + Sync + bytes::ByteArrayTypeSealed { /// Utf8Array will have native type has &str /// BinaryArray will have type as [u8] type Native: bytes::ByteArrayNativeType + AsRef + AsRef<[u8]> + ?Sized; + /// "Binary" or "String", for use in error messages const PREFIX: &'static str; + /// Datatype of array elements const DATA_TYPE: DataType; + + /// Verifies that every consecutive pair of `offsets` denotes a valid slice of `values` + fn validate( + offsets: &OffsetBuffer, + values: &Buffer, + ) -> Result<(), ArrowError>; } /// [`ByteArrayType`] for string arrays @@ -1547,6 +1555,33 @@ impl ByteArrayType for GenericStringType { } else { DataType::Utf8 }; + + fn validate( + offsets: &OffsetBuffer, + values: &Buffer, + ) -> Result<(), ArrowError> { + // Verify that the slice as a whole is valid UTF-8 + let validated = std::str::from_utf8(values).map_err(|e| { + ArrowError::InvalidArgumentError(format!("Encountered non UTF-8 data: {e}")) + })?; + + // Verify each offset is at a valid character boundary in this UTF-8 array + for offset in offsets.iter() { + let o = offset.as_usize(); + if !validated.is_char_boundary(o) { + if o < validated.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Split UTF-8 codepoint at offset {o}" + ))); + } + return Err(ArrowError::InvalidArgumentError(format!( + "Offset of {o} exceeds length of values {}", + validated.len() + ))); + } + } + Ok(()) + } } /// An arrow utf8 array with i32 offsets @@ -1569,6 +1604,21 @@ impl ByteArrayType for GenericBinaryType { } else { DataType::Binary }; + + fn validate( + offsets: &OffsetBuffer, + values: &Buffer, + ) -> Result<(), ArrowError> { + // offsets are guaranteed to be monotonically increasing and non-empty + let max_offset = offsets.last().unwrap().as_usize(); + if values.len() < max_offset { + return Err(ArrowError::InvalidArgumentError(format!( + "Maximum offset of {max_offset} is larger than values of length {}", + values.len() + ))); + } + Ok(()) + } } /// An arrow binary array with i32 offsets