Skip to content

Commit

Permalink
Add ByteArray constructors (#3879) (#4081)
Browse files Browse the repository at this point in the history
* Add ByteArray constructors (#3879)

* Clippy

* Make ListArray error message consistent

* Review feedback
  • Loading branch information
tustvold authored Apr 19, 2023
1 parent 2f68c48 commit 1097203
Show file tree
Hide file tree
Showing 4 changed files with 197 additions and 29 deletions.
142 changes: 140 additions & 2 deletions arrow-array/src/array/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ use crate::iterator::ArrayIter;
use crate::types::bytes::ByteArrayNativeType;
use crate::types::ByteArrayType;
use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait};
use arrow_buffer::{ArrowNativeType, Buffer};
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
use arrow_buffer::{NullBuffer, OffsetBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::DataType;
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::sync::Arc;

Expand Down Expand Up @@ -60,6 +60,87 @@ impl<T: ByteArrayType> GenericByteArray<T> {
/// Data type of the array.
pub const DATA_TYPE: DataType = T::DATA_TYPE;

/// Create a new [`GenericByteArray`] from the provided parts, panicking on failure
///
/// # Panics
///
/// Panics if [`GenericByteArray::try_new`] returns an error
pub fn new(
offsets: OffsetBuffer<T::Offset>,
values: Buffer,
nulls: Option<NullBuffer>,
) -> Self {
Self::try_new(offsets, values, nulls).unwrap()
}

/// Create a new [`GenericByteArray`] from the provided parts, returning an error on failure
///
/// # Errors
///
/// * `offsets.len() - 1 != nulls.len()`
/// * Any consecutive pair of `offsets` does not denote a valid slice of `values`
pub fn try_new(
offsets: OffsetBuffer<T::Offset>,
values: Buffer,
nulls: Option<NullBuffer>,
) -> Result<Self, ArrowError> {
let len = offsets.len() - 1;

// Verify that each pair of offsets is a valid slices of values
T::validate(&offsets, &values)?;

if let Some(n) = nulls.as_ref() {
if n.len() != len {
return Err(ArrowError::InvalidArgumentError(format!(
"Incorrect length of null buffer for {}{}Array, expected {len} got {}",
T::Offset::PREFIX,
T::PREFIX,
n.len(),
)));
}
}

Ok(Self {
data_type: T::DATA_TYPE,
value_offsets: offsets,
value_data: values,
nulls,
})
}

/// Create a new [`GenericByteArray`] from the provided parts, without validation
///
/// # Safety
///
/// Safe if [`Self::try_new`] would not error
pub fn new_unchecked(
offsets: OffsetBuffer<T::Offset>,
values: Buffer,
nulls: Option<NullBuffer>,
) -> Self {
Self {
data_type: T::DATA_TYPE,
value_offsets: offsets,
value_data: values,
nulls,
}
}

/// Create a new [`GenericByteArray`] of length `len` where all values are null
pub fn new_null(len: usize) -> Self {
Self {
data_type: T::DATA_TYPE,
value_offsets: OffsetBuffer::new_zeroed(len),
value_data: MutableBuffer::new(0).into(),
nulls: Some(NullBuffer::new_null(len)),
}
}

/// Deconstruct this array into its constituent parts
pub fn into_parts(self) -> (OffsetBuffer<T::Offset>, Buffer, Option<NullBuffer>) {
(self.value_offsets, self.value_data, self.nulls)
}

/// Returns the length for value at index `i`.
/// # Panics
/// Panics if index `i` is out of bounds.
Expand Down Expand Up @@ -374,3 +455,60 @@ impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray<T> {
ArrayIter::new(self)
}
}

#[cfg(test)]
mod tests {
use crate::{BinaryArray, StringArray};
use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer};

#[test]
fn try_new() {
let data = Buffer::from_slice_ref("helloworld");
let offsets = OffsetBuffer::new(vec![0, 5, 10].into());
StringArray::new(offsets.clone(), data.clone(), None);

let nulls = NullBuffer::new_null(3);
let err =
StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone()))
.unwrap_err();
assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3");

let err =
BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err();
assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3");

let non_utf8_data = Buffer::from_slice_ref(b"he\xFFloworld");
let err = StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None)
.unwrap_err();
assert_eq!(err.to_string(), "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2");

BinaryArray::new(offsets, non_utf8_data, None);

let offsets = OffsetBuffer::new(vec![0, 5, 11].into());
let err = StringArray::try_new(offsets.clone(), data.clone(), None).unwrap_err();
assert_eq!(
err.to_string(),
"Invalid argument error: Offset of 11 exceeds length of values 10"
);

let err = BinaryArray::try_new(offsets.clone(), data, None).unwrap_err();
assert_eq!(
err.to_string(),
"Invalid argument error: Maximum offset of 11 is larger than values of length 10"
);

let non_ascii_data = Buffer::from_slice_ref("heìloworld");
StringArray::new(offsets.clone(), non_ascii_data.clone(), None);
BinaryArray::new(offsets, non_ascii_data.clone(), None);

let offsets = OffsetBuffer::new(vec![0, 3, 10].into());
let err = StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None)
.unwrap_err();
assert_eq!(
err.to_string(),
"Invalid argument error: Split UTF-8 codepoint at offset 3"
);

BinaryArray::new(offsets, non_ascii_data, None);
}
}
4 changes: 2 additions & 2 deletions arrow-array/src/array/list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
if let Some(n) = nulls.as_ref() {
if n.len() != len {
return Err(ArrowError::InvalidArgumentError(format!(
"Incorrect number of nulls for {}ListArray, expected {len} got {}",
"Incorrect length of null buffer for {}ListArray, expected {len} got {}",
OffsetSize::PREFIX,
n.len(),
)));
Expand Down Expand Up @@ -1137,7 +1137,7 @@ mod tests {

assert_eq!(
err.to_string(),
"Invalid argument error: Incorrect number of nulls for LargeListArray, expected 4 got 3"
"Invalid argument error: Incorrect length of null buffer for LargeListArray, expected 4 got 3"
);

let field = Arc::new(Field::new("element", DataType::Int64, false));
Expand Down
28 changes: 4 additions & 24 deletions arrow-array/src/array/string_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@
// under the License.

use crate::types::GenericStringType;
use crate::{
Array, GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait,
};
use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
use arrow_buffer::{bit_util, MutableBuffer};
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
Expand Down Expand Up @@ -105,27 +103,8 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
pub fn try_from_binary(
v: GenericBinaryArray<OffsetSize>,
) -> Result<Self, ArrowError> {
let offsets = v.value_offsets();
let values = v.value_data();

// We only need to validate that all values are valid UTF-8
let validated = std::str::from_utf8(values).map_err(|e| {
ArrowError::CastError(format!("Encountered non UTF-8 data: {e}"))
})?;

for offset in offsets.iter() {
let o = offset.as_usize();
if !validated.is_char_boundary(o) {
return Err(ArrowError::CastError(format!(
"Split UTF-8 codepoint at offset {o}"
)));
}
}

let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE);
// SAFETY:
// Validated UTF-8 above
Ok(Self::from(unsafe { builder.build_unchecked() }))
let (offsets, values, nulls) = v.into_parts();
Self::try_new(offsets, values, nulls)
}
}

Expand Down Expand Up @@ -261,6 +240,7 @@ mod tests {
use super::*;
use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder};
use crate::types::UInt8Type;
use crate::Array;
use arrow_buffer::Buffer;
use arrow_schema::Field;
use std::sync::Arc;
Expand Down
52 changes: 51 additions & 1 deletion arrow-array/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
use crate::delta::shift_months;
use crate::{ArrowNativeTypeOp, OffsetSizeTrait};
use arrow_buffer::i256;
use arrow_buffer::{i256, Buffer, OffsetBuffer};
use arrow_data::decimal::{validate_decimal256_precision, validate_decimal_precision};
use arrow_schema::{
ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION,
Expand Down Expand Up @@ -1526,10 +1526,18 @@ pub trait ByteArrayType: 'static + Send + Sync + bytes::ByteArrayTypeSealed {
/// Utf8Array will have native type has &str
/// BinaryArray will have type as [u8]
type Native: bytes::ByteArrayNativeType + AsRef<Self::Native> + AsRef<[u8]> + ?Sized;

/// "Binary" or "String", for use in error messages
const PREFIX: &'static str;

/// Datatype of array elements
const DATA_TYPE: DataType;

/// Verifies that every consecutive pair of `offsets` denotes a valid slice of `values`
fn validate(
offsets: &OffsetBuffer<Self::Offset>,
values: &Buffer,
) -> Result<(), ArrowError>;
}

/// [`ByteArrayType`] for string arrays
Expand All @@ -1547,6 +1555,33 @@ impl<O: OffsetSizeTrait> ByteArrayType for GenericStringType<O> {
} else {
DataType::Utf8
};

fn validate(
offsets: &OffsetBuffer<Self::Offset>,
values: &Buffer,
) -> Result<(), ArrowError> {
// Verify that the slice as a whole is valid UTF-8
let validated = std::str::from_utf8(values).map_err(|e| {
ArrowError::InvalidArgumentError(format!("Encountered non UTF-8 data: {e}"))
})?;

// Verify each offset is at a valid character boundary in this UTF-8 array
for offset in offsets.iter() {
let o = offset.as_usize();
if !validated.is_char_boundary(o) {
if o < validated.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"Split UTF-8 codepoint at offset {o}"
)));
}
return Err(ArrowError::InvalidArgumentError(format!(
"Offset of {o} exceeds length of values {}",
validated.len()
)));
}
}
Ok(())
}
}

/// An arrow utf8 array with i32 offsets
Expand All @@ -1569,6 +1604,21 @@ impl<O: OffsetSizeTrait> ByteArrayType for GenericBinaryType<O> {
} else {
DataType::Binary
};

fn validate(
offsets: &OffsetBuffer<Self::Offset>,
values: &Buffer,
) -> Result<(), ArrowError> {
// offsets are guaranteed to be monotonically increasing and non-empty
let max_offset = offsets.last().unwrap().as_usize();
if values.len() < max_offset {
return Err(ArrowError::InvalidArgumentError(format!(
"Maximum offset of {max_offset} is larger than values of length {}",
values.len()
)));
}
Ok(())
}
}

/// An arrow binary array with i32 offsets
Expand Down

0 comments on commit 1097203

Please sign in to comment.