Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ByteArray constructors (#3879) #4081

Merged
merged 5 commits into from
Apr 19, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 138 additions & 2 deletions arrow-array/src/array/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ use crate::iterator::ArrayIter;
use crate::types::bytes::ByteArrayNativeType;
use crate::types::ByteArrayType;
use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait};
use arrow_buffer::{ArrowNativeType, Buffer};
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
use arrow_buffer::{NullBuffer, OffsetBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::DataType;
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::sync::Arc;

Expand Down Expand Up @@ -60,6 +60,85 @@ impl<T: ByteArrayType> GenericByteArray<T> {
/// Data type of the array.
pub const DATA_TYPE: DataType = T::DATA_TYPE;

/// Create a new [`GenericByteArray`] from the provided parts, panicking on failure
///
/// # Panics
///
/// Panics if [`GenericByteArray::try_new`] returns an error
pub fn new(
offsets: OffsetBuffer<T::Offset>,
values: Buffer,
nulls: Option<NullBuffer>,
) -> Self {
Self::try_new(offsets, values, nulls).unwrap()
}

/// Create a new [`GenericByteArray`] from the provided parts, returning an error on failure
///
/// # Errors
///
/// * `offsets.len() - 1 != nulls.len()`
/// * Any consecutive pair of `offsets` does not denote a valid slice of `values`
pub fn try_new(
offsets: OffsetBuffer<T::Offset>,
values: Buffer,
nulls: Option<NullBuffer>,
) -> Result<Self, ArrowError> {
let len = offsets.len() - 1;
T::validate(&offsets, &values)?;

if let Some(n) = nulls.as_ref() {
if n.len() != len {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we need to also check the length of values buffer?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is covered by T::validate, will add a comment

return Err(ArrowError::InvalidArgumentError(format!(
"Incorrect length of null buffer for {}{}Array, expected {len} got {}",
T::Offset::PREFIX,
T::PREFIX,
n.len(),
)));
}
}

Ok(Self {
data_type: T::DATA_TYPE,
value_offsets: offsets,
value_data: values,
nulls,
})
}

/// Create a new [`GenericByteArray`] from the provided parts, without validation
///
/// # Safety
///
/// Safe if [`Self::try_new`] would not error
pub fn new_unchecked(
offsets: OffsetBuffer<T::Offset>,
values: Buffer,
nulls: Option<NullBuffer>,
) -> Self {
Self {
data_type: T::DATA_TYPE,
value_offsets: offsets,
value_data: values,
nulls,
}
}

/// Create a new [`GenericByteArray`] of length `len` where all values are null
pub fn new_null(len: usize) -> Self {
Self {
data_type: T::DATA_TYPE,
value_offsets: OffsetBuffer::new_zeroed(len),
value_data: MutableBuffer::new(0).into(),
nulls: Some(NullBuffer::new_null(len)),
}
}

/// Deconstruct this array into its constituent parts
pub fn into_parts(self) -> (OffsetBuffer<T::Offset>, Buffer, Option<NullBuffer>) {
(self.value_offsets, self.value_data, self.nulls)
}

/// Returns the length for value at index `i`.
/// # Panics
/// Panics if index `i` is out of bounds.
Expand Down Expand Up @@ -374,3 +453,60 @@ impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray<T> {
ArrayIter::new(self)
}
}

#[cfg(test)]
mod tests {
use crate::{BinaryArray, StringArray};
use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer};

#[test]
fn try_new() {
let data = Buffer::from_slice_ref("helloworld");
let offsets = OffsetBuffer::new(vec![0, 5, 10].into());
StringArray::new(offsets.clone(), data.clone(), None);

let nulls = NullBuffer::new_null(3);
let err =
StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone()))
.unwrap_err();
assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3");

let err =
BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err();
assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3");

let non_utf8_data = Buffer::from_slice_ref(b"he\xFFloworld");
let err = StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None)
.unwrap_err();
assert_eq!(err.to_string(), "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2");

BinaryArray::new(offsets, non_utf8_data, None);

let offsets = OffsetBuffer::new(vec![0, 5, 11].into());
let err = StringArray::try_new(offsets.clone(), data.clone(), None).unwrap_err();
assert_eq!(
err.to_string(),
"Invalid argument error: Offset of 11 exceeds length of values 10"
);

let err = BinaryArray::try_new(offsets.clone(), data, None).unwrap_err();
assert_eq!(
err.to_string(),
"Invalid argument error: Maximum offset of 11 is larger than values of length 10"
);

let non_ascii_data = Buffer::from_slice_ref("heìloworld");
StringArray::new(offsets.clone(), non_ascii_data.clone(), None);
BinaryArray::new(offsets, non_ascii_data.clone(), None);

let offsets = OffsetBuffer::new(vec![0, 3, 10].into());
let err = StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None)
.unwrap_err();
assert_eq!(
err.to_string(),
"Invalid argument error: Split UTF-8 codepoint at offset 3"
);

BinaryArray::new(offsets, non_ascii_data, None);
}
}
4 changes: 2 additions & 2 deletions arrow-array/src/array/list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
if let Some(n) = nulls.as_ref() {
if n.len() != len {
return Err(ArrowError::InvalidArgumentError(format!(
"Incorrect number of nulls for {}ListArray, expected {len} got {}",
"Incorrect length of null buffer for {}ListArray, expected {len} got {}",
OffsetSize::PREFIX,
n.len(),
)));
Expand Down Expand Up @@ -1137,7 +1137,7 @@ mod tests {

assert_eq!(
err.to_string(),
"Invalid argument error: Incorrect number of nulls for LargeListArray, expected 4 got 3"
"Invalid argument error: Incorrect length of null buffer for LargeListArray, expected 4 got 3"
);

let field = Arc::new(Field::new("element", DataType::Int64, false));
Expand Down
28 changes: 4 additions & 24 deletions arrow-array/src/array/string_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@
// under the License.

use crate::types::GenericStringType;
use crate::{
Array, GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait,
};
use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
use arrow_buffer::{bit_util, MutableBuffer};
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
Expand Down Expand Up @@ -105,27 +103,8 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
pub fn try_from_binary(
v: GenericBinaryArray<OffsetSize>,
) -> Result<Self, ArrowError> {
let offsets = v.value_offsets();
let values = v.value_data();

// We only need to validate that all values are valid UTF-8
let validated = std::str::from_utf8(values).map_err(|e| {
ArrowError::CastError(format!("Encountered non UTF-8 data: {e}"))
})?;

for offset in offsets.iter() {
let o = offset.as_usize();
if !validated.is_char_boundary(o) {
return Err(ArrowError::CastError(format!(
"Split UTF-8 codepoint at offset {o}"
)));
}
}

let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE);
// SAFETY:
// Validated UTF-8 above
Ok(Self::from(unsafe { builder.build_unchecked() }))
let (offsets, values, nulls) = v.into_parts();
Self::try_new(offsets, values, nulls)
}
}

Expand Down Expand Up @@ -261,6 +240,7 @@ mod tests {
use super::*;
use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder};
use crate::types::UInt8Type;
use crate::Array;
use arrow_buffer::Buffer;
use arrow_schema::Field;
use std::sync::Arc;
Expand Down
52 changes: 51 additions & 1 deletion arrow-array/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

use crate::delta::shift_months;
use crate::{ArrowNativeTypeOp, OffsetSizeTrait};
use arrow_buffer::i256;
use arrow_buffer::{i256, Buffer, OffsetBuffer};
use arrow_data::decimal::{validate_decimal256_precision, validate_decimal_precision};
use arrow_schema::{
ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION,
Expand Down Expand Up @@ -1526,10 +1526,18 @@ pub trait ByteArrayType: 'static + Send + Sync + bytes::ByteArrayTypeSealed {
/// Utf8Array will have native type has &str
/// BinaryArray will have type as [u8]
type Native: bytes::ByteArrayNativeType + AsRef<Self::Native> + AsRef<[u8]> + ?Sized;

/// "Binary" or "String", for use in error messages
const PREFIX: &'static str;

/// Datatype of array elements
const DATA_TYPE: DataType;

/// Verifies that every consecutive pair of `offsets` denotes a valid slice of `values`
fn validate(
offsets: &OffsetBuffer<Self::Offset>,
values: &Buffer,
) -> Result<(), ArrowError>;
}

/// [`ByteArrayType`] for string arrays
Expand All @@ -1547,6 +1555,33 @@ impl<O: OffsetSizeTrait> ByteArrayType for GenericStringType<O> {
} else {
DataType::Utf8
};

fn validate(
offsets: &OffsetBuffer<Self::Offset>,
values: &Buffer,
) -> Result<(), ArrowError> {
// Verify that the slice as a whole is valid UTF-8
let validated = std::str::from_utf8(values).map_err(|e| {
ArrowError::InvalidArgumentError(format!("Encountered non UTF-8 data: {e}"))
})?;

// Verify each offset is at a valid character boundary in this UTF-8 array
for offset in offsets.iter() {
let o = offset.as_usize();
if !validated.is_char_boundary(o) {
if o < validated.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"Split UTF-8 codepoint at offset {o}"
)));
}
return Err(ArrowError::InvalidArgumentError(format!(
"Offset of {o} exceeds length of values {}",
validated.len()
)));
}
}
Ok(())
}
}

/// An arrow utf8 array with i32 offsets
Expand All @@ -1569,6 +1604,21 @@ impl<O: OffsetSizeTrait> ByteArrayType for GenericBinaryType<O> {
} else {
DataType::Binary
};

fn validate(
offsets: &OffsetBuffer<Self::Offset>,
values: &Buffer,
) -> Result<(), ArrowError> {
// offsets are guaranteed to be monotonically increasing and non-empty
let max_offset = offsets.last().unwrap().as_usize();
if values.len() < max_offset {
return Err(ArrowError::InvalidArgumentError(format!(
"Maximum offset of {max_offset} is larger than values of length {}",
values.len()
Comment on lines +1616 to +1617
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"Maximum offset of {max_offset} is larger than values of length {}",
values.len()
"Maximum offset of {max_offset} is larger than length of values {}",
values.len()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the phrasing before is grammatically more correct

)));
}
Ok(())
}
}

/// An arrow binary array with i32 offsets
Expand Down