Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Added try_new and new to all arrays (#873)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao authored Mar 1, 2022
1 parent 5d0db54 commit eb4bc5d
Show file tree
Hide file tree
Showing 14 changed files with 1,038 additions and 370 deletions.
11 changes: 5 additions & 6 deletions src/array/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,16 @@ This document describes the overall design of this module.

* An array with a null bitmap MUST implement it as `Option<Bitmap>`

* An array MUST be `#[derive(Debug, Clone)]`
* An array MUST be `#[derive(Clone)]`

* The trait `Array` MUST only be implemented by structs in this module.

* Every child array on the struct MUST be `Arc<dyn Array>`. This enables the struct to be clonable.

* An array MUST implement `from_data(...) -> Self`. This method MUST panic iff:
* the data does not follow the arrow specification
* the arguments lead to unsound code (e.g. a Utf8 array MUST verify that its each item is valid `utf8`)
* An array MUST implement `try_new(...) -> Self`. This method MUST error iff
the data does not follow the arrow specification, including any sentinel types such as utf8.

* An array MAY implement `unsafe from_data_unchecked` that skips the soundness validation. `from_data_unchecked` MUST panic if the specification is incorrect.
* An array MAY implement `unsafe try_new_unchecked` that skips validation steps that are `O(N)`.

* An array MUST implement either `new_empty()` or `new_empty(DataType)` that returns a zero-len of `Self`.

Expand All @@ -36,7 +35,7 @@ This document describes the overall design of this module.

* functions to create new arrays from native Rust SHOULD be named as follows:
  * `from`: from a slice of optional values (e.g. `AsRef<[Option<bool>]>` for `BooleanArray`)
* `from_slice`: from a slice of values (e.g. `AsRef<[bool]` for `BooleanArray`)
* `from_slice`: from a slice of values (e.g. `AsRef<[bool]>` for `BooleanArray`)
* `from_trusted_len_iter` from an iterator of trusted len of optional values
* `from_trusted_len_values_iter` from an iterator of trusted len of values
  * `try_from_trusted_len_iter` from a fallible iterator of trusted len of optional values
Expand Down
171 changes: 120 additions & 51 deletions src/array/binary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::{
};

use super::{
specification::{check_offsets_minimal, try_check_offsets},
specification::{try_check_offsets, try_check_offsets_bounds},
Array, GenericBinaryArray, Offset,
};

Expand All @@ -33,9 +33,77 @@ pub struct BinaryArray<O: Offset> {

// constructors
impl<O: Offset> BinaryArray<O> {
/// Creates a new [`BinaryArray`].
///
/// # Errors
/// This function returns an error iff:
/// * the offsets are not monotonically increasing
/// * the last offset is not equal to the values' length
/// * the validity's length is not equal to `offsets.len() - 1`
/// * the `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`
/// # Implementation
/// This function is `O(N)` - checking monotonicity is `O(N)`
pub fn try_new(
    data_type: DataType,
    offsets: Buffer<O>,
    values: Buffer<u8>,
    validity: Option<Bitmap>,
) -> Result<Self> {
    // O(N): verifies that offsets are monotonically increasing and within the values' bounds.
    try_check_offsets(&offsets, values.len())?;

    // A validity bitmap, when present, must carry exactly one bit per slot.
    if let Some(validity) = &validity {
        if validity.len() != offsets.len() - 1 {
            return Err(ArrowError::oos(
                "validity mask length must match the number of values",
            ));
        }
    }

    // Only the physical type is compared, so logical types backed by
    // (Large)Binary are accepted as well.
    if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
        return Err(ArrowError::oos(
            "BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary",
        ));
    }

    Ok(Self {
        data_type,
        offsets,
        values,
        validity,
    })
}

/// Creates a new [`BinaryArray`].
/// # Panics
/// This function panics iff:
/// * the offsets are not monotonically increasing
/// * the last offset is not equal to the values' length
/// * the validity's length is not equal to `offsets.len() - 1`
/// * the `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`
/// # Implementation
/// This function is `O(N)` - checking monotonicity is `O(N)`
pub fn new(
    data_type: DataType,
    offsets: Buffer<O>,
    values: Buffer<u8>,
    validity: Option<Bitmap>,
) -> Self {
    // Delegate validation to `try_new`; an invalid combination of arguments panics here.
    let array = Self::try_new(data_type, offsets, values, validity);
    array.unwrap()
}

/// Alias for `new`
pub fn from_data(
    data_type: DataType,
    offsets: Buffer<O>,
    values: Buffer<u8>,
    validity: Option<Bitmap>,
) -> Self {
    // Kept for backwards compatibility; identical to `new` (validate, then panic on error).
    Self::try_new(data_type, offsets, values, validity).unwrap()
}

/// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero.
pub fn new_empty(data_type: DataType) -> Self {
Self::from_data(
Self::new(
data_type,
Buffer::from(vec![O::zero()]),
Buffer::new(),
Expand All @@ -46,43 +114,45 @@ impl<O: Offset> BinaryArray<O> {
/// Creates an null [`BinaryArray`], i.e. whose `.null_count() == .len()`.
#[inline]
pub fn new_null(data_type: DataType, length: usize) -> Self {
Self::from_data(
Self::new(
data_type,
Buffer::new_zeroed(length + 1),
Buffer::new(),
Some(Bitmap::new_zeroed(length)),
)
}

/// Creates a new [`BinaryArray`] from lower-level parts
/// # Panics
/// * the offsets are not monotonically increasing
/// * The last offset is not equal to the values' length.
/// * the validity's length is not equal to `offsets.len() - 1`.
/// * The `data_type`'s physical type is not equal to `Binary` or `LargeBinary`.
pub fn from_data(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self {
Self::try_new(data_type, offsets, values, validity).unwrap()
/// Returns the default [`DataType`], `DataType::Binary` or `DataType::LargeBinary`
pub fn default_data_type() -> DataType {
if O::is_large() {
DataType::LargeBinary
} else {
DataType::Binary
}
}
}

/// Creates a new [`BinaryArray`] from lower-level parts.
// unsafe constructors
impl<O: Offset> BinaryArray<O> {
/// Creates a new [`BinaryArray`] without checking for offsets monotinicity.
///
/// # Errors
/// This function returns an error iff:
/// * the offsets are not monotonically increasing
/// * The last offset is not equal to the values' length.
/// * the validity's length is not equal to `offsets.len() - 1`.
/// * The `data_type`'s physical type is not equal to `Binary` or `LargeBinary`.
pub fn try_new(
/// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`.
/// # Safety
/// This function is unsafe iff:
/// * the offsets are not monotonically increasing
/// # Implementation
/// This function is `O(1)`
pub unsafe fn try_new_unchecked(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Result<Self> {
try_check_offsets(&offsets, values.len())?;
try_check_offsets_bounds(&offsets, values.len())?;

if validity
.as_ref()
Expand All @@ -107,52 +177,49 @@ impl<O: Offset> BinaryArray<O> {
})
}

/// Returns the default [`DataType`], `DataType::Binary` or `DataType::LargeBinary`
pub fn default_data_type() -> DataType {
if O::is_large() {
DataType::LargeBinary
} else {
DataType::Binary
}
/// Creates a new [`BinaryArray`] without checking for offsets monotonicity.
///
/// # Panics
/// This function panics iff:
/// * The last offset is not equal to the values' length.
/// * the validity's length is not equal to `offsets.len() - 1`.
/// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`.
/// # Safety
/// This function is unsafe iff:
/// * the offsets are not monotonically increasing
/// # Implementation
/// This function is `O(1)`
pub unsafe fn new_unchecked(
    data_type: DataType,
    offsets: Buffer<O>,
    values: Buffer<u8>,
    validity: Option<Bitmap>,
) -> Self {
    // `try_new_unchecked` performs only the O(1) checks; its errors become panics here.
    Self::try_new_unchecked(data_type, offsets, values, validity).unwrap()
}

/// The same as [`BinaryArray::from_data`] but does not check for offsets.
/// Alias for [`new_unchecked`]
/// # Safety
/// * `offsets` MUST be monotonically increasing
/// # Panics
/// This function panics iff:
/// * The `data_type`'s physical type is not consistent with the offset `O`.
/// * The last element of `offsets` is different from `values.len()`.
/// * The validity is not `None` and its length is different from `offsets.len() - 1`.
/// This function is unsafe iff:
/// * the offsets are not monotonically increasing
pub unsafe fn from_data_unchecked(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self {
check_offsets_minimal(&offsets, values.len());

if let Some(validity) = &validity {
assert_eq!(offsets.len() - 1, validity.len());
}

if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
panic!("BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary")
}

Self {
data_type,
offsets,
values,
validity,
}
Self::new_unchecked(data_type, offsets, values, validity)
}
}

// must use
impl<O: Offset> BinaryArray<O> {
/// Creates a new [`BinaryArray`] by slicing this [`BinaryArray`].
/// # Implementation
/// This function is `O(1)`: all data will be shared between both arrays.
/// # Panics
/// iff `offset + length > self.len()`.
#[must_use]
pub fn slice(&self, offset: usize, length: usize) -> Self {
assert!(
offset + length <= self.len(),
Expand All @@ -166,6 +233,7 @@ impl<O: Offset> BinaryArray<O> {
/// This function is `O(1)`: all data will be shared between both arrays.
/// # Safety
/// The caller must ensure that `offset + length <= self.len()`.
#[must_use]
pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self {
let validity = self
.validity
Expand All @@ -183,6 +251,7 @@ impl<O: Offset> BinaryArray<O> {
/// Clones this [`BinaryArray`] with a different validity.
/// # Panic
/// Panics iff `validity.len() != self.len()`.
#[must_use]
pub fn with_validity(&self, validity: Option<Bitmap>) -> Self {
if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) {
panic!("validity's length must be equal to the array's length")
Expand Down
Loading

0 comments on commit eb4bc5d

Please sign in to comment.