Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Added try_new and new to all arrays #873

Merged
merged 4 commits into from
Mar 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions src/array/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,16 @@ This document describes the overall design of this module.

* An array with a null bitmap MUST implement it as `Option<Bitmap>`

* An array MUST be `#[derive(Debug, Clone)]`
* An array MUST be `#[derive(Clone)]`

* The trait `Array` MUST only be implemented by structs in this module.

* Every child array on the struct MUST be `Arc<dyn Array>`. This enables the struct to be clonable.

* An array MUST implement `from_data(...) -> Self`. This method MUST panic iff:
* the data does not follow the arrow specification
* the arguments lead to unsound code (e.g. a Utf8 array MUST verify that its each item is valid `utf8`)
* An array MUST implement `try_new(...) -> Result<Self>`. This method MUST error iff
the data does not follow the arrow specification, including any sentinel types such as utf8.

* An array MAY implement `unsafe from_data_unchecked` that skips the soundness validation. `from_data_unchecked` MUST panic if the specification is incorrect.
* An array MAY implement `unsafe try_new_unchecked` that skips validation steps that are `O(N)`.

* An array MUST implement either `new_empty()` or `new_empty(DataType)` that returns a zero-len of `Self`.

Expand All @@ -36,7 +35,7 @@ This document describes the overall design of this module.

* functions to create new arrays from native Rust SHOULD be named as follows:
* `from`: from a slice of optional values (e.g. `AsRef<[Option<bool>]` for `BooleanArray`)
* `from_slice`: from a slice of values (e.g. `AsRef<[bool]` for `BooleanArray`)
* `from_slice`: from a slice of values (e.g. `AsRef<[bool]>` for `BooleanArray`)
* `from_trusted_len_iter` from an iterator of trusted len of optional values
* `from_trusted_len_values_iter` from an iterator of trusted len of values
* `try_from_trusted_len_iter` from an fallible iterator of trusted len of optional values
Expand Down
171 changes: 120 additions & 51 deletions src/array/binary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::{
};

use super::{
specification::{check_offsets_minimal, try_check_offsets},
specification::{try_check_offsets, try_check_offsets_bounds},
Array, GenericBinaryArray, Offset,
};

Expand All @@ -33,9 +33,77 @@ pub struct BinaryArray<O: Offset> {

// constructors
impl<O: Offset> BinaryArray<O> {
/// Creates a new [`BinaryArray`].
///
/// # Errors
/// This function returns an error iff:
/// * the offsets are not monotonically increasing
/// * The last offset is not equal to the values' length.
/// * the validity's length is not equal to `offsets.len() - 1`.
/// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`.
/// # Implementation
/// This function is `O(N)` - checking monotonicity is `O(N)`
pub fn try_new(
    data_type: DataType,
    offsets: Buffer<O>,
    values: Buffer<u8>,
    validity: Option<Bitmap>,
) -> Result<Self> {
    // Validates that offsets are monotonically increasing and bounded by `values.len()`.
    try_check_offsets(&offsets, values.len())?;

    // An offsets buffer of length N+1 describes N slots; a validity mask, if present,
    // must have exactly that many bits.
    if validity
        .as_ref()
        .map_or(false, |validity| validity.len() != offsets.len() - 1)
    {
        return Err(ArrowError::oos(
            "validity mask length must match the number of values",
        ));
    }

    // Only the physical type is compared, so any logical `data_type` whose physical
    // type is Binary/LargeBinary (matching the offset width `O`) is accepted.
    if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
        return Err(ArrowError::oos(
            "BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary",
        ));
    }

    Ok(Self {
        data_type,
        offsets,
        values,
        validity,
    })
}

/// Creates a new [`BinaryArray`].
/// # Panics
/// This function panics iff:
/// * the offsets are not monotonically increasing
/// * The last offset is not equal to the values' length.
/// * the validity's length is not equal to `offsets.len() - 1`.
/// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`.
/// # Implementation
/// This function is `O(N)` - checking monotonicity is `O(N)`
pub fn new(
    data_type: DataType,
    offsets: Buffer<O>,
    values: Buffer<u8>,
    validity: Option<Bitmap>,
) -> Self {
    // Panicking counterpart of `try_new`: same validation, but errors become panics.
    Self::try_new(data_type, offsets, values, validity).unwrap()
}

/// Alias for [`BinaryArray::new`].
pub fn from_data(
    data_type: DataType,
    offsets: Buffer<O>,
    values: Buffer<u8>,
    validity: Option<Bitmap>,
) -> Self {
    // Delegate to the checked constructor; invalid data panics, exactly as `new` does.
    Self::try_new(data_type, offsets, values, validity).unwrap()
}

/// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero.
pub fn new_empty(data_type: DataType) -> Self {
Self::from_data(
Self::new(
data_type,
Buffer::from(vec![O::zero()]),
Buffer::new(),
Expand All @@ -46,43 +114,45 @@ impl<O: Offset> BinaryArray<O> {
/// Creates an null [`BinaryArray`], i.e. whose `.null_count() == .len()`.
#[inline]
pub fn new_null(data_type: DataType, length: usize) -> Self {
Self::from_data(
Self::new(
data_type,
Buffer::new_zeroed(length + 1),
Buffer::new(),
Some(Bitmap::new_zeroed(length)),
)
}

/// Creates a new [`BinaryArray`] from lower-level parts
/// # Panics
/// * the offsets are not monotonically increasing
/// * The last offset is not equal to the values' length.
/// * the validity's length is not equal to `offsets.len() - 1`.
/// * The `data_type`'s physical type is not equal to `Binary` or `LargeBinary`.
pub fn from_data(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self {
Self::try_new(data_type, offsets, values, validity).unwrap()
/// Returns the default [`DataType`], `DataType::Binary` or `DataType::LargeBinary`
pub fn default_data_type() -> DataType {
    // Large offsets map to the `LargeBinary` variant; small offsets to `Binary`.
    match O::is_large() {
        true => DataType::LargeBinary,
        false => DataType::Binary,
    }
}
}

/// Creates a new [`BinaryArray`] from lower-level parts.
// unsafe constructors
impl<O: Offset> BinaryArray<O> {
/// Creates a new [`BinaryArray`] without checking that the offsets are monotonically increasing.
///
/// # Errors
/// This function returns an error iff:
/// * The last offset is not equal to the values' length.
/// * the validity's length is not equal to `offsets.len() - 1`.
/// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`.
/// # Safety
/// This function is unsafe iff:
/// * the offsets are not monotonically increasing
/// # Implementation
/// This function is `O(1)`
pub unsafe fn try_new_unchecked(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Result<Self> {
try_check_offsets(&offsets, values.len())?;
try_check_offsets_bounds(&offsets, values.len())?;

if validity
.as_ref()
Expand All @@ -107,52 +177,49 @@ impl<O: Offset> BinaryArray<O> {
})
}

/// Returns the default [`DataType`], `DataType::Binary` or `DataType::LargeBinary`
pub fn default_data_type() -> DataType {
if O::is_large() {
DataType::LargeBinary
} else {
DataType::Binary
}
/// Creates a new [`BinaryArray`] without checking that the offsets are monotonically increasing.
///
/// # Panics
/// This function panics iff:
/// * The last offset is not equal to the values' length.
/// * the validity's length is not equal to `offsets.len() - 1`.
/// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`.
/// # Safety
/// This function is unsafe iff:
/// * the offsets are not monotonically increasing
/// # Implementation
/// This function is `O(1)`
pub unsafe fn new_unchecked(
    data_type: DataType,
    offsets: Buffer<O>,
    values: Buffer<u8>,
    validity: Option<Bitmap>,
) -> Self {
    // Panicking counterpart of `try_new_unchecked`: O(1) checks only, errors become panics.
    Self::try_new_unchecked(data_type, offsets, values, validity).unwrap()
}

/// The same as [`BinaryArray::from_data`] but does not check for offsets.
/// Alias for [`new_unchecked`]
/// # Safety
/// * `offsets` MUST be monotonically increasing
/// # Panics
/// This function panics iff:
/// * The `data_type`'s physical type is not consistent with the offset `O`.
/// * The last element of `offsets` is different from `values.len()`.
/// * The validity is not `None` and its length is different from `offsets.len() - 1`.
/// This function is unsafe iff:
/// * the offsets are not monotonically increasing
pub unsafe fn from_data_unchecked(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self {
check_offsets_minimal(&offsets, values.len());

if let Some(validity) = &validity {
assert_eq!(offsets.len() - 1, validity.len());
}

if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
panic!("BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary")
}

Self {
data_type,
offsets,
values,
validity,
}
Self::new_unchecked(data_type, offsets, values, validity)
}
}

// must use
impl<O: Offset> BinaryArray<O> {
/// Creates a new [`BinaryArray`] by slicing this [`BinaryArray`].
/// # Implementation
/// This function is `O(1)`: all data will be shared between both arrays.
/// # Panics
/// iff `offset + length > self.len()`.
#[must_use]
pub fn slice(&self, offset: usize, length: usize) -> Self {
assert!(
offset + length <= self.len(),
Expand All @@ -166,6 +233,7 @@ impl<O: Offset> BinaryArray<O> {
/// This function is `O(1)`: all data will be shared between both arrays.
/// # Safety
/// The caller must ensure that `offset + length <= self.len()`.
#[must_use]
pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self {
let validity = self
.validity
Expand All @@ -183,6 +251,7 @@ impl<O: Offset> BinaryArray<O> {
/// Clones this [`BinaryArray`] with a different validity.
/// # Panic
/// Panics iff `validity.len() != self.len()`.
#[must_use]
pub fn with_validity(&self, validity: Option<Bitmap>) -> Self {
if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) {
panic!("validity's length must be equal to the array's length")
Expand Down
Loading