diff --git a/src/array/README.md b/src/array/README.md index a814839527c..ca5376c214c 100644 --- a/src/array/README.md +++ b/src/array/README.md @@ -16,17 +16,16 @@ This document describes the overall design of this module. * An array with a null bitmap MUST implement it as `Option` -* An array MUST be `#[derive(Debug, Clone)]` +* An array MUST be `#[derive(Clone)]` * The trait `Array` MUST only be implemented by structs in this module. * Every child array on the struct MUST be `Arc`. This enables the struct to be clonable. -* An array MUST implement `from_data(...) -> Self`. This method MUST panic iff: - * the data does not follow the arrow specification - * the arguments lead to unsound code (e.g. a Utf8 array MUST verify that its each item is valid `utf8`) +* An array MUST implement `try_new(...) -> Self`. This method MUST error iff + the data does not follow the arrow specification, including any sentinel types such as utf8. -* An array MAY implement `unsafe from_data_unchecked` that skips the soundness validation. `from_data_unchecked` MUST panic if the specification is incorrect. +* An array MAY implement `unsafe try_new_unchecked` that skips validation steps that are `O(N)`. * An array MUST implement either `new_empty()` or `new_empty(DataType)` that returns a zero-len of `Self`. @@ -36,7 +35,7 @@ This document describes the overall design of this module. * functions to create new arrays from native Rust SHOULD be named as follows: * `from`: from a slice of optional values (e.g. `AsRef<[Option]` for `BooleanArray`) - * `from_slice`: from a slice of values (e.g. `AsRef<[bool]` for `BooleanArray`) + * `from_slice`: from a slice of values (e.g. `AsRef<[bool]>` for `BooleanArray`) * `from_trusted_len_iter` from an iterator of trusted len of optional values * `from_trusted_len_values_iter` from an iterator of trusted len of values * `try_from_trusted_len_iter` from an fallible iterator of trusted len of optional values diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index 437854eba4c..934ff811f67 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -6,7 +6,7 @@ use crate::{ }; use super::{ - specification::{check_offsets_minimal, try_check_offsets}, + specification::{try_check_offsets, try_check_offsets_bounds}, Array, GenericBinaryArray, Offset, }; @@ -33,9 +33,77 @@ pub struct BinaryArray { // constructors impl BinaryArray { + /// Creates a new [`BinaryArray`]. + /// + /// # Errors + /// This function returns an error iff: + /// * the offsets are not monotonically increasing + /// * The last offset is not equal to the values' length. + /// * the validity's length is not equal to `offsets.len() - 1`. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. 
+ /// # Implementation + /// This function is `O(N)` - checking monotinicity is `O(N)` + pub fn try_new( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Result { + try_check_offsets(&offsets, values.len())?; + + if validity + .as_ref() + .map_or(false, |validity| validity.len() != offsets.len() - 1) + { + return Err(ArrowError::oos( + "validity mask length must match the number of values", + )); + } + + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { + return Err(ArrowError::oos( + "BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary", + )); + } + + Ok(Self { + data_type, + offsets, + values, + validity, + }) + } + + /// Creates a new [`BinaryArray`]. + /// # Panics + /// * the offsets are not monotonically increasing + /// * The last offset is not equal to the values' length. + /// * the validity's length is not equal to `offsets.len() - 1`. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. + /// # Implementation + /// This function is `O(N)` - checking monotinicity is `O(N)` + pub fn new( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self { + Self::try_new(data_type, offsets, values, validity).unwrap() + } + + /// Alias for `new` + pub fn from_data( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self { + Self::new(data_type, offsets, values, validity) + } + /// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero. pub fn new_empty(data_type: DataType) -> Self { - Self::from_data( + Self::new( data_type, Buffer::from(vec![O::zero()]), Buffer::new(), @@ -46,7 +114,7 @@ impl BinaryArray { /// Creates an null [`BinaryArray`], i.e. whose `.null_count() == .len()`. #[inline] pub fn new_null(data_type: DataType, length: usize) -> Self { - Self::from_data( + Self::new( data_type, Buffer::new_zeroed(length + 1), Buffer::new(), @@ -54,35 +122,37 @@ impl BinaryArray { ) } - /// Creates a new [`BinaryArray`] from lower-level parts - /// # Panics - /// * the offsets are not monotonically increasing - /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. - /// * The `data_type`'s physical type is not equal to `Binary` or `LargeBinary`. - pub fn from_data( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Self { - Self::try_new(data_type, offsets, values, validity).unwrap() + /// Returns the default [`DataType`], `DataType::Binary` or `DataType::LargeBinary` + pub fn default_data_type() -> DataType { + if O::is_large() { + DataType::LargeBinary + } else { + DataType::Binary + } } +} - /// Creates a new [`BinaryArray`] from lower-level parts. +// unsafe constructors +impl BinaryArray { + /// Creates a new [`BinaryArray`] without checking for offsets monotinicity. /// + /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. /// * the validity's length is not equal to `offsets.len() - 1`. - /// * The `data_type`'s physical type is not equal to `Binary` or `LargeBinary`. - pub fn try_new( + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. 
+ /// # Safety + /// This function is unsafe iff: + /// * the offsets are not monotonically increasing + /// # Implementation + /// This function is `O(1)` + pub unsafe fn try_new_unchecked( data_type: DataType, offsets: Buffer, values: Buffer, validity: Option, ) -> Result { - try_check_offsets(&offsets, values.len())?; + try_check_offsets_bounds(&offsets, values.len())?; if validity .as_ref() @@ -107,52 +177,49 @@ impl BinaryArray { }) } - /// Returns the default [`DataType`], `DataType::Binary` or `DataType::LargeBinary` - pub fn default_data_type() -> DataType { - if O::is_large() { - DataType::LargeBinary - } else { - DataType::Binary - } + /// Creates a new [`BinaryArray`] without checking for offsets monotinicity. + /// + /// # Panics + /// This function returns an error iff: + /// * The last offset is not equal to the values' length. + /// * the validity's length is not equal to `offsets.len() - 1`. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. + /// # Safety + /// This function is unsafe iff: + /// * the offsets are not monotonically increasing + /// # Implementation + /// This function is `O(1)` + pub unsafe fn new_unchecked( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self { + Self::try_new_unchecked(data_type, offsets, values, validity).unwrap() } - /// The same as [`BinaryArray::from_data`] but does not check for offsets. + /// Alias for [`new_unchecked`] /// # Safety - /// * `offsets` MUST be monotonically increasing - /// # Panics - /// This function panics iff: - /// * The `data_type`'s physical type is not consistent with the offset `O`. - /// * The last element of `offsets` is different from `values.len()`. - /// * The validity is not `None` and its length is different from `offsets.len() - 1`. + /// This function is unsafe iff: + /// * the offsets are not monotonically increasing pub unsafe fn from_data_unchecked( data_type: DataType, offsets: Buffer, values: Buffer, validity: Option, ) -> Self { - check_offsets_minimal(&offsets, values.len()); - - if let Some(validity) = &validity { - assert_eq!(offsets.len() - 1, validity.len()); - } - - if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { - panic!("BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary") - } - - Self { - data_type, - offsets, - values, - validity, - } + Self::new_unchecked(data_type, offsets, values, validity) } +} +// must use +impl BinaryArray { /// Creates a new [`BinaryArray`] by slicing this [`BinaryArray`]. /// # Implementation /// This function is `O(1)`: all data will be shared between both arrays. /// # Panics /// iff `offset + length > self.len()`. + #[must_use] pub fn slice(&self, offset: usize, length: usize) -> Self { assert!( offset + length <= self.len(), @@ -166,6 +233,7 @@ impl BinaryArray { /// This function is `O(1)`: all data will be shared between both arrays. /// # Safety /// The caller must ensure that `offset + length <= self.len()`. + #[must_use] pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self { let validity = self .validity @@ -183,6 +251,7 @@ impl BinaryArray { /// Clones this [`BinaryArray`] with a different validity. /// # Panic /// Panics iff `validity.len() != self.len()`. 
+ #[must_use] pub fn with_validity(&self, validity: Option) -> Self { if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) { panic!("validity's length must be equal to the array's length") diff --git a/src/array/boolean/mod.rs b/src/array/boolean/mod.rs index 0c5126a2707..4ee5ccfa33e 100644 --- a/src/array/boolean/mod.rs +++ b/src/array/boolean/mod.rs @@ -1,6 +1,7 @@ use crate::{ bitmap::Bitmap, datatypes::{DataType, PhysicalType}, + error::ArrowError, }; use either::Either; @@ -25,6 +26,52 @@ pub struct BooleanArray { } impl BooleanArray { + /// The canonical method to create a [`BooleanArray`] out of low-end APIs. + /// # Errors + /// This function errors iff: + /// * The validity is not `None` and its length is different from `values`'s length + /// * The `data_type`'s [`PhysicalType`] is not equal to [`PhysicalType::Boolean`]. + pub fn try_new( + data_type: DataType, + values: Bitmap, + validity: Option, + ) -> Result { + if validity + .as_ref() + .map_or(false, |validity| validity.len() != values.len()) + { + return Err(ArrowError::oos( + "validity mask length must match the number of values", + )); + } + + if data_type.to_physical_type() != PhysicalType::Boolean { + return Err(ArrowError::oos( + "BooleanArray can only be initialized with a DataType whose physical type is Boolean", + )); + } + + Ok(Self { + data_type, + values, + validity, + }) + } + + /// The canonical method to create a [`BooleanArray`] + /// # Panics + /// This function errors iff: + /// * The validity is not `None` and its length is different from `values`'s length + /// * The `data_type`'s [`PhysicalType`] is not equal to [`PhysicalType::Boolean`]. + pub fn new(data_type: DataType, values: Bitmap, validity: Option) -> Self { + Self::try_new(data_type, values, validity).unwrap() + } + + /// Alias for `new` + pub fn from_data(data_type: DataType, values: Bitmap, validity: Option) -> Self { + Self::new(data_type, values, validity) + } + /// Returns a new empty [`BooleanArray`]. pub fn new_empty(data_type: DataType) -> Self { Self::from_data(data_type, Bitmap::new(), None) @@ -35,31 +82,17 @@ impl BooleanArray { let bitmap = Bitmap::new_zeroed(length); Self::from_data(data_type, bitmap.clone(), Some(bitmap)) } +} - /// The canonical method to create a [`BooleanArray`] out of low-end APIs. - /// # Panics - /// This function panics iff: - /// * The validity is not `None` and its length is different from `values`'s length - pub fn from_data(data_type: DataType, values: Bitmap, validity: Option) -> Self { - if let Some(ref validity) = validity { - assert_eq!(values.len(), validity.len()); - } - if data_type.to_physical_type() != PhysicalType::Boolean { - panic!("BooleanArray can only be initialized with DataType::Boolean") - } - Self { - data_type, - values, - validity, - } - } - +// must use +impl BooleanArray { /// Returns a slice of this [`BooleanArray`]. /// # Implementation /// This operation is `O(1)` as it amounts to increase two ref counts. /// # Panic /// This function panics iff `offset + length >= self.len()`. #[inline] + #[must_use] pub fn slice(&self, offset: usize, length: usize) -> Self { assert!( offset + length <= self.len(), @@ -74,6 +107,7 @@ impl BooleanArray { /// # Safety /// The caller must ensure that `offset + length <= self.len()`. #[inline] + #[must_use] pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self { let validity = self .validity @@ -89,6 +123,7 @@ impl BooleanArray { /// Sets the validity bitmap on this [`BooleanArray`]. 
/// # Panic /// This function panics iff `validity.len() != self.len()`. + #[must_use] pub fn with_validity(&self, validity: Option) -> Self { if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) { panic!("validity should be as least as large as the array") @@ -98,7 +133,7 @@ impl BooleanArray { arr } - /// Try to convert this `BooleanArray` to a `MutableBooleanArray` + /// Try to convert this [`BooleanArray`] to a [`MutableBooleanArray`] pub fn into_mut(self) -> Either { use Either::*; diff --git a/src/array/fixed_size_binary/mod.rs b/src/array/fixed_size_binary/mod.rs index f796340c93a..935fbd4ac54 100644 --- a/src/array/fixed_size_binary/mod.rs +++ b/src/array/fixed_size_binary/mod.rs @@ -1,4 +1,4 @@ -use crate::{bitmap::Bitmap, buffer::Buffer, datatypes::DataType, error::Result}; +use crate::{bitmap::Bitmap, buffer::Buffer, datatypes::DataType, error::ArrowError}; use super::Array; @@ -19,6 +19,61 @@ pub struct FixedSizeBinaryArray { } impl FixedSizeBinaryArray { + /// Creates a new [`FixedSizeBinaryArray`]. + /// + /// # Errors + /// This function returns an error iff: + /// * The `data_type`'s physical type is not [`crate::datatypes::PhysicalType::FixedSizeBinary`] + /// * The length of `values` is not a multiple of `size` in `data_type` + /// * the validity's length is not equal to `values.len() / size`. + pub fn try_new( + data_type: DataType, + values: Buffer, + validity: Option, + ) -> Result { + let size = Self::maybe_get_size(&data_type)?; + + if values.len() % size != 0 { + return Err(ArrowError::oos(format!( + "values (of len {}) must be a multiple of size ({}) in FixedSizeBinaryArray.", + values.len(), + size + ))); + } + let len = values.len() / size; + + if validity + .as_ref() + .map_or(false, |validity| validity.len() != len) + { + return Err(ArrowError::oos( + "validity mask length must be equal to the number of values divided by size", + )); + } + + Ok(Self { + size, + data_type, + values, + validity, + }) + } + + /// Creates a new [`FixedSizeBinaryArray`]. + /// # Panics + /// This function panics iff: + /// * The `data_type`'s physical type is not [`crate::datatypes::PhysicalType::FixedSizeBinary`] + /// * The length of `values` is not a multiple of `size` in `data_type` + /// * the validity's length is not equal to `values.len() / size`. + pub fn new(data_type: DataType, values: Buffer, validity: Option) -> Self { + Self::try_new(data_type, values, validity).unwrap() + } + + /// Alias for `new` + pub fn from_data(data_type: DataType, values: Buffer, validity: Option) -> Self { + Self::new(data_type, values, validity) + } + /// Returns a new empty [`FixedSizeBinaryArray`]. pub fn new_empty(data_type: DataType) -> Self { Self::from_data(data_type, Buffer::new(), None) @@ -32,30 +87,16 @@ impl FixedSizeBinaryArray { Some(Bitmap::new_zeroed(length)), ) } +} - /// Returns a new [`FixedSizeBinaryArray`]. - pub fn from_data(data_type: DataType, values: Buffer, validity: Option) -> Self { - let size = Self::get_size(&data_type); - - assert_eq!(values.len() % size, 0); - - if let Some(ref validity) = validity { - assert_eq!(values.len() / size, validity.len()); - } - - Self { - size, - data_type, - values, - validity, - } - } - +// must use +impl FixedSizeBinaryArray { /// Returns a slice of this [`FixedSizeBinaryArray`]. /// # Implementation /// This operation is `O(1)` as it amounts to increase 3 ref counts. 
/// # Panics /// panics iff `offset + length > self.len()` + #[must_use] pub fn slice(&self, offset: usize, length: usize) -> Self { assert!( offset + length <= self.len(), @@ -69,6 +110,7 @@ impl FixedSizeBinaryArray { /// This operation is `O(1)` as it amounts to increase 3 ref counts. /// # Safety /// The caller must ensure that `offset + length <= self.len()`. + #[must_use] pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self { let validity = self .validity @@ -89,6 +131,7 @@ impl FixedSizeBinaryArray { /// Sets the validity bitmap on this [`FixedSizeBinaryArray`]. /// # Panic /// This function panics iff `validity.len() != self.len()`. + #[must_use] pub fn with_validity(&self, validity: Option) -> Self { if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) { panic!("validity should be as least as large as the array") @@ -167,12 +210,18 @@ impl FixedSizeBinaryArray { } impl FixedSizeBinaryArray { - pub(crate) fn get_size(data_type: &DataType) -> usize { + pub(crate) fn maybe_get_size(data_type: &DataType) -> Result { match data_type.to_logical_type() { - DataType::FixedSizeBinary(size) => *size, - _ => panic!("Wrong DataType"), + DataType::FixedSizeBinary(size) => Ok(*size), + _ => Err(ArrowError::oos( + "FixedSizeBinaryArray expects DataType::FixedSizeBinary", + )), } } + + pub(crate) fn get_size(data_type: &DataType) -> usize { + Self::maybe_get_size(data_type).unwrap() + } } impl Array for FixedSizeBinaryArray { @@ -211,7 +260,7 @@ impl FixedSizeBinaryArray { pub fn try_from_iter, I: IntoIterator>>( iter: I, size: usize, - ) -> Result { + ) -> Result { MutableFixedSizeBinaryArray::try_from_iter(iter, size).map(|x| x.into()) } diff --git a/src/array/fixed_size_list/mod.rs b/src/array/fixed_size_list/mod.rs index 197bad87d5e..61ac7970f88 100644 --- a/src/array/fixed_size_list/mod.rs +++ b/src/array/fixed_size_list/mod.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use crate::{ bitmap::Bitmap, datatypes::{DataType, Field}, + error::ArrowError, }; use super::{new_empty_array, new_null_array, Array}; @@ -25,6 +26,75 @@ pub struct FixedSizeListArray { } impl FixedSizeListArray { + /// Creates a new [`FixedSizeListArray`]. + /// + /// # Errors + /// This function returns an error iff: + /// * The `data_type`'s physical type is not [`crate::datatypes::PhysicalType::FixedSizeList`] + /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. + /// * The length of `values` is not a multiple of `size` in `data_type` + /// * the validity's length is not equal to `values.len() / size`. + pub fn try_new( + data_type: DataType, + values: Arc, + validity: Option, + ) -> Result { + let (child, size) = Self::try_child_and_size(&data_type)?; + + let child_data_type = &child.data_type; + let values_data_type = values.data_type(); + if child_data_type != values_data_type { + return Err(ArrowError::oos( + format!("FixedSizeListArray's child's DataType must match. 
However, the expected DataType is {child_data_type:?} while it got {values_data_type:?}."), + )); + } + + if values.len() % size != 0 { + return Err(ArrowError::oos(format!( + "values (of len {}) must be a multiple of size ({}) in FixedSizeListArray.", + values.len(), + size + ))); + } + let len = values.len() / size; + + if validity + .as_ref() + .map_or(false, |validity| validity.len() != len) + { + return Err(ArrowError::oos( + "validity mask length must be equal to the number of values divided by size", + )); + } + + Ok(Self { + size, + data_type, + values, + validity, + }) + } + + /// Creates a new [`FixedSizeListArray`]. + /// # Panics + /// This function panics iff: + /// * The `data_type`'s physical type is not [`crate::datatypes::PhysicalType::FixedSizeList`] + /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. + /// * The length of `values` is not a multiple of `size` in `data_type` + /// * the validity's length is not equal to `values.len() / size`. + pub fn new(data_type: DataType, values: Arc, validity: Option) -> Self { + Self::try_new(data_type, values, validity).unwrap() + } + + /// Alias for `new` + pub fn from_data( + data_type: DataType, + values: Arc, + validity: Option, + ) -> Self { + Self::new(data_type, values, validity) + } + /// Returns a new empty [`FixedSizeListArray`]. pub fn new_empty(data_type: DataType) -> Self { let values = @@ -41,34 +111,16 @@ impl FixedSizeListArray { .into(); Self::from_data(data_type, values, Some(Bitmap::new_zeroed(length))) } +} - /// Returns a [`FixedSizeListArray`]. - pub fn from_data( - data_type: DataType, - values: Arc, - validity: Option, - ) -> Self { - let (_, size) = Self::get_child_and_size(&data_type); - - assert_eq!(values.len() % size, 0); - - if let Some(ref validity) = validity { - assert_eq!(values.len() / size, validity.len()); - } - - Self { - size, - data_type, - values, - validity, - } - } - +// must use +impl FixedSizeListArray { /// Returns a slice of this [`FixedSizeListArray`]. /// # Implementation /// This operation is `O(1)`. /// # Panics /// panics iff `offset + length > self.len()` + #[must_use] pub fn slice(&self, offset: usize, length: usize) -> Self { assert!( offset + length <= self.len(), @@ -82,6 +134,7 @@ impl FixedSizeListArray { /// This operation is `O(1)`. /// # Safety /// The caller must ensure that `offset + length <= self.len()`. + #[must_use] pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self { let validity = self .validity @@ -103,6 +156,7 @@ impl FixedSizeListArray { /// Sets the validity bitmap on this [`FixedSizeListArray`]. /// # Panic /// This function panics iff `validity.len() != self.len()`. 
+ #[must_use] pub fn with_validity(&self, validity: Option) -> Self { if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) { panic!("validity should be as least as large as the array") @@ -152,13 +206,19 @@ impl FixedSizeListArray { } impl FixedSizeListArray { - pub(crate) fn get_child_and_size(data_type: &DataType) -> (&Field, usize) { + pub(crate) fn try_child_and_size(data_type: &DataType) -> Result<(&Field, usize), ArrowError> { match data_type.to_logical_type() { - DataType::FixedSizeList(child, size) => (child.as_ref(), *size as usize), - _ => panic!("FixedSizeListArray expects DataType::FixedSizeList"), + DataType::FixedSizeList(child, size) => Ok((child.as_ref(), *size as usize)), + _ => Err(ArrowError::oos( + "FixedSizeListArray expects DataType::FixedSizeList", + )), } } + pub(crate) fn get_child_and_size(data_type: &DataType) -> (&Field, usize) { + Self::try_child_and_size(data_type).unwrap() + } + /// Returns a [`DataType`] consistent with [`FixedSizeListArray`]. pub fn default_datatype(data_type: DataType, size: usize) -> DataType { let field = Box::new(Field::new("item", data_type, true)); diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index 4da1c2835af..b34ce2b0c32 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -4,9 +4,14 @@ use crate::{ bitmap::Bitmap, buffer::Buffer, datatypes::{DataType, Field}, + error::ArrowError, }; -use super::{new_empty_array, specification::check_offsets, Array, Offset}; +use super::{ + new_empty_array, + specification::{try_check_offsets, try_check_offsets_bounds}, + Array, Offset, +}; mod ffi; pub(super) mod fmt; @@ -25,58 +30,171 @@ pub struct ListArray { } impl ListArray { + /// Creates a new [`ListArray`]. + /// + /// # Errors + /// This function returns an error iff: + /// * the offsets are not monotonically increasing + /// * The last offset is not equal to the values' length. + /// * the validity's length is not equal to `offsets.len() - 1`. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. + /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. + /// # Implementation + /// This function is `O(N)` - checking monotinicity is `O(N)` + pub fn try_new( + data_type: DataType, + offsets: Buffer, + values: Arc, + validity: Option, + ) -> Result { + try_check_offsets(&offsets, values.len())?; + + if validity + .as_ref() + .map_or(false, |validity| validity.len() != offsets.len() - 1) + { + return Err(ArrowError::oos( + "validity mask length must match the number of values", + )); + } + + let child_data_type = Self::try_get_child(&data_type)?.data_type(); + let values_data_type = values.data_type(); + if child_data_type != values_data_type { + return Err(ArrowError::oos( + format!("ListArray's child's DataType must match. However, the expected DataType is {child_data_type:?} while it got {values_data_type:?}."), + )); + } + + Ok(Self { + data_type, + offsets, + values, + validity, + }) + } + + /// Creates a new [`ListArray`]. + /// + /// # Panics + /// This function panics iff: + /// * the offsets are not monotonically increasing + /// * The last offset is not equal to the values' length. + /// * the validity's length is not equal to `offsets.len() - 1`. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. 
+ /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. + /// # Implementation + /// This function is `O(N)` - checking monotinicity is `O(N)` + pub fn new( + data_type: DataType, + offsets: Buffer, + values: Arc, + validity: Option, + ) -> Self { + Self::try_new(data_type, offsets, values, validity).unwrap() + } + + /// Alias of `new` + pub fn from_data( + data_type: DataType, + offsets: Buffer, + values: Arc, + validity: Option, + ) -> Self { + Self::new(data_type, offsets, values, validity) + } + /// Returns a new empty [`ListArray`]. pub fn new_empty(data_type: DataType) -> Self { let values = new_empty_array(Self::get_child_type(&data_type).clone()).into(); - Self::from_data(data_type, Buffer::from(vec![O::zero()]), values, None) + Self::new(data_type, Buffer::from(vec![O::zero()]), values, None) } /// Returns a new null [`ListArray`]. #[inline] pub fn new_null(data_type: DataType, length: usize) -> Self { let child = Self::get_child_type(&data_type).clone(); - Self::from_data( + Self::new( data_type, Buffer::new_zeroed(length + 1), new_empty_array(child).into(), Some(Bitmap::new_zeroed(length)), ) } +} - /// Returns a new [`ListArray`]. - /// # Panic - /// This function panics iff: - /// * The `data_type`'s physical type is not consistent with the offset `O`. - /// * The `offsets` and `values` are inconsistent - /// * The validity is not `None` and its length is different from `offsets.len() - 1`. - pub fn from_data( +// unsafe construtors +impl ListArray { + /// Creates a new [`ListArray`]. + /// + /// # Errors + /// This function returns an error iff: + /// * The last offset is not equal to the values' length. + /// * the validity's length is not equal to `offsets.len() - 1`. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. + /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. + /// # Safety + /// This function is unsafe iff: + /// * the offsets are not monotonically increasing + /// # Implementation + /// This function is `O(1)` + pub unsafe fn try_new_unchecked( data_type: DataType, offsets: Buffer, values: Arc, validity: Option, - ) -> Self { - check_offsets(&offsets, values.len()); + ) -> Result { + try_check_offsets_bounds(&offsets, values.len())?; - if let Some(ref validity) = validity { - assert_eq!(offsets.len() - 1, validity.len()); + if validity + .as_ref() + .map_or(false, |validity| validity.len() != offsets.len() - 1) + { + return Err(ArrowError::oos( + "validity mask length must match the number of values", + )); } - // validate data_type - let child_data_type = Self::get_child_type(&data_type); - assert_eq!( - child_data_type, - values.data_type(), - "The child's datatype must match the inner type of the \'data_type\'" - ); + let child_data_type = Self::try_get_child(&data_type)?.data_type(); + let values_data_type = values.data_type(); + if child_data_type != values_data_type { + return Err(ArrowError::oos( + format!("ListArray's child's DataType must match. However, the expected DataType is {child_data_type:?} while it got {values_data_type:?}."), + )); + } - Self { + Ok(Self { data_type, offsets, values, validity, - } + }) + } + + /// Creates a new [`ListArray`]. + /// + /// # Panics + /// This function panics iff: + /// * The last offset is not equal to the values' length. + /// * the validity's length is not equal to `offsets.len() - 1`. 
+ /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. + /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. + /// # Safety + /// This function is unsafe iff: + /// * the offsets are not monotonically increasing + /// # Implementation + /// This function is `O(1)` + pub unsafe fn new_unchecked( + data_type: DataType, + offsets: Buffer, + values: Arc, + validity: Option, + ) -> Self { + Self::try_new_unchecked(data_type, offsets, values, validity).unwrap() } +} +impl ListArray { /// Returns a slice of this [`ListArray`]. /// # Panics /// panics iff `offset + length >= self.len()` @@ -185,15 +303,24 @@ impl ListArray { /// # Panics /// Panics iff the logical type is not consistent with this struct. pub fn get_child_field(data_type: &DataType) -> &Field { + Self::try_get_child(data_type).unwrap() + } + + /// Returns a the inner [`Field`] + /// # Errors + /// Panics iff the logical type is not consistent with this struct. + fn try_get_child(data_type: &DataType) -> Result<&Field, ArrowError> { if O::is_large() { match data_type.to_logical_type() { - DataType::LargeList(child) => child.as_ref(), - _ => panic!("ListArray expects DataType::List or DataType::LargeList"), + DataType::LargeList(child) => Ok(child.as_ref()), + _ => Err(ArrowError::oos( + "ListArray expects DataType::LargeList", + )), } } else { match data_type.to_logical_type() { - DataType::List(child) => child.as_ref(), - _ => panic!("ListArray expects DataType::List or DataType::List"), + DataType::List(child) => Ok(child.as_ref()), + _ => Err(ArrowError::oos("ListArray expects DataType::List")), } } } diff --git a/src/array/map/mod.rs b/src/array/map/mod.rs index dfac80b6f0e..a8224051b41 100644 --- a/src/array/map/mod.rs +++ b/src/array/map/mod.rs @@ -4,9 +4,10 @@ use crate::{ bitmap::Bitmap, buffer::Buffer, datatypes::{DataType, Field}, + error::ArrowError, }; -use super::{new_empty_array, specification::check_offsets, Array}; +use super::{new_empty_array, specification::try_check_offsets, Array}; mod ffi; mod iterator; @@ -24,12 +25,80 @@ pub struct MapArray { } impl MapArray { - pub(crate) fn get_field(datatype: &DataType) -> &Field { - if let DataType::Map(field, _) = datatype.to_logical_type() { - field.as_ref() + /// Returns a new [`MapArray`]. + /// # Errors + /// This function errors iff: + /// * the offsets are not monotonically increasing + /// * The last offset is not equal to the field' length + /// * The `data_type`'s physical type is not [`crate::datatypes::PhysicalType::Map`] + /// * The fields' `data_type` is not equal to the inner field of `data_type` + /// * The validity is not `None` and its length is different from `offsets.len() - 1`. 
+ pub fn try_new( + data_type: DataType, + offsets: Buffer, + field: Arc, + validity: Option, + ) -> Result { + try_check_offsets(&offsets, field.len())?; + + let inner_field = Self::try_get_field(&data_type)?; + if let DataType::Struct(inner) = inner_field.data_type() { + if inner.len() != 2 { + return Err(ArrowError::InvalidArgumentError( + "MapArray's inner `Struct` must have 2 fields (keys and maps)".to_string(), + )); + } } else { - panic!("MapArray expects `DataType::Map` logical type") + return Err(ArrowError::InvalidArgumentError( + "MapArray expects `DataType::Struct` as its inner logical type".to_string(), + )); } + if field.data_type() != inner_field.data_type() { + return Err(ArrowError::InvalidArgumentError( + "MapArray expects `field.data_type` to match its inner DataType".to_string(), + )); + } + + if validity + .as_ref() + .map_or(false, |validity| validity.len() != offsets.len() - 1) + { + return Err(ArrowError::oos( + "validity mask length must match the number of values", + )); + } + + Ok(Self { + data_type, + field, + offsets, + validity, + }) + } + + /// Creates a new [`MapArray`]. + /// # Panics + /// * the offsets are not monotonically increasing + /// * The last offset is not equal to the field' length. + /// * The `data_type`'s physical type is not [`crate::datatypes::PhysicalType::Map`], + /// * The validity is not `None` and its length is different from `offsets.len() - 1`. + pub fn new( + data_type: DataType, + offsets: Buffer, + field: Arc, + validity: Option, + ) -> Self { + Self::try_new(data_type, offsets, field, validity).unwrap() + } + + /// Alias for `new` + pub fn from_data( + data_type: DataType, + offsets: Buffer, + field: Arc, + validity: Option, + ) -> Self { + Self::new(data_type, offsets, field, validity) } /// Returns a new null [`MapArray`] of `length`. @@ -48,41 +117,9 @@ impl MapArray { let field = new_empty_array(Self::get_field(&data_type).data_type().clone()).into(); Self::from_data(data_type, Buffer::from(vec![0i32]), field, None) } +} - /// Returns a new [`MapArray`]. - /// # Panic - /// This function panics iff: - /// * The `data_type`'s physical type is not consistent with [`MapArray`], - /// * The `offsets` and `field` are inconsistent - /// * The validity is not `None` and its length is different from `offsets.len() - 1`. - pub fn from_data( - data_type: DataType, - offsets: Buffer, - field: Arc, - validity: Option, - ) -> Self { - check_offsets(&offsets, field.len()); - - if let Some(ref validity) = validity { - assert_eq!(offsets.len() - 1, validity.len()); - } - - if let DataType::Struct(inner) = Self::get_field(&data_type).data_type() { - if inner.len() != 2 { - panic!("MapArray expects its inner `Struct` to have 2 fields (keys and maps)") - } - } else { - panic!("MapArray expects `DataType::Struct` as its inner logical type") - } - - Self { - data_type, - field, - offsets, - validity, - } - } - +impl MapArray { /// Returns a slice of this [`MapArray`]. 
     /// # Panics
     /// panics iff `offset + length >= self.len()`
@@ -110,6 +147,20 @@ impl MapArray {
             validity,
         }
     }
+
+    pub(crate) fn try_get_field(data_type: &DataType) -> Result<&Field, ArrowError> {
+        if let DataType::Map(field, _) = data_type.to_logical_type() {
+            Ok(field.as_ref())
+        } else {
+            Err(ArrowError::oos(
+                "The data_type's logical type must be DataType::Map",
+            ))
+        }
+    }
+
+    pub(crate) fn get_field(data_type: &DataType) -> &Field {
+        Self::try_get_field(data_type).unwrap()
+    }
 }
 
 // Accessors
diff --git a/src/array/null.rs b/src/array/null.rs
index 3f7d89dc22a..a93cec49d42 100644
--- a/src/array/null.rs
+++ b/src/array/null.rs
@@ -2,7 +2,8 @@ use crate::{bitmap::Bitmap, datatypes::DataType};
 use crate::{
     array::{Array, FromFfi, ToFfi},
-    error::Result,
+    datatypes::PhysicalType,
+    error::ArrowError,
     ffi,
 };
 
@@ -14,6 +15,33 @@ pub struct NullArray {
 }
 
 impl NullArray {
+    /// Returns a new [`NullArray`].
+    /// # Errors
+    /// This function errors iff:
+    /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to [`crate::datatypes::PhysicalType::Null`].
+    pub fn try_new(data_type: DataType, length: usize) -> Result<Self, ArrowError> {
+        if data_type.to_physical_type() != PhysicalType::Null {
+            return Err(ArrowError::oos(
+                "NullArray can only be initialized with a DataType whose physical type is Null",
+            ));
+        }
+
+        Ok(Self { data_type, length })
+    }
+
+    /// Returns a new [`NullArray`].
+    /// # Panics
+    /// This function panics iff:
+    /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to [`crate::datatypes::PhysicalType::Null`].
+    pub fn new(data_type: DataType, length: usize) -> Self {
+        Self::try_new(data_type, length).unwrap()
+    }
+
+    /// Alias for `new`
+    pub fn from_data(data_type: DataType, length: usize) -> Self {
+        Self::new(data_type, length)
+    }
+
     /// Returns a new empty [`NullArray`].
     pub fn new_empty(data_type: DataType) -> Self {
         Self::from_data(data_type, 0)
@@ -23,12 +51,9 @@ impl NullArray {
     pub fn new_null(data_type: DataType, length: usize) -> Self {
         Self::from_data(data_type, length)
     }
+}
 
-    /// Returns a new [`NullArray`].
-    pub fn from_data(data_type: DataType, length: usize) -> Self {
-        Self { data_type, length }
-    }
-
+impl NullArray {
     /// Returns a slice of the [`NullArray`].
     pub fn slice(&self, _offset: usize, length: usize) -> Self {
         Self {
@@ -66,9 +91,11 @@ impl Array for NullArray {
     fn slice(&self, offset: usize, length: usize) -> Box<dyn Array> {
         Box::new(self.slice(offset, length))
     }
+
     unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Box<dyn Array> {
         Box::new(self.slice(offset, length))
     }
+
     fn with_validity(&self, _: Option<Bitmap>) -> Box<dyn Array> {
         panic!("cannot set validity of a null array")
     }
@@ -95,7 +122,7 @@ unsafe impl ToFfi for NullArray {
 }
 
 impl<A: ffi::ArrowArrayRef> FromFfi<A> for NullArray {
-    unsafe fn try_from_ffi(array: A) -> Result<Self> {
+    unsafe fn try_from_ffi(array: A) -> Result<Self, ArrowError> {
         let data_type = array.data_type().clone();
         Ok(Self::from_data(data_type, array.array().len()))
     }
diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs
index 2f9df8d66fe..a71de921d27 100644
--- a/src/array/primitive/mod.rs
+++ b/src/array/primitive/mod.rs
@@ -39,51 +39,76 @@ pub struct PrimitiveArray<T: NativeType> {
 }
 
 impl<T: NativeType> PrimitiveArray<T> {
+    /// The canonical method to create a [`PrimitiveArray`].
+    /// # Errors
+    /// This function errors iff:
+    /// * The validity is not `None` and its length is different from `values`'s length
+    /// * The `data_type`'s [`PhysicalType`] is not equal to [`PhysicalType::Primitive`].
+    pub fn try_new(
+        data_type: DataType,
+        values: Buffer<T>,
+        validity: Option<Bitmap>,
+    ) -> Result<Self, ArrowError> {
+        if validity
+            .as_ref()
+            .map_or(false, |validity| validity.len() != values.len())
+        {
+            return Err(ArrowError::oos(
+                "validity mask length must match the number of values",
+            ));
+        }
+
+        if data_type.to_physical_type() != PhysicalType::Primitive(T::PRIMITIVE) {
+            return Err(ArrowError::oos(
+                "PrimitiveArray can only be initialized with a DataType whose physical type is Primitive",
+            ));
+        }
+
+        Ok(Self {
+            data_type,
+            values,
+            validity,
+        })
+    }
+
+    /// The canonical method to create a [`PrimitiveArray`].
+    /// # Panics
+    /// This function panics iff:
+    /// * The validity is not `None` and its length is different from `values`'s length
+    /// * The `data_type`'s [`PhysicalType`] is not equal to [`PhysicalType::Primitive`].
+    pub fn new(data_type: DataType, values: Buffer<T>, validity: Option<Bitmap>) -> Self {
+        Self::try_new(data_type, values, validity).unwrap()
+    }
+
+    /// Alias for `new`
+    pub fn from_data(data_type: DataType, values: Buffer<T>, validity: Option<Bitmap>) -> Self {
+        Self::new(data_type, values, validity)
+    }
+
     /// Returns a new empty [`PrimitiveArray`].
     pub fn new_empty(data_type: DataType) -> Self {
-        Self::from_data(data_type, Buffer::new(), None)
+        Self::new(data_type, Buffer::new(), None)
     }
 
     /// Returns a new [`PrimitiveArray`] whose all slots are null / `None`.
     #[inline]
     pub fn new_null(data_type: DataType, length: usize) -> Self {
-        Self::from_data(
+        Self::new(
             data_type,
             Buffer::new_zeroed(length),
             Some(Bitmap::new_zeroed(length)),
         )
     }
+}
 
-    /// The canonical method to create a [`PrimitiveArray`] out of low-end APIs.
-    /// # Panics
-    /// This function panics iff:
-    /// * `data_type` is not supported by the physical type
-    /// * The validity is not `None` and its length is different from the `values`'s length
-    pub fn from_data(data_type: DataType, values: Buffer<T>, validity: Option<Bitmap>) -> Self {
-        if !data_type.to_physical_type().eq_primitive(T::PRIMITIVE) {
-            Err(ArrowError::InvalidArgumentError(format!(
-                "Type {} does not support logical type {:?}",
-                std::any::type_name::<T>(),
-                data_type
-            )))
-            .unwrap()
-        }
-        if let Some(ref validity) = validity {
-            assert_eq!(values.len(), validity.len());
-        }
-        Self {
-            data_type,
-            values,
-            validity,
-        }
-    }
-
+impl<T: NativeType> PrimitiveArray<T> {
     /// Returns a slice of this [`PrimitiveArray`].
     /// # Implementation
     /// This operation is `O(1)` as it amounts to increase two ref counts.
     /// # Panic
     /// This function panics iff `offset + length >= self.len()`.
     #[inline]
+    #[must_use]
     pub fn slice(&self, offset: usize, length: usize) -> Self {
         assert!(
             offset + length <= self.len(),
@@ -98,6 +123,7 @@ impl<T: NativeType> PrimitiveArray<T> {
     /// # Safety
     /// The caller must ensure that `offset + length <= self.len()`.
     #[inline]
+    #[must_use]
     pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self {
         let validity = self
             .validity
@@ -113,6 +139,7 @@ impl<T: NativeType> PrimitiveArray<T> {
     /// Sets the validity bitmap on this [`PrimitiveArray`].
     /// # Panics
     /// This function panics iff `validity.len() != self.len()`.
+ #[must_use] pub fn with_validity(&self, validity: Option) -> Self { if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) { panic!("validity should be as least as large as the array") @@ -191,13 +218,13 @@ impl PrimitiveArray { if let Some(bitmap) = self.validity { match bitmap.into_mut() { - Left(bitmap) => Left(PrimitiveArray::from_data( + Left(bitmap) => Left(PrimitiveArray::new( self.data_type, self.values, Some(bitmap), )), Right(mutable_bitmap) => match self.values.into_mut() { - Left(buffer) => Left(PrimitiveArray::from_data( + Left(buffer) => Left(PrimitiveArray::new( self.data_type, buffer, Some(mutable_bitmap.into()), @@ -211,7 +238,7 @@ impl PrimitiveArray { } } else { match self.values.into_mut() { - Left(buffer) => Left(PrimitiveArray::from_data(self.data_type, buffer, None)), + Left(values) => Left(PrimitiveArray::new(self.data_type, values, None)), Right(values) => Right(MutablePrimitiveArray::from_data( self.data_type, values, diff --git a/src/array/specification.rs b/src/array/specification.rs index 530daed5300..93171762f56 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -1,6 +1,18 @@ use crate::error::{ArrowError, Result}; use crate::types::Offset; +pub fn try_check_offsets_bounds(offsets: &[O], values_len: usize) -> Result { + if let Some(last_offset) = offsets.last() { + if last_offset.to_usize() > values_len { + Err(ArrowError::oos("offsets must not exceed the values length")) + } else { + Ok(last_offset.to_usize()) + } + } else { + Err(ArrowError::oos("offsets must have at least one element")) + } +} + pub fn check_offsets_minimal(offsets: &[O], values_len: usize) -> usize { assert!( !offsets.is_empty(), @@ -58,9 +70,11 @@ pub fn try_check_offsets_and_utf8(offsets: &[O], values: &[u8]) -> Re // check bounds if offsets .last() - .map_or(false, |last| last.to_usize() > values.len()) + .map_or(true, |last| last.to_usize() > values.len()) { - return Err(ArrowError::oos("offsets must not exceed values length")); + return Err(ArrowError::oos( + "offsets must have at least one element and must not exceed values length", + )); }; Ok(()) @@ -81,9 +95,11 @@ pub fn try_check_offsets(offsets: &[O], values_len: usize) -> Result< Err(ArrowError::oos("offsets must be monotonically increasing")) } else if offsets .last() - .map_or(false, |last| last.to_usize() > values_len) + .map_or(true, |last| last.to_usize() > values_len) { - Err(ArrowError::oos("offsets must not exceed values length")) + Err(ArrowError::oos( + "offsets must have at least one element and must not exceed values length", + )) } else { Ok(()) } diff --git a/src/array/struct_/mod.rs b/src/array/struct_/mod.rs index d4ae90e3fb4..dc80cd906a4 100644 --- a/src/array/struct_/mod.rs +++ b/src/array/struct_/mod.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use crate::{ bitmap::Bitmap, datatypes::{DataType, Field}, + error::ArrowError, }; use super::{new_empty_array, new_null_array, Array}; @@ -26,7 +27,7 @@ mod iterator; /// Field::new("c", DataType::Int32, false), /// ]; /// -/// let array = StructArray::from_data(DataType::Struct(fields), vec![boolean, int], None); +/// let array = StructArray::new(DataType::Struct(fields), vec![boolean, int], None); /// ``` #[derive(Clone)] pub struct StructArray { @@ -36,6 +37,101 @@ pub struct StructArray { } impl StructArray { + /// Returns a new [`StructArray`]. + /// # Errors + /// This function errors iff: + /// * `data_type`'s physical type is not [`crate::datatypes::PhysicalType::Struct`]. 
+ /// * the children of `data_type` are empty + /// * the values's len is different from children's length + /// * any of the values's data type is different from its corresponding children' data type + /// * any element of values has a different length than the first element + /// * the validity's length is not equal to the length of the first element + pub fn try_new( + data_type: DataType, + values: Vec>, + validity: Option, + ) -> Result { + let fields = Self::try_get_fields(&data_type)?; + if fields.is_empty() { + return Err(ArrowError::oos( + "A StructArray must contain at least one field", + )); + } + if fields.len() != values.len() { + return Err(ArrowError::oos( + "A StructArray must a number of fields in its DataType equal to the number of child values", + )); + } + + fields + .iter().map(|a| &a.data_type) + .zip(values.iter().map(|a| a.data_type())) + .enumerate() + .try_for_each(|(index, (data_type, child))| { + if data_type != child { + Err(ArrowError::oos(format!( + "The children DataTypes of a StructArray must equal the children data types. + However, the field {index} has data type {data_type:?} but the value has data type {child:?}" + ))) + } else { + Ok(()) + } + })?; + + let len = values[0].len(); + values + .iter() + .map(|a| a.len()) + .enumerate() + .try_for_each(|(index, a_len)| { + if a_len != len { + Err(ArrowError::oos(format!( + "The children DataTypes of a StructArray must equal the children data types. + However, the values {index} has a length of {a_len}, which is different from values 0, {len}." + ))) + } else { + Ok(()) + } + })?; + + if validity + .as_ref() + .map_or(false, |validity| validity.len() != len) + { + return Err(ArrowError::oos( + "The validity length of a StructArray must match its number of elements", + )); + } + + Ok(Self { + data_type, + values, + validity, + }) + } + + /// Returns a new [`StructArray`] + /// # Panics + /// This function panics iff: + /// * `data_type`'s physical type is not [`crate::datatypes::PhysicalType::Struct`]. + /// * the children of `data_type` are empty + /// * the values's len is different from children's length + /// * any of the values's data type is different from its corresponding children' data type + /// * any element of values has a different length than the first element + /// * the validity's length is not equal to the length of the first element + pub fn new(data_type: DataType, values: Vec>, validity: Option) -> Self { + Self::try_new(data_type, values, validity).unwrap() + } + + /// Alias for `new` + pub fn from_data( + data_type: DataType, + values: Vec>, + validity: Option, + ) -> Self { + Self::new(data_type, values, validity) + } + /// Creates an empty [`StructArray`]. pub fn new_empty(data_type: DataType) -> Self { if let DataType::Struct(fields) = &data_type { @@ -43,7 +139,7 @@ impl StructArray { .iter() .map(|field| new_empty_array(field.data_type().clone()).into()) .collect(); - Self::from_data(data_type, values, None) + Self::new(data_type, values, None) } else { panic!("StructArray must be initialized with DataType::Struct"); } @@ -56,44 +152,17 @@ impl StructArray { .iter() .map(|field| new_null_array(field.data_type().clone(), length).into()) .collect(); - Self::from_data(data_type, values, Some(Bitmap::new_zeroed(length))) + Self::new(data_type, values, Some(Bitmap::new_zeroed(length))) } else { panic!("StructArray must be initialized with DataType::Struct"); } } +} - /// Canonical method to create a [`StructArray`]. 
- /// # Panics - /// * fields are empty - /// * values's len is different from Fields' length. - /// * any element of values has a different length than the first element. - pub fn from_data( - data_type: DataType, - values: Vec>, - validity: Option, - ) -> Self { - let fields = Self::get_fields(&data_type); - assert!(!fields.is_empty()); - assert_eq!(fields.len(), values.len()); - assert!( - fields - .iter() - .map(|f| f.data_type()) - .eq(values.iter().map(|a| a.data_type())), - "The fields' datatypes must equal the values datatypes" - ); - assert!(values.iter().all(|x| x.len() == values[0].len())); - if let Some(ref validity) = validity { - assert_eq!(values[0].len(), validity.len()); - } - Self { - data_type, - values, - validity, - } - } - +// must use +impl StructArray { /// Deconstructs the [`StructArray`] into its individual components. + #[must_use] pub fn into_data(self) -> (Vec, Vec>, Option) { let Self { data_type, @@ -113,6 +182,7 @@ impl StructArray { /// * `offset + length` must be smaller than `self.len()`. /// # Implementation /// This operation is `O(F)` where `F` is the number of fields. + #[must_use] pub fn slice(&self, offset: usize, length: usize) -> Self { assert!( offset + length <= self.len(), @@ -126,6 +196,7 @@ impl StructArray { /// This operation is `O(F)` where `F` is the number of fields. /// # Safety /// The caller must ensure that `offset + length <= self.len()`. + #[must_use] pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self { let validity = self .validity @@ -145,6 +216,7 @@ impl StructArray { /// Sets the validity bitmap on this [`StructArray`]. /// # Panic /// This function panics iff `validity.len() != self.len()`. + #[must_use] pub fn with_validity(&self, validity: Option) -> Self { if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) { panic!("validity should be as least as large as the array") @@ -181,13 +253,19 @@ impl StructArray { impl StructArray { /// Returns the fields the `DataType::Struct`. - pub fn get_fields(data_type: &DataType) -> &[Field] { - match data_type { - DataType::Struct(fields) => fields, - DataType::Extension(_, inner, _) => Self::get_fields(inner), - _ => panic!("Wrong datatype passed to Struct."), + pub(crate) fn try_get_fields(data_type: &DataType) -> Result<&[Field], ArrowError> { + match data_type.to_logical_type() { + DataType::Struct(fields) => Ok(fields), + _ => Err(ArrowError::oos( + "Struct array must be created with a DataType whose physical type is Struct", + )), } } + + /// Returns the fields the `DataType::Struct`. + pub fn get_fields(data_type: &DataType) -> &[Field] { + Self::try_get_fields(data_type).unwrap() + } } impl Array for StructArray { diff --git a/src/array/union/mod.rs b/src/array/union/mod.rs index 4cbc51d6edf..351f88f56cb 100644 --- a/src/array/union/mod.rs +++ b/src/array/union/mod.rs @@ -4,6 +4,7 @@ use crate::{ bitmap::Bitmap, buffer::Buffer, datatypes::{DataType, Field, UnionMode}, + error::ArrowError, scalar::{new_scalar, Scalar}, }; @@ -14,6 +15,7 @@ pub(super) mod fmt; mod iterator; type FieldEntry = (usize, Arc); +type UnionComponents<'a> = (&'a [Field], Option<&'a [i32]>, UnionMode); /// [`UnionArray`] represents an array whose each slot can contain different values. /// @@ -37,6 +39,94 @@ pub struct UnionArray { } impl UnionArray { + /// Returns a new [`UnionArray`]. + /// # Errors + /// This function errors iff: + /// * `data_type`'s physical type is not [`crate::datatypes::PhysicalType::Union`]. 
+ /// * the fields's len is different from the `data_type`'s children's length + /// * any of the values's data type is different from its corresponding children' data type + pub fn try_new( + data_type: DataType, + types: Buffer, + fields: Vec>, + offsets: Option>, + ) -> Result { + let (f, ids, mode) = Self::try_get_all(&data_type)?; + + if f.len() != fields.len() { + return Err(ArrowError::oos( + "The number of `fields` must equal the number of children fields in DataType::Union", + )); + }; + + f + .iter().map(|a| a.data_type()) + .zip(fields.iter().map(|a| a.data_type())) + .enumerate() + .try_for_each(|(index, (data_type, child))| { + if data_type != child { + Err(ArrowError::oos(format!( + "The children DataTypes of a UnionArray must equal the children data types. + However, the field {index} has data type {data_type:?} but the value has data type {child:?}" + ))) + } else { + Ok(()) + } + })?; + + if offsets.is_none() != mode.is_sparse() { + return Err(ArrowError::oos( + "The offsets must be set when the Union is dense and vice-versa", + )); + } + + let fields_hash = ids.as_ref().map(|ids| { + ids.iter() + .map(|x| *x as i8) + .enumerate() + .zip(fields.iter().cloned()) + .map(|((i, type_), field)| (type_, (i, field))) + .collect() + }); + + // not validated: + // * `offsets` is valid + // * max id < fields.len() + Ok(Self { + data_type, + fields_hash, + fields, + offsets, + types, + offset: 0, + }) + } + + /// Returns a new [`UnionArray`]. + /// # Panics + /// This function panics iff: + /// * `data_type`'s physical type is not [`crate::datatypes::PhysicalType::Union`]. + /// * the fields's len is different from the `data_type`'s children's length + /// * any of the values's data type is different from its corresponding children' data type + pub fn new( + data_type: DataType, + types: Buffer, + fields: Vec>, + offsets: Option>, + ) -> Self { + Self::try_new(data_type, types, fields, offsets).unwrap() + } + + /// Alias for `new` + pub fn from_data( + data_type: DataType, + types: Buffer, + fields: Vec>, + offsets: Option>, + ) -> Self { + Self::new(data_type, types, fields, offsets) + } + /// Creates a new null [`UnionArray`]. pub fn new_null(data_type: DataType, length: usize) -> Self { if let DataType::Union(f, _, mode) = &data_type { @@ -86,51 +176,9 @@ impl UnionArray { panic!("Union struct must be created with the corresponding Union DataType") } } +} - /// Creates a new [`UnionArray`]. - pub fn from_data( - data_type: DataType, - types: Buffer, - fields: Vec>, - offsets: Option>, - ) -> Self { - let (f, ids, mode) = Self::get_all(&data_type); - - if f.len() != fields.len() { - panic!("The number of `fields` must equal the number of fields in the Union DataType") - }; - let same_data_types = f - .iter() - .zip(fields.iter()) - .all(|(f, array)| f.data_type() == array.data_type()); - if !same_data_types { - panic!("All fields' datatype in the union must equal the datatypes on the fields.") - } - if offsets.is_none() != mode.is_sparse() { - panic!("Sparsness flag must equal to noness of offsets in UnionArray") - } - let fields_hash = ids.as_ref().map(|ids| { - ids.iter() - .map(|x| *x as i8) - .enumerate() - .zip(fields.iter().cloned()) - .map(|((i, type_), field)| (type_, (i, field))) - .collect() - }); - - // not validated: - // * `offsets` is valid - // * max id < fields.len() - Self { - data_type, - fields_hash, - fields, - offsets, - types, - offset: 0, - } - } - +impl UnionArray { /// Returns a slice of this [`UnionArray`]. 
/// # Implementation /// This operation is `O(F)` where `F` is the number of fields. @@ -254,13 +302,21 @@ impl Array for UnionArray { } impl UnionArray { - fn get_all(data_type: &DataType) -> (&[Field], Option<&[i32]>, UnionMode) { + fn try_get_all(data_type: &DataType) -> Result { match data_type.to_logical_type() { - DataType::Union(fields, ids, mode) => (fields, ids.as_ref().map(|x| x.as_ref()), *mode), - _ => panic!("Wrong datatype passed to UnionArray."), + DataType::Union(fields, ids, mode) => { + Ok((fields, ids.as_ref().map(|x| x.as_ref()), *mode)) + } + _ => Err(ArrowError::oos( + "The UnionArray requires a logical type of DataType::Union", + )), } } + fn get_all(data_type: &DataType) -> (&[Field], Option<&[i32]>, UnionMode) { + Self::try_get_all(data_type).unwrap() + } + /// Returns all fields from [`DataType::Union`]. /// # Panic /// Panics iff `data_type`'s logical type is not [`DataType::Union`]. diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 5d9220e25dc..a2538dd58a0 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -7,7 +7,7 @@ use crate::{ use either::Either; use super::{ - specification::{check_offsets_minimal, try_check_offsets_and_utf8}, + specification::{try_check_offsets_and_utf8, try_check_offsets_bounds}, Array, GenericBinaryArray, Offset, }; @@ -44,7 +44,78 @@ pub struct Utf8Array { validity: Option, } +// constructors impl Utf8Array { + /// Returns a new [`Utf8Array`]. + /// + /// # Errors + /// This function returns an error iff: + /// * the offsets are not monotonically increasing + /// * The last offset is not equal to the values' length. + /// * the validity's length is not equal to `offsets.len() - 1`. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. + /// * The `values` between two consecutive `offsets` are not valid utf8 + /// # Implementation + /// This function is `O(N)` - checking monotinicity and utf8 is `O(N)` + pub fn try_new( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Result { + try_check_offsets_and_utf8(&offsets, &values)?; + if validity + .as_ref() + .map_or(false, |validity| validity.len() != offsets.len() - 1) + { + return Err(ArrowError::oos( + "validity mask length must match the number of values", + )); + } + + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { + return Err(ArrowError::oos( + "Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8", + )); + } + + Ok(Self { + data_type, + offsets, + values, + validity, + }) + } + + /// Creates a new [`Utf8Array`]. + /// # Panics + /// This function panics iff: + /// * the offsets are not monotonically increasing + /// * The last offset is not equal to the values' length. + /// * the validity's length is not equal to `offsets.len() - 1`. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. + /// * The `values` between two consecutive `offsets` are not valid utf8 + /// # Implementation + /// This function is `O(N)` - checking monotinicity and utf8 is `O(N)` + pub fn new( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self { + Self::try_new(data_type, offsets, values, validity).unwrap() + } + + /// Alias for `new` + pub fn from_data( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self { + Self::new(data_type, offsets, values, validity) + } + /// Returns a new empty [`Utf8Array`]. 
diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs
index 5d9220e25dc..a2538dd58a0 100644
--- a/src/array/utf8/mod.rs
+++ b/src/array/utf8/mod.rs
@@ -7,7 +7,7 @@ use crate::{
 use either::Either;
 
 use super::{
-    specification::{check_offsets_minimal, try_check_offsets_and_utf8},
+    specification::{try_check_offsets_and_utf8, try_check_offsets_bounds},
     Array, GenericBinaryArray, Offset,
 };
 
@@ -44,7 +44,78 @@ pub struct Utf8Array<O: Offset> {
     validity: Option<Bitmap>,
 }
 
+// constructors
 impl<O: Offset> Utf8Array<O> {
+    /// Returns a new [`Utf8Array`].
+    ///
+    /// # Errors
+    /// This function returns an error iff:
+    /// * the offsets are not monotonically increasing
+    /// * The last offset is not equal to the values' length.
+    /// * the validity's length is not equal to `offsets.len() - 1`.
+    /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
+    /// * The `values` between two consecutive `offsets` are not valid utf8
+    /// # Implementation
+    /// This function is `O(N)` - checking monotonicity and utf8 is `O(N)`
+    pub fn try_new(
+        data_type: DataType,
+        offsets: Buffer<O>,
+        values: Buffer<u8>,
+        validity: Option<Bitmap>,
+    ) -> Result<Self> {
+        try_check_offsets_and_utf8(&offsets, &values)?;
+        if validity
+            .as_ref()
+            .map_or(false, |validity| validity.len() != offsets.len() - 1)
+        {
+            return Err(ArrowError::oos(
+                "validity mask length must match the number of values",
+            ));
+        }
+
+        if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
+            return Err(ArrowError::oos(
+                "Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8",
+            ));
+        }
+
+        Ok(Self {
+            data_type,
+            offsets,
+            values,
+            validity,
+        })
+    }
+
+    /// Creates a new [`Utf8Array`].
+    /// # Panics
+    /// This function panics iff:
+    /// * the offsets are not monotonically increasing
+    /// * The last offset is not equal to the values' length.
+    /// * the validity's length is not equal to `offsets.len() - 1`.
+    /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
+    /// * The `values` between two consecutive `offsets` are not valid utf8
+    /// # Implementation
+    /// This function is `O(N)` - checking monotonicity and utf8 is `O(N)`
+    pub fn new(
+        data_type: DataType,
+        offsets: Buffer<O>,
+        values: Buffer<u8>,
+        validity: Option<Bitmap>,
+    ) -> Self {
+        Self::try_new(data_type, offsets, values, validity).unwrap()
+    }
+
+    /// Alias for `new`
+    pub fn from_data(
+        data_type: DataType,
+        offsets: Buffer<O>,
+        values: Buffer<u8>,
+        validity: Option<Bitmap>,
+    ) -> Self {
+        Self::new(data_type, offsets, values, validity)
+    }
+
     /// Returns a new empty [`Utf8Array`].
     #[inline]
     pub fn new_empty(data_type: DataType) -> Self {
@@ -69,36 +140,39 @@ impl<O: Offset> Utf8Array<O> {
         )
     }
 
-    /// The canonical method to create a [`Utf8Array`] out of low-end APIs.
-    /// # Panics
-    /// This function panics iff:
-    /// * The `data_type`'s physical type is not consistent with the offset `O`.
-    /// * The `offsets` and `values` are inconsistent
-    /// * The `values` between `offsets` are utf8 encoded
-    /// * The validity is not `None` and its length is different from `offsets.len() - 1`.
-    pub fn from_data(
-        data_type: DataType,
-        offsets: Buffer<O>,
-        values: Buffer<u8>,
-        validity: Option<Bitmap>,
-    ) -> Self {
-        Utf8Array::try_new(data_type, offsets, values, validity).unwrap()
+    /// Returns the default [`DataType`], `DataType::Utf8` or `DataType::LargeUtf8`
+    pub fn default_data_type() -> DataType {
+        if O::is_large() {
+            DataType::LargeUtf8
+        } else {
+            DataType::Utf8
+        }
     }
+}
 
-    /// The canonical method to create a [`Utf8Array`] out of low-end APIs.
+// unsafe constructors
+impl<O: Offset> Utf8Array<O> {
+    /// Creates a new [`Utf8Array`] without checking for offsets monotonicity nor utf8-validity
     ///
+    /// # Errors
     /// This function returns an error iff:
-    /// * The `data_type`'s physical type is not consistent with the offset `O`.
-    /// * The `offsets` and `values` are inconsistent
-    /// * The `values` between `offsets` are utf8 encoded
-    /// * The validity is not `None` and its length is different from `offsets.len() - 1`.
-    pub fn try_new(
+    /// * The last offset is not equal to the values' length.
+    /// * the validity's length is not equal to `offsets.len() - 1`.
+    /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
+    /// # Safety
+    /// This function is unsound iff:
+    /// * the offsets are not monotonically increasing
+    /// * The `values` between two consecutive `offsets` are not valid utf8
+    /// # Implementation
+    /// This function is `O(1)`
+    pub unsafe fn try_new_unchecked(
         data_type: DataType,
         offsets: Buffer<O>,
         values: Buffer<u8>,
         validity: Option<Bitmap>,
     ) -> Result<Self> {
-        try_check_offsets_and_utf8(&offsets, &values)?;
+        try_check_offsets_bounds(&offsets, values.len())?;
+
         if validity
             .as_ref()
             .map_or(false, |validity| validity.len() != offsets.len() - 1)
@@ -110,7 +184,7 @@ impl<O: Offset> Utf8Array<O> {
 
         if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
             return Err(ArrowError::oos(
-                "Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8",
+                "BinaryArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8",
             ));
         }
 
@@ -122,52 +196,51 @@ impl<O: Offset> Utf8Array<O> {
         })
     }
 
-    /// Returns the default [`DataType`], `DataType::Utf8` or `DataType::LargeUtf8`
-    pub fn default_data_type() -> DataType {
-        if O::is_large() {
-            DataType::LargeUtf8
-        } else {
-            DataType::Utf8
-        }
+    /// Creates a new [`Utf8Array`] without checking for offsets monotonicity.
+    ///
+    /// # Errors
+    /// This function returns an error iff:
+    /// * The last offset is not equal to the values' length.
+    /// * the validity's length is not equal to `offsets.len() - 1`.
+    /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
+    /// # Safety
+    /// This function is unsound iff:
+    /// * the offsets are not monotonically increasing
+    /// * The `values` between two consecutive `offsets` are not valid utf8
+    /// # Implementation
+    /// This function is `O(1)`
+    pub unsafe fn new_unchecked(
+        data_type: DataType,
+        offsets: Buffer<O>,
+        values: Buffer<u8>,
+        validity: Option<Bitmap>,
+    ) -> Self {
+        Self::try_new_unchecked(data_type, offsets, values, validity).unwrap()
     }
 
-    /// The same as [`Utf8Array::from_data`] but does not check for offsets nor utf8 validity.
+    /// Alias for [`new_unchecked`]
     /// # Safety
-    /// * `offsets` MUST be monotonically increasing; and
-    /// * every slice of `values` constructed from `offsets` MUST be valid utf8
-    /// # Panics
-    /// This function panics iff:
-    /// * The `data_type`'s physical type is not consistent with the offset `O`.
-    /// * The last element of `offsets` is different from `values.len()`.
-    /// * The validity is not `None` and its length is different from `offsets.len() - 1`.
+    /// This function is unsafe iff:
+    /// * the offsets are not monotonically increasing
+    /// * The `values` between two consecutive `offsets` are not valid utf8
     pub unsafe fn from_data_unchecked(
         data_type: DataType,
         offsets: Buffer<O>,
         values: Buffer<u8>,
        validity: Option<Bitmap>,
     ) -> Self {
-        check_offsets_minimal(&offsets, values.len());
-        if let Some(ref validity) = validity {
-            assert_eq!(offsets.len() - 1, validity.len());
-        }
-
-        if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
-            panic!("Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8")
-        }
-
-        Self {
-            data_type,
-            offsets,
-            values,
-            validity,
-        }
+        Self::new_unchecked(data_type, offsets, values, validity)
     }
+}
 
+// must use
+impl<O: Offset> Utf8Array<O> {
     /// Returns a slice of this [`Utf8Array`].
     /// # Implementation
     /// This operation is `O(1)` as it amounts to essentially increase two ref counts.
     /// # Panic
     /// This function panics iff `offset + length >= self.len()`.
+    #[must_use]
     pub fn slice(&self, offset: usize, length: usize) -> Self {
         assert!(
             offset + length <= self.len(),
@@ -180,6 +253,7 @@ impl<O: Offset> Utf8Array<O> {
     /// This operation is `O(1)` as it amounts to essentially increase two ref counts.
     /// # Safety
     /// The caller must ensure that `offset + length <= self.len()`.
+    #[must_use]
    pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self {
         let validity = self
             .validity
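Taken together, the hunks above replace `from_data` with a checked `try_new`, a panicking `new`, and `O(1)` `*_unchecked` variants. As a usage sketch only (assuming the crate is consumed as the external `arrow2` crate; the literal offsets and values are made up for illustration), the new constructors can be exercised like this:

```rust
use arrow2::array::Utf8Array;
use arrow2::buffer::Buffer;
use arrow2::datatypes::DataType;
use arrow2::error::Result;

fn main() -> Result<()> {
    // Three values ("hi", "", "arrow") encoded as offsets into a single values buffer.
    let offsets = Buffer::<i32>::from_slice([0, 2, 2, 7]);
    let values = Buffer::<u8>::from_slice(b"hiarrow".to_vec());

    // Fallible constructor: O(N) validation of offsets monotonicity and utf8.
    let array = Utf8Array::<i32>::try_new(DataType::Utf8, offsets, values, None)?;
    assert_eq!(array.value(2), "arrow");

    // Invalid offsets now surface as an error instead of a panic.
    let bad = Utf8Array::<i32>::try_new(
        DataType::Utf8,
        Buffer::from_slice([0, 5, 2, 7]), // not monotonically increasing
        Buffer::from_slice(b"hiarrow".to_vec()),
        None,
    );
    assert!(bad.is_err());

    // When the invariants are known to hold, the unsafe variant skips the O(N) checks.
    let fast = unsafe {
        Utf8Array::<i32>::new_unchecked(
            DataType::Utf8,
            Buffer::from_slice([0, 2, 2, 7]),
            Buffer::from_slice(b"hiarrow".to_vec()),
            None,
        )
    };
    assert_eq!(fast.len(), 3);

    Ok(())
}
```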
diff --git a/tests/it/array/list/mod.rs b/tests/it/array/list/mod.rs
index 78bc8dcd719..2a4e1feca4e 100644
--- a/tests/it/array/list/mod.rs
+++ b/tests/it/array/list/mod.rs
@@ -23,7 +23,7 @@ fn debug() {
 }
 
 #[test]
-#[should_panic(expected = "The child's datatype must match the inner type of the \'data_type\'")]
+#[should_panic]
 fn test_nested_panic() {
     let values = Buffer::from_slice([1, 2, 3, 4, 5]);
     let values = PrimitiveArray::<i32>::from_data(DataType::Int32, values, None);
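One consequence of routing construction through `try_new` is that invalid input now panics via `Result::unwrap` rather than via a hand-written `panic!`, which is presumably why the expected message was dropped from `test_nested_panic` above. A minimal illustration of that effect; `try_build` is a hypothetical stand-in for the checked constructors introduced in this patch, not the arrow2 API:

```rust
// Hypothetical stand-in for a checked constructor paired with an unwrapping caller.
fn try_build(valid: bool) -> Result<(), String> {
    if valid {
        Ok(())
    } else {
        Err("The child's datatype must match the inner type of the 'data_type'".to_string())
    }
}

#[test]
#[should_panic] // the panic text is now "called `Result::unwrap()` on an `Err` value: ...",
                // so asserting on the original hand-written message would be brittle
fn build_panics() {
    try_build(false).unwrap();
}
```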