Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Typed Buffers in Arrays (#1811) (#1176) #3743

Merged
merged 3 commits into from
Feb 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 6 additions & 13 deletions arrow-array/src/array/boolean_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
// specific language governing permissions and limitations
// under the License.

use crate::array::print_long_array;
use crate::builder::BooleanBuilder;
use crate::iterator::BooleanIter;
use crate::raw_pointer::RawPtrBox;
use crate::{print_long_array, Array, ArrayAccessor};
use crate::{Array, ArrayAccessor};
use arrow_buffer::{bit_util, Buffer, MutableBuffer};
use arrow_data::bit_mask::combine_option_bitmap;
use arrow_data::ArrayData;
Expand Down Expand Up @@ -67,9 +67,7 @@ use std::any::Any;
#[derive(Clone)]
pub struct BooleanArray {
data: ArrayData,
/// Pointer to the value array. The lifetime of this must be <= to the value buffer
/// stored in `data`, so it's safe to store.
raw_values: RawPtrBox<u8>,
raw_values: Buffer,
}

impl std::fmt::Debug for BooleanArray {
Expand Down Expand Up @@ -102,7 +100,7 @@ impl BooleanArray {
///
/// Note this doesn't take the offset of this array into account.
pub fn values(&self) -> &Buffer {
&self.data.buffers()[0]
&self.raw_values
}

/// Returns the number of non null, true values within this array
Expand Down Expand Up @@ -328,13 +326,8 @@ impl From<ArrayData> for BooleanArray {
1,
"BooleanArray data should contain a single buffer only (values buffer)"
);
let ptr = data.buffers()[0].as_ptr();
Self {
data,
// SAFETY:
// ArrayData must be valid, and validated data type above
raw_values: unsafe { RawPtrBox::new(ptr) },
}
let raw_values = data.buffers()[0].clone();
Self { data, raw_values }
}
}

Expand Down
42 changes: 17 additions & 25 deletions arrow-array/src/array/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@
// specific language governing permissions and limitations
// under the License.

use crate::array::{empty_offsets, print_long_array};
use crate::array::{get_offsets, print_long_array};
use crate::builder::GenericByteBuilder;
use crate::iterator::ArrayIter;
use crate::raw_pointer::RawPtrBox;
use crate::types::bytes::ByteArrayNativeType;
use crate::types::ByteArrayType;
use crate::{Array, ArrayAccessor, OffsetSizeTrait};
use arrow_buffer::ArrowNativeType;
use arrow_buffer::buffer::OffsetBuffer;
use arrow_buffer::{ArrowNativeType, Buffer};
use arrow_data::ArrayData;
use arrow_schema::DataType;
use std::any::Any;
Expand All @@ -39,16 +39,16 @@ use std::any::Any;
/// [`LargeBinaryArray`]: crate::LargeBinaryArray
pub struct GenericByteArray<T: ByteArrayType> {
data: ArrayData,
value_offsets: RawPtrBox<T::Offset>,
value_data: RawPtrBox<u8>,
value_offsets: OffsetBuffer<T::Offset>,
value_data: Buffer,
}

impl<T: ByteArrayType> Clone for GenericByteArray<T> {
fn clone(&self) -> Self {
Self {
data: self.data.clone(),
value_offsets: self.value_offsets,
value_data: self.value_data,
value_offsets: self.value_offsets.clone(),
value_data: self.value_data.clone(),
}
}
}
Expand All @@ -68,7 +68,7 @@ impl<T: ByteArrayType> GenericByteArray<T> {

/// Returns the raw value data
pub fn value_data(&self) -> &[u8] {
self.data.buffers()[1].as_slice()
self.value_data.as_slice()
}

/// Returns true if all data within this array is ASCII
Expand All @@ -82,15 +82,7 @@ impl<T: ByteArrayType> GenericByteArray<T> {
/// Returns the offset values in the offsets buffer
#[inline]
pub fn value_offsets(&self) -> &[T::Offset] {
// Soundness
// pointer alignment & location is ensured by RawPtrBox
// buffer bounds/offset is ensured by the ArrayData instance.
unsafe {
std::slice::from_raw_parts(
self.value_offsets.as_ptr().add(self.data.offset()),
self.len() + 1,
)
}
&self.value_offsets
}

/// Returns the element at index `i`
Expand Down Expand Up @@ -161,6 +153,8 @@ impl<T: ByteArrayType> GenericByteArray<T> {
.slice_with_length(self.data.offset() * element_len, value_len * element_len);

drop(self.data);
drop(self.value_data);
drop(self.value_offsets);

let try_mutable_null_buffer = match null_bit_buffer {
None => Ok(None),
Expand Down Expand Up @@ -280,18 +274,16 @@ impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> {
T::Offset::PREFIX,
T::PREFIX,
);
// Handle case of empty offsets
let offsets = match data.is_empty() && data.buffers()[0].is_empty() {
true => empty_offsets::<T::Offset>().as_ptr() as *const _,
false => data.buffers()[0].as_ptr(),
};
let values = data.buffers()[1].as_ptr();
// SAFETY:
// ArrayData is valid, and verified type above
let value_offsets = unsafe { get_offsets(&data) };
let value_data = data.buffers()[1].clone();
Self {
data,
// SAFETY:
// ArrayData must be valid, and validated data type above
value_offsets: unsafe { RawPtrBox::new(offsets) },
value_data: unsafe { RawPtrBox::new(values) },
value_offsets,
value_data,
}
}
}
Expand Down
10 changes: 5 additions & 5 deletions arrow-array/src/array/fixed_size_binary_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
// specific language governing permissions and limitations
// under the License.

use crate::array::print_long_array;
use crate::iterator::FixedSizeBinaryIter;
use crate::raw_pointer::RawPtrBox;
use crate::{print_long_array, Array, ArrayAccessor, FixedSizeListArray};
use crate::{Array, ArrayAccessor, FixedSizeListArray};
use arrow_buffer::{bit_util, Buffer, MutableBuffer};
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
Expand Down Expand Up @@ -50,7 +50,7 @@ use std::any::Any;
#[derive(Clone)]
pub struct FixedSizeBinaryArray {
data: ArrayData,
value_data: RawPtrBox<u8>,
value_data: Buffer,
length: i32,
}

Expand Down Expand Up @@ -357,14 +357,14 @@ impl From<ArrayData> for FixedSizeBinaryArray {
1,
"FixedSizeBinaryArray data should contain 1 buffer only (values)"
);
let value_data = data.buffers()[0].as_ptr();
let value_data = data.buffers()[0].clone();
let length = match data.data_type() {
DataType::FixedSizeBinary(len) => *len,
_ => panic!("Expected data type to be FixedSizeBinary"),
};
Self {
data,
value_data: unsafe { RawPtrBox::new(value_data) },
value_data,
length,
}
}
Expand Down
5 changes: 2 additions & 3 deletions arrow-array/src/array/fixed_size_list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,9 @@
// specific language governing permissions and limitations
// under the License.

use crate::array::print_long_array;
use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder};
use crate::{
make_array, print_long_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType,
};
use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType};
use arrow_data::ArrayData;
use arrow_schema::DataType;
use std::any::Any;
Expand Down
46 changes: 11 additions & 35 deletions arrow-array/src/array/list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
// specific language governing permissions and limitations
// under the License.

use crate::array::make_array;
use crate::array::{get_offsets, make_array, print_long_array};
use crate::builder::{GenericListBuilder, PrimitiveBuilder};
use crate::{
iterator::GenericListArrayIter, print_long_array, raw_pointer::RawPtrBox, Array,
ArrayAccessor, ArrayRef, ArrowPrimitiveType,
iterator::GenericListArrayIter, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType,
};
use arrow_buffer::buffer::OffsetBuffer;
use arrow_buffer::ArrowNativeType;
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType, Field};
Expand All @@ -45,35 +45,24 @@ impl OffsetSizeTrait for i64 {
const PREFIX: &'static str = "Large";
}

/// Returns a slice of `OffsetSize` consisting of a single zero value
#[inline]
pub(crate) fn empty_offsets<OffsetSize: OffsetSizeTrait>() -> &'static [OffsetSize] {
static OFFSET: &[i64] = &[0];
// SAFETY:
// OffsetSize is ArrowNativeType and is therefore trivially transmutable
let (prefix, val, suffix) = unsafe { OFFSET.align_to::<OffsetSize>() };
assert!(prefix.is_empty() && suffix.is_empty());
val
}

/// Generic struct for a variable-size list array.
///
/// Columnar format in Apache Arrow:
/// <https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout>
///
/// For non generic lists, you may wish to consider using [`ListArray`] or [`LargeListArray`]`
pub struct GenericListArray<OffsetSize> {
pub struct GenericListArray<OffsetSize: OffsetSizeTrait> {
data: ArrayData,
values: ArrayRef,
value_offsets: RawPtrBox<OffsetSize>,
value_offsets: OffsetBuffer<OffsetSize>,
}

impl<OffsetSize> Clone for GenericListArray<OffsetSize> {
impl<OffsetSize: OffsetSizeTrait> Clone for GenericListArray<OffsetSize> {
fn clone(&self) -> Self {
Self {
data: self.data.clone(),
values: self.values.clone(),
value_offsets: self.value_offsets,
value_offsets: self.value_offsets.clone(),
}
}
}
Expand Down Expand Up @@ -118,15 +107,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
/// Returns the offset values in the offsets buffer
#[inline]
pub fn value_offsets(&self) -> &[OffsetSize] {
// Soundness
// pointer alignment & location is ensured by RawPtrBox
// buffer bounds/offset is ensured by the ArrayData instance.
unsafe {
std::slice::from_raw_parts(
self.value_offsets.as_ptr().add(self.data.offset()),
self.len() + 1,
)
}
Comment on lines -121 to -129
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh that's good. 👍

&self.value_offsets
}

/// Returns the length for value at index `i`.
Expand Down Expand Up @@ -242,15 +223,10 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
}

let values = make_array(values);
// Handle case of empty offsets
let offsets = match data.is_empty() && data.buffers()[0].is_empty() {
true => empty_offsets::<OffsetSize>().as_ptr() as *const _,
false => data.buffers()[0].as_ptr(),
};

// SAFETY:
// Verified list type in call to `Self::get_type`
let value_offsets = unsafe { RawPtrBox::new(offsets) };
// ArrayData is valid, and verified type above
let value_offsets = unsafe { get_offsets(&data) };

Ok(Self {
data,
values,
Expand Down
27 changes: 6 additions & 21 deletions arrow-array/src/array/map_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
// specific language governing permissions and limitations
// under the License.

use crate::raw_pointer::RawPtrBox;
use crate::{make_array, print_long_array, Array, ArrayRef, StringArray, StructArray};
use crate::array::{get_offsets, print_long_array};
use crate::{make_array, Array, ArrayRef, StringArray, StructArray};
use arrow_buffer::buffer::OffsetBuffer;
use arrow_buffer::{ArrowNativeType, Buffer, ToByteSlice};
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType, Field};
Expand All @@ -38,7 +39,7 @@ pub struct MapArray {
/// The second child of `entries`, the "values" of this MapArray
values: ArrayRef,
/// The start and end offsets of each entry
value_offsets: RawPtrBox<i32>,
value_offsets: OffsetBuffer<i32>,
}

impl MapArray {
Expand Down Expand Up @@ -86,15 +87,7 @@ impl MapArray {
/// Returns the offset values in the offsets buffer
#[inline]
pub fn value_offsets(&self) -> &[i32] {
// Soundness
// pointer alignment & location is ensured by RawPtrBox
// buffer bounds/offset is ensured by the ArrayData instance.
unsafe {
std::slice::from_raw_parts(
self.value_offsets.as_ptr().add(self.data.offset()),
self.len() + 1,
)
}
&self.value_offsets
}

/// Returns the length for value at index `i`.
Expand Down Expand Up @@ -159,18 +152,10 @@ impl MapArray {
let keys = make_array(entries.child_data()[0].clone());
let values = make_array(entries.child_data()[1].clone());
let entries = make_array(entries);
let value_offsets = data.buffers()[0].as_ptr();

// SAFETY:
// ArrayData is valid, and verified type above
let value_offsets = unsafe { RawPtrBox::<i32>::new(value_offsets) };
unsafe {
if (*value_offsets.as_ptr().offset(0)) != 0 {
return Err(ArrowError::InvalidArgumentError(String::from(
"offsets do not start at zero",
)));
}
}
let value_offsets = unsafe { get_offsets(&data) };

Ok(Self {
data,
Expand Down
27 changes: 25 additions & 2 deletions arrow-array/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
mod binary_array;

use crate::types::*;
use arrow_buffer::buffer::{OffsetBuffer, ScalarBuffer};
use arrow_buffer::ArrowNativeType;
use arrow_data::ArrayData;
use arrow_schema::{DataType, IntervalUnit, TimeUnit};
use std::any::Any;
Expand Down Expand Up @@ -636,8 +638,29 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef {
make_array(ArrayData::new_null(data_type, length))
}

// Helper function for printing potentially long arrays.
pub(crate) fn print_long_array<A, F>(
/// Helper function that gets offset from an [`ArrayData`]
///
/// # Safety
///
/// - ArrayData must contain a valid [`OffsetBuffer`] as its first buffer
unsafe fn get_offsets<O: ArrowNativeType>(data: &ArrayData) -> OffsetBuffer<O> {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will eventually get removed as we make ArrayData an enumeration of typed buffers

match data.is_empty() && data.buffers()[0].is_empty() {
true => OffsetBuffer::new_empty(),
false => {
let buffer = ScalarBuffer::new(
data.buffers()[0].clone(),
data.offset(),
data.len() + 1,
);
// Safety:
// ArrayData is valid
unsafe { OffsetBuffer::new_unchecked(buffer) }
}
}
}

/// Helper function for printing potentially long arrays.
fn print_long_array<A, F>(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just a drive-by cleanup

array: &A,
f: &mut std::fmt::Formatter,
print_item: F,
Expand Down
Loading