From 9ea25f0408a9684e15018f2e1293df766172e978 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Fri, 2 Dec 2022 05:06:36 +0000 Subject: [PATCH 1/5] Added offset module --- src/array/binary/ffi.rs | 3 ++- src/array/binary/fmt.rs | 3 ++- src/array/binary/from.rs | 2 +- src/array/binary/iterator.rs | 3 ++- src/array/binary/mod.rs | 3 ++- src/array/binary/mutable.rs | 3 ++- src/array/binary/mutable_values.rs | 4 ++-- src/array/equal/binary.rs | 3 ++- src/array/equal/list.rs | 3 ++- src/array/equal/mod.rs | 1 + src/array/equal/utf8.rs | 3 ++- src/array/growable/binary.rs | 3 ++- src/array/growable/list.rs | 3 ++- src/array/growable/utf8.rs | 3 ++- src/array/growable/utils.rs | 5 +---- src/array/list/ffi.rs | 4 +++- src/array/list/fmt.rs | 2 +- src/array/list/iterator.rs | 2 +- src/array/list/mod.rs | 3 ++- src/array/list/mutable.rs | 3 ++- src/array/mod.rs | 3 +-- src/array/ord.rs | 1 + src/array/physical_binary.rs | 2 +- src/array/utf8/ffi.rs | 3 ++- src/array/utf8/fmt.rs | 3 ++- src/array/utf8/from.rs | 2 +- src/array/utf8/iterator.rs | 3 ++- src/array/utf8/mod.rs | 3 ++- src/array/utf8/mutable.rs | 3 ++- src/array/utf8/mutable_values.rs | 3 ++- src/compute/aggregate/min_max.rs | 3 ++- src/compute/cast/binary_to.rs | 1 + src/compute/cast/boolean_to.rs | 3 ++- src/compute/cast/mod.rs | 1 + src/compute/cast/primitive_to.rs | 1 + src/compute/cast/utf8_to.rs | 1 + src/compute/comparison/binary.rs | 3 ++- src/compute/comparison/utf8.rs | 3 ++- src/compute/contains.rs | 3 ++- src/compute/hash.rs | 3 ++- src/compute/length.rs | 1 + src/compute/like.rs | 3 ++- src/compute/regex_match.rs | 6 ++++-- src/compute/sort/binary.rs | 3 ++- src/compute/sort/mod.rs | 1 + src/compute/sort/row/mod.rs | 3 ++- src/compute/sort/utf8.rs | 4 ++-- src/compute/substring.rs | 1 + src/compute/take/binary.rs | 3 ++- src/compute/take/generic_binary.rs | 3 ++- src/compute/take/list.rs | 3 ++- src/compute/take/utf8.rs | 3 ++- src/compute/utf8.rs | 3 ++- src/ffi/mmap.rs | 3 ++- 
src/io/avro/read/nested.rs | 1 + src/io/avro/write/serialize.rs | 1 + src/io/csv/read_utils.rs | 15 ++++++++------- src/io/csv/write/serialize.rs | 6 ++++-- src/io/ipc/read/array/binary.rs | 3 ++- src/io/ipc/read/array/list.rs | 3 ++- src/io/ipc/read/array/utf8.rs | 3 ++- src/io/ipc/write/serialize.rs | 3 ++- src/io/json/read/deserialize.rs | 1 + src/io/json/write/serialize.rs | 1 + src/io/json_integration/read/array.rs | 1 + src/io/odbc/write/serialize.rs | 1 + src/io/orc/read/mod.rs | 5 ++--- src/io/parquet/read/deserialize/binary/basic.rs | 3 ++- .../parquet/read/deserialize/binary/dictionary.rs | 3 ++- src/io/parquet/read/deserialize/binary/nested.rs | 4 ++-- src/io/parquet/read/deserialize/binary/utils.rs | 2 +- src/io/parquet/read/statistics/binary.rs | 3 ++- src/io/parquet/read/statistics/utf8.rs | 3 ++- src/io/parquet/write/binary/basic.rs | 3 ++- src/io/parquet/write/binary/nested.rs | 3 ++- src/io/parquet/write/nested/def.rs | 2 +- src/io/parquet/write/nested/mod.rs | 2 +- src/io/parquet/write/pages.rs | 3 ++- src/io/parquet/write/utf8/basic.rs | 3 ++- src/io/parquet/write/utf8/nested.rs | 3 ++- src/lib.rs | 1 + src/offset.rs | 2 ++ src/scalar/binary.rs | 2 +- src/scalar/list.rs | 2 +- src/scalar/utf8.rs | 2 +- src/temporal_conversions.rs | 3 ++- src/util/bench_util.rs | 2 +- tests/it/array/equal/utf8.rs | 1 + tests/it/compute/length.rs | 1 + tests/it/compute/regex_match.rs | 3 ++- tests/it/compute/substring.rs | 2 +- tests/it/compute/utf8.rs | 2 +- tests/it/io/parquet/mod.rs | 1 + 93 files changed, 162 insertions(+), 90 deletions(-) create mode 100644 src/offset.rs diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs index 2c6792237ad..f592773f56e 100644 --- a/src/array/binary/ffi.rs +++ b/src/array/binary/ffi.rs @@ -1,7 +1,8 @@ use crate::{ - array::{FromFfi, Offset, ToFfi}, + array::{FromFfi, ToFfi}, bitmap::align, ffi, + offset::Offset, }; use crate::error::Result; diff --git a/src/array/binary/fmt.rs b/src/array/binary/fmt.rs index 
c068d0d3656..c2ec8737fb2 100644 --- a/src/array/binary/fmt.rs +++ b/src/array/binary/fmt.rs @@ -1,7 +1,8 @@ use std::fmt::{Debug, Formatter, Result, Write}; +use crate::offset::Offset; + use super::super::fmt::write_vec; -use super::super::Offset; use super::BinaryArray; pub fn write_value(array: &BinaryArray, index: usize, f: &mut W) -> Result { diff --git a/src/array/binary/from.rs b/src/array/binary/from.rs index aa575ccd9cc..8556da6906f 100644 --- a/src/array/binary/from.rs +++ b/src/array/binary/from.rs @@ -1,6 +1,6 @@ use std::iter::FromIterator; -use crate::array::Offset; +use crate::offset::Offset; use super::{BinaryArray, MutableBinaryArray}; diff --git a/src/array/binary/iterator.rs b/src/array/binary/iterator.rs index 2af79e6a296..042913a71d9 100644 --- a/src/array/binary/iterator.rs +++ b/src/array/binary/iterator.rs @@ -1,6 +1,7 @@ use crate::{ - array::{ArrayAccessor, ArrayValuesIter, Offset}, + array::{ArrayAccessor, ArrayValuesIter}, bitmap::utils::{BitmapIter, ZipValidity}, + offset::Offset, }; use super::{BinaryArray, MutableBinaryValuesArray}; diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index e7a4e9fe8c0..612d988215e 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -6,6 +6,7 @@ use crate::{ buffer::Buffer, datatypes::DataType, error::Error, + offset::Offset, trusted_len::TrustedLen, }; @@ -13,7 +14,7 @@ use either::Either; use super::{ specification::{try_check_offsets, try_check_offsets_bounds}, - Array, GenericBinaryArray, Offset, + Array, GenericBinaryArray, }; mod ffi; diff --git a/src/array/binary/mutable.rs b/src/array/binary/mutable.rs index 15035aed0ed..36f31eee5a5 100644 --- a/src/array/binary/mutable.rs +++ b/src/array/binary/mutable.rs @@ -1,13 +1,14 @@ use std::{iter::FromIterator, sync::Arc}; use crate::{ - array::{Array, MutableArray, Offset, TryExtend, TryExtendFromSelf, TryPush}, + array::{Array, MutableArray, TryExtend, TryExtendFromSelf, TryPush}, bitmap::{ utils::{BitmapIter, 
ZipValidity}, Bitmap, MutableBitmap, }, datatypes::DataType, error::{Error, Result}, + offset::Offset, trusted_len::TrustedLen, }; diff --git a/src/array/binary/mutable_values.rs b/src/array/binary/mutable_values.rs index 53a43d69c7a..1d608b7403f 100644 --- a/src/array/binary/mutable_values.rs +++ b/src/array/binary/mutable_values.rs @@ -3,12 +3,12 @@ use std::{iter::FromIterator, sync::Arc}; use crate::{ array::{ specification::{check_offsets_minimal, try_check_offsets}, - Array, ArrayAccessor, ArrayValuesIter, MutableArray, Offset, TryExtend, TryExtendFromSelf, - TryPush, + Array, ArrayAccessor, ArrayValuesIter, MutableArray, TryExtend, TryExtendFromSelf, TryPush, }, bitmap::MutableBitmap, datatypes::DataType, error::{Error, Result}, + offset::Offset, trusted_len::TrustedLen, }; diff --git a/src/array/equal/binary.rs b/src/array/equal/binary.rs index 1c86fab6dce..bed8588efb5 100644 --- a/src/array/equal/binary.rs +++ b/src/array/equal/binary.rs @@ -1,4 +1,5 @@ -use crate::array::{BinaryArray, Offset}; +use crate::array::BinaryArray; +use crate::offset::Offset; pub(super) fn equal(lhs: &BinaryArray, rhs: &BinaryArray) -> bool { lhs.data_type() == rhs.data_type() && lhs.len() == rhs.len() && lhs.iter().eq(rhs.iter()) diff --git a/src/array/equal/list.rs b/src/array/equal/list.rs index 4eee0b821e2..26faa1598fa 100644 --- a/src/array/equal/list.rs +++ b/src/array/equal/list.rs @@ -1,4 +1,5 @@ -use crate::array::{Array, ListArray, Offset}; +use crate::array::{Array, ListArray}; +use crate::offset::Offset; pub(super) fn equal(lhs: &ListArray, rhs: &ListArray) -> bool { lhs.data_type() == rhs.data_type() && lhs.len() == rhs.len() && lhs.iter().eq(rhs.iter()) diff --git a/src/array/equal/mod.rs b/src/array/equal/mod.rs index aa2ea602882..2bb3ba77f1f 100644 --- a/src/array/equal/mod.rs +++ b/src/array/equal/mod.rs @@ -1,3 +1,4 @@ +use crate::offset::Offset; use crate::types::NativeType; use super::*; diff --git a/src/array/equal/utf8.rs b/src/array/equal/utf8.rs index 
3a8f0e5f012..1327221ca33 100644 --- a/src/array/equal/utf8.rs +++ b/src/array/equal/utf8.rs @@ -1,4 +1,5 @@ -use crate::array::{Offset, Utf8Array}; +use crate::array::Utf8Array; +use crate::offset::Offset; pub(super) fn equal(lhs: &Utf8Array, rhs: &Utf8Array) -> bool { lhs.data_type() == rhs.data_type() && lhs.len() == rhs.len() && lhs.iter().eq(rhs.iter()) diff --git a/src/array/growable/binary.rs b/src/array/growable/binary.rs index c3403877e9e..fd91590a25d 100644 --- a/src/array/growable/binary.rs +++ b/src/array/growable/binary.rs @@ -1,9 +1,10 @@ use std::sync::Arc; use crate::{ - array::{Array, BinaryArray, Offset}, + array::{Array, BinaryArray}, bitmap::MutableBitmap, datatypes::DataType, + offset::Offset, }; use super::{ diff --git a/src/array/growable/list.rs b/src/array/growable/list.rs index 0e1e7ceb5f8..3fcbe3c4539 100644 --- a/src/array/growable/list.rs +++ b/src/array/growable/list.rs @@ -1,8 +1,9 @@ use std::sync::Arc; use crate::{ - array::{Array, ListArray, Offset}, + array::{Array, ListArray}, bitmap::MutableBitmap, + offset::Offset, }; use super::{ diff --git a/src/array/growable/utf8.rs b/src/array/growable/utf8.rs index adfdba2cd53..eed8ba30159 100644 --- a/src/array/growable/utf8.rs +++ b/src/array/growable/utf8.rs @@ -1,7 +1,8 @@ use std::sync::Arc; use crate::{ - array::{Array, Offset, Utf8Array}, + array::{Array, Utf8Array}, + offset::Offset, bitmap::MutableBitmap, }; diff --git a/src/array/growable/utils.rs b/src/array/growable/utils.rs index 7e39df295de..d06c1116d48 100644 --- a/src/array/growable/utils.rs +++ b/src/array/growable/utils.rs @@ -1,7 +1,4 @@ -use crate::{ - array::{Array, Offset}, - bitmap::MutableBitmap, -}; +use crate::{array::Array, bitmap::MutableBitmap, offset::Offset}; pub(super) fn extend_offsets(buffer: &mut Vec, last_offset: &mut T, offsets: &[T]) { buffer.reserve(offsets.len() - 1); diff --git a/src/array/list/ffi.rs b/src/array/list/ffi.rs index c6f1b2985dd..9d0b19a85e0 100644 --- a/src/array/list/ffi.rs +++ 
b/src/array/list/ffi.rs @@ -1,6 +1,8 @@ use crate::{array::FromFfi, bitmap::align, error::Result, ffi}; -use super::super::{ffi::ToFfi, Array, Offset}; +use crate::offset::Offset; + +use super::super::{ffi::ToFfi, Array}; use super::ListArray; unsafe impl ToFfi for ListArray { diff --git a/src/array/list/fmt.rs b/src/array/list/fmt.rs index e6103ded6cb..4b10fc8f936 100644 --- a/src/array/list/fmt.rs +++ b/src/array/list/fmt.rs @@ -1,6 +1,6 @@ use std::fmt::{Debug, Formatter, Result, Write}; -use crate::array::Offset; +use crate::offset::Offset; use super::super::fmt::{get_display, write_vec}; use super::ListArray; diff --git a/src/array/list/iterator.rs b/src/array/list/iterator.rs index 82b5c7dca5f..86a12dfe769 100644 --- a/src/array/list/iterator.rs +++ b/src/array/list/iterator.rs @@ -1,6 +1,6 @@ -use crate::array::Offset; use crate::array::{Array, ArrayAccessor, ArrayValuesIter}; use crate::bitmap::utils::{BitmapIter, ZipValidity}; +use crate::offset::Offset; use super::ListArray; diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index feb5a5df93b..7740307799d 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -3,13 +3,14 @@ use crate::{ buffer::Buffer, datatypes::{DataType, Field}, error::Error, + offset::Offset, }; use std::sync::Arc; use super::{ new_empty_array, specification::{try_check_offsets, try_check_offsets_bounds}, - Array, Offset, + Array, }; mod ffi; diff --git a/src/array/list/mutable.rs b/src/array/list/mutable.rs index 57017998d0b..97785a01740 100644 --- a/src/array/list/mutable.rs +++ b/src/array/list/mutable.rs @@ -4,11 +4,12 @@ use crate::{ array::{ physical_binary::{extend_validity, try_extend_offsets}, specification::try_check_offsets, - Array, MutableArray, Offset, TryExtend, TryExtendFromSelf, TryPush, + Array, MutableArray, TryExtend, TryExtendFromSelf, TryPush, }, bitmap::MutableBitmap, datatypes::{DataType, Field}, error::{Error, Result}, + offset::Offset, trusted_len::TrustedLen, }; diff --git 
a/src/array/mod.rs b/src/array/mod.rs index 45e83a80803..5aa1dd1eb64 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -439,7 +439,6 @@ pub use iterator::ArrayValuesIter; pub use equal::equal; pub use fmt::{get_display, get_value_display}; -pub use crate::types::Offset; pub use binary::{BinaryArray, BinaryValueIter, MutableBinaryArray, MutableBinaryValuesArray}; pub use boolean::{BooleanArray, MutableBooleanArray}; pub use dictionary::{DictionaryArray, DictionaryKey, MutableDictionaryArray}; @@ -483,7 +482,7 @@ pub trait TryExtendFromSelf { /// 1. `offsets.len() > 0` /// 2. `offsets[i] >= offsets[i-1] for all i` /// 3. `offsets[i] < values.len() for all i` -pub unsafe trait GenericBinaryArray: Array { +pub unsafe trait GenericBinaryArray: Array { /// The values of the array fn values(&self) -> &[u8]; /// The offsets of the array diff --git a/src/array/ord.rs b/src/array/ord.rs index 639317165ab..e42c7fa0fc7 100644 --- a/src/array/ord.rs +++ b/src/array/ord.rs @@ -4,6 +4,7 @@ use std::cmp::Ordering; use crate::datatypes::*; use crate::error::{Error, Result}; +use crate::offset::Offset; use crate::{array::*, types::NativeType}; /// Compare the values at two arbitrary indices in two arrays. diff --git a/src/array/physical_binary.rs b/src/array/physical_binary.rs index bf883f4e6e8..825ba01e5d0 100644 --- a/src/array/physical_binary.rs +++ b/src/array/physical_binary.rs @@ -1,6 +1,6 @@ -use crate::array::Offset; use crate::bitmap::MutableBitmap; use crate::error::Error; +use crate::offset::Offset; /// # Safety /// The caller must ensure that `iterator` is `TrustedLen`. 
diff --git a/src/array/utf8/ffi.rs b/src/array/utf8/ffi.rs index abaed6b28e7..2152dfade45 100644 --- a/src/array/utf8/ffi.rs +++ b/src/array/utf8/ffi.rs @@ -1,8 +1,9 @@ use crate::{ - array::{FromFfi, Offset, ToFfi}, + array::{FromFfi, ToFfi}, bitmap::align, error::Result, ffi, + offset::Offset, }; use super::Utf8Array; diff --git a/src/array/utf8/fmt.rs b/src/array/utf8/fmt.rs index 6ea28feae12..1b6868c4a0e 100644 --- a/src/array/utf8/fmt.rs +++ b/src/array/utf8/fmt.rs @@ -1,7 +1,8 @@ use std::fmt::{Debug, Formatter, Result, Write}; +use crate::offset::Offset; + use super::super::fmt::write_vec; -use super::super::Offset; use super::Utf8Array; pub fn write_value(array: &Utf8Array, index: usize, f: &mut W) -> Result { diff --git a/src/array/utf8/from.rs b/src/array/utf8/from.rs index 1a0a0a1f7e2..f6866998312 100644 --- a/src/array/utf8/from.rs +++ b/src/array/utf8/from.rs @@ -1,6 +1,6 @@ use std::iter::FromIterator; -use crate::array::Offset; +use crate::offset::Offset; use super::{MutableUtf8Array, Utf8Array}; diff --git a/src/array/utf8/iterator.rs b/src/array/utf8/iterator.rs index 071fc54d8ae..2a5ba87c3e8 100644 --- a/src/array/utf8/iterator.rs +++ b/src/array/utf8/iterator.rs @@ -1,5 +1,6 @@ -use crate::array::{ArrayAccessor, ArrayValuesIter, Offset}; +use crate::array::{ArrayAccessor, ArrayValuesIter}; use crate::bitmap::utils::{BitmapIter, ZipValidity}; +use crate::offset::Offset; use super::{MutableUtf8Array, MutableUtf8ValuesArray, Utf8Array}; diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index f91e1466451..f011183ce6e 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -6,6 +6,7 @@ use crate::{ buffer::Buffer, datatypes::DataType, error::{Error, Result}, + offset::Offset, trusted_len::TrustedLen, }; @@ -13,7 +14,7 @@ use either::Either; use super::{ specification::{try_check_offsets_and_utf8, try_check_offsets_bounds}, - Array, GenericBinaryArray, Offset, + Array, GenericBinaryArray, }; mod ffi; diff --git 
a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index 27d28dc64b2..4dc9b1304b8 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -2,13 +2,14 @@ use std::{iter::FromIterator, sync::Arc}; use crate::array::{physical_binary::*, TryExtendFromSelf}; use crate::{ - array::{Array, MutableArray, Offset, TryExtend, TryPush}, + array::{Array, MutableArray, TryExtend, TryPush}, bitmap::{ utils::{BitmapIter, ZipValidity}, Bitmap, MutableBitmap, }, datatypes::DataType, error::{Error, Result}, + offset::Offset, trusted_len::TrustedLen, }; diff --git a/src/array/utf8/mutable_values.rs b/src/array/utf8/mutable_values.rs index 5e33144452f..0da7ff8ff7c 100644 --- a/src/array/utf8/mutable_values.rs +++ b/src/array/utf8/mutable_values.rs @@ -3,11 +3,12 @@ use std::{iter::FromIterator, sync::Arc}; use crate::{ array::{ specification::{check_offsets_minimal, try_check_offsets_and_utf8}, - Array, ArrayValuesIter, MutableArray, Offset, TryExtend, TryExtendFromSelf, TryPush, + Array, ArrayValuesIter, MutableArray, TryExtend, TryExtendFromSelf, TryPush, }, bitmap::MutableBitmap, datatypes::DataType, error::{Error, Result}, + offset::Offset, trusted_len::TrustedLen, }; diff --git a/src/compute/aggregate/min_max.rs b/src/compute/aggregate/min_max.rs index 00f76b052bc..886cd509dd6 100644 --- a/src/compute/aggregate/min_max.rs +++ b/src/compute/aggregate/min_max.rs @@ -1,11 +1,12 @@ use crate::bitmap::utils::{BitChunkIterExact, BitChunksExact}; use crate::datatypes::{DataType, PhysicalType, PrimitiveType}; use crate::error::{Error, Result}; +use crate::offset::Offset; use crate::scalar::*; use crate::types::simd::*; use crate::types::NativeType; use crate::{ - array::{Array, BinaryArray, BooleanArray, Offset, PrimitiveArray, Utf8Array}, + array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}, bitmap::Bitmap, }; diff --git a/src/compute/cast/binary_to.rs b/src/compute/cast/binary_to.rs index 44224321b55..d4b63c1e73d 100644 --- 
a/src/compute/cast/binary_to.rs +++ b/src/compute/cast/binary_to.rs @@ -1,6 +1,7 @@ use std::convert::TryFrom; use crate::error::{Error, Result}; +use crate::offset::Offset; use crate::{array::*, datatypes::DataType, types::NativeType}; use super::CastOptions; diff --git a/src/compute/cast/boolean_to.rs b/src/compute/cast/boolean_to.rs index ef24e4b4dff..1ce45c87118 100644 --- a/src/compute/cast/boolean_to.rs +++ b/src/compute/cast/boolean_to.rs @@ -1,6 +1,7 @@ use crate::{ - array::{Array, BinaryArray, BooleanArray, Offset, PrimitiveArray, Utf8Array}, + array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}, error::Result, + offset::Offset, types::NativeType, }; diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index edb6c223568..eda7bda78a0 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -18,6 +18,7 @@ use crate::{ array::*, datatypes::*, error::{Error, Result}, + offset::Offset, }; /// options defining how Cast kernels behave diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index a36f16e3deb..6c5c7bb8753 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -4,6 +4,7 @@ use num_traits::{AsPrimitive, Float, ToPrimitive}; use crate::datatypes::IntervalUnit; use crate::error::Result; +use crate::offset::Offset; use crate::types::{days_ms, f16, months_days_ns}; use crate::{ array::*, diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index 996889174a2..2625497ed67 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -6,6 +6,7 @@ use crate::{ array::*, datatypes::DataType, error::{Error, Result}, + offset::Offset, temporal_conversions::{ utf8_to_naive_timestamp_ns as utf8_to_naive_timestamp_ns_, utf8_to_timestamp_ns as utf8_to_timestamp_ns_, EPOCH_DAYS_FROM_CE, diff --git a/src/compute/comparison/binary.rs b/src/compute/comparison/binary.rs index 0317a9b7783..6787e184618 100644 --- 
a/src/compute/comparison/binary.rs +++ b/src/compute/comparison/binary.rs @@ -1,9 +1,10 @@ //! Comparison functions for [`BinaryArray`] use crate::compute::comparison::{finish_eq_validities, finish_neq_validities}; use crate::{ - array::{BinaryArray, BooleanArray, Offset}, + array::{BinaryArray, BooleanArray}, bitmap::Bitmap, datatypes::DataType, + offset::Offset, }; use super::super::utils::combine_validities; diff --git a/src/compute/comparison/utf8.rs b/src/compute/comparison/utf8.rs index 31983fa816c..05d84d803ff 100644 --- a/src/compute/comparison/utf8.rs +++ b/src/compute/comparison/utf8.rs @@ -1,9 +1,10 @@ //! Comparison functions for [`Utf8Array`] use crate::compute::comparison::{finish_eq_validities, finish_neq_validities}; use crate::{ - array::{BooleanArray, Offset, Utf8Array}, + array::{BooleanArray, Utf8Array}, bitmap::Bitmap, datatypes::DataType, + offset::Offset, }; use super::super::utils::combine_validities; diff --git a/src/compute/contains.rs b/src/compute/contains.rs index a1ede1dd29d..738a8e11e57 100644 --- a/src/compute/contains.rs +++ b/src/compute/contains.rs @@ -1,10 +1,11 @@ //! Declares the [`contains`] operator use crate::{ - array::{Array, BinaryArray, BooleanArray, ListArray, Offset, PrimitiveArray, Utf8Array}, + array::{Array, BinaryArray, BooleanArray, ListArray, PrimitiveArray, Utf8Array}, bitmap::Bitmap, datatypes::DataType, error::{Error, Result}, + offset::Offset, types::NativeType, }; diff --git a/src/compute/hash.rs b/src/compute/hash.rs index 68232b4d015..d5f8370d8b4 100644 --- a/src/compute/hash.rs +++ b/src/compute/hash.rs @@ -12,9 +12,10 @@ macro_rules! 
new_state { } use crate::{ - array::{Array, BinaryArray, BooleanArray, Offset, PrimitiveArray, Utf8Array}, + array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}, datatypes::{DataType, PhysicalType, PrimitiveType}, error::{Error, Result}, + offset::Offset, types::NativeType, }; diff --git a/src/compute/length.rs b/src/compute/length.rs index c52541a8917..8cea7232894 100644 --- a/src/compute/length.rs +++ b/src/compute/length.rs @@ -21,6 +21,7 @@ use crate::{ array::*, datatypes::DataType, error::{Error, Result}, + offset::Offset, types::NativeType, }; diff --git a/src/compute/like.rs b/src/compute/like.rs index bf363972a1a..5c736cd8b5f 100644 --- a/src/compute/like.rs +++ b/src/compute/like.rs @@ -5,11 +5,12 @@ use regex::bytes::Regex as BytesRegex; use regex::Regex; use crate::{ - array::{BinaryArray, BooleanArray, Offset, Utf8Array}, + array::{BinaryArray, BooleanArray, Utf8Array}, bitmap::Bitmap, compute::utils::combine_validities, datatypes::DataType, error::{Error, Result}, + offset::Offset, }; #[inline] diff --git a/src/compute/regex_match.rs b/src/compute/regex_match.rs index 41cacb293bf..371eb073e46 100644 --- a/src/compute/regex_match.rs +++ b/src/compute/regex_match.rs @@ -3,11 +3,13 @@ use ahash::AHashMap; use regex::Regex; -use super::utils::combine_validities; -use crate::array::{BooleanArray, Offset, Utf8Array}; +use crate::array::{BooleanArray, Utf8Array}; use crate::bitmap::Bitmap; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::offset::Offset; + +use super::utils::combine_validities; /// Regex matches pub fn regex_match(values: &Utf8Array, regex: &Utf8Array) -> Result { diff --git a/src/compute/sort/binary.rs b/src/compute/sort/binary.rs index 766efc678b2..cf0992b4b99 100644 --- a/src/compute/sort/binary.rs +++ b/src/compute/sort/binary.rs @@ -1,4 +1,5 @@ -use crate::array::{BinaryArray, Offset, PrimitiveArray}; +use crate::array::{BinaryArray, PrimitiveArray}; +use crate::offset::Offset; use 
crate::types::Index; use super::common; diff --git a/src/compute/sort/mod.rs b/src/compute/sort/mod.rs index be0f15a1ae2..5394e370280 100644 --- a/src/compute/sort/mod.rs +++ b/src/compute/sort/mod.rs @@ -5,6 +5,7 @@ use crate::array::ord; use crate::compute::take; use crate::datatypes::*; use crate::error::{Error, Result}; +use crate::offset::Offset; use crate::{ array::*, types::{Index, NativeType}, diff --git a/src/compute/sort/row/mod.rs b/src/compute/sort/row/mod.rs index 8d4833af089..005e046fc92 100644 --- a/src/compute/sort/row/mod.rs +++ b/src/compute/sort/row/mod.rs @@ -637,9 +637,10 @@ mod tests { use super::*; use crate::{ - array::{Array, DictionaryKey, Float32Array, Int16Array, NullArray, Offset}, + array::{Array, DictionaryKey, Float32Array, Int16Array, NullArray}, compute::sort::build_compare, datatypes::DataType, + offset::Offset, types::NativeType, }; diff --git a/src/compute/sort/utf8.rs b/src/compute/sort/utf8.rs index e2e2da1bc56..0d7190eb23f 100644 --- a/src/compute/sort/utf8.rs +++ b/src/compute/sort/utf8.rs @@ -1,5 +1,5 @@ -use crate::array::{DictionaryArray, DictionaryKey}; -use crate::array::{Offset, PrimitiveArray, Utf8Array}; +use crate::array::{DictionaryArray, DictionaryKey, PrimitiveArray, Utf8Array}; +use crate::offset::Offset; use crate::types::Index; use super::common; diff --git a/src/compute/substring.rs b/src/compute/substring.rs index d879a971707..1edab3a0cb1 100644 --- a/src/compute/substring.rs +++ b/src/compute/substring.rs @@ -21,6 +21,7 @@ use crate::{ array::*, datatypes::DataType, error::{Error, Result}, + offset::Offset, }; fn utf8_substring(array: &Utf8Array, start: O, length: &Option) -> Utf8Array { diff --git a/src/compute/take/binary.rs b/src/compute/take/binary.rs index 0651b6bba13..4d116d650df 100644 --- a/src/compute/take/binary.rs +++ b/src/compute/take/binary.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-use crate::array::{Array, BinaryArray, Offset, PrimitiveArray}; +use crate::array::{Array, BinaryArray, PrimitiveArray}; +use crate::offset::Offset; use super::generic_binary::*; use super::Index; diff --git a/src/compute/take/generic_binary.rs b/src/compute/take/generic_binary.rs index ef78184f6b8..3656aebc771 100644 --- a/src/compute/take/generic_binary.rs +++ b/src/compute/take/generic_binary.rs @@ -1,7 +1,8 @@ use crate::{ - array::{GenericBinaryArray, Offset, PrimitiveArray}, + array::{GenericBinaryArray, PrimitiveArray}, bitmap::{Bitmap, MutableBitmap}, buffer::Buffer, + offset::Offset, }; use super::Index; diff --git a/src/compute/take/list.rs b/src/compute/take/list.rs index 5e8b1d10e7c..6abc1d10155 100644 --- a/src/compute/take/list.rs +++ b/src/compute/take/list.rs @@ -17,8 +17,9 @@ use crate::array::{ growable::{Growable, GrowableList}, - ListArray, Offset, PrimitiveArray, + ListArray, PrimitiveArray, }; +use crate::offset::Offset; use super::Index; diff --git a/src/compute/take/utf8.rs b/src/compute/take/utf8.rs index 99fc091f491..490e76bf4b8 100644 --- a/src/compute/take/utf8.rs +++ b/src/compute/take/utf8.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{Array, Offset, PrimitiveArray, Utf8Array}; +use crate::array::{Array, PrimitiveArray, Utf8Array}; +use crate::offset::Offset; use super::generic_binary::*; use super::Index; diff --git a/src/compute/utf8.rs b/src/compute/utf8.rs index 37c1c6ff47e..2e480016ef5 100644 --- a/src/compute/utf8.rs +++ b/src/compute/utf8.rs @@ -1,9 +1,10 @@ //! Defines common maps to a [`Utf8Array`] use crate::{ - array::{Array, Offset, Utf8Array}, + array::{Array, Utf8Array}, datatypes::DataType, error::{Error, Result}, + offset::Offset, }; /// utf8_apply will apply `Fn(&str) -> String` to every value in Utf8Array. 
diff --git a/src/ffi/mmap.rs b/src/ffi/mmap.rs index 4bd006393f7..32be7d763eb 100644 --- a/src/ffi/mmap.rs +++ b/src/ffi/mmap.rs @@ -1,9 +1,10 @@ use std::collections::VecDeque; use std::sync::Arc; -use crate::array::{Array, DictionaryKey, FixedSizeListArray, ListArray, Offset, StructArray}; +use crate::array::{Array, DictionaryKey, FixedSizeListArray, ListArray, StructArray}; use crate::datatypes::DataType; use crate::error::Error; +use crate::offset::Offset; use crate::io::ipc::read::{Dictionaries, OutOfSpecKind}; use crate::io::ipc::read::{IpcBuffer, Node}; diff --git a/src/io/avro/read/nested.rs b/src/io/avro/read/nested.rs index 649a702ff9c..04d9bcf43b6 100644 --- a/src/io/avro/read/nested.rs +++ b/src/io/avro/read/nested.rs @@ -2,6 +2,7 @@ use crate::array::*; use crate::bitmap::*; use crate::datatypes::*; use crate::error::*; +use crate::offset::Offset; /// Auxiliary struct #[derive(Debug)] diff --git a/src/io/avro/write/serialize.rs b/src/io/avro/write/serialize.rs index cb94b78d28b..5310400ffb8 100644 --- a/src/io/avro/write/serialize.rs +++ b/src/io/avro/write/serialize.rs @@ -3,6 +3,7 @@ use avro_schema::write::encode; use crate::bitmap::utils::ZipValidity; use crate::datatypes::{IntervalUnit, PhysicalType, PrimitiveType}; +use crate::offset::Offset; use crate::types::months_days_ns; use crate::{array::*, datatypes::DataType}; diff --git a/src/io/csv/read_utils.rs b/src/io/csv/read_utils.rs index 27bdcff163d..278dd32cb21 100644 --- a/src/io/csv/read_utils.rs +++ b/src/io/csv/read_utils.rs @@ -1,23 +1,24 @@ use chrono::Datelike; -// Ideally this trait should not be needed and both `csv` and `csv_async` crates would share -// the same `ByteRecord` struct. Unfortunately, they do not and thus we must use generics -// over this trait and materialize the generics for each struct. 
-pub(crate) trait ByteRecordGeneric { - fn get(&self, index: usize) -> Option<&[u8]>; -} - use crate::{ array::*, chunk::Chunk, datatypes::*, error::{Error, Result}, + offset::Offset, temporal_conversions, types::NativeType, }; use super::utils::RFC3339; +// Ideally this trait should not be needed and both `csv` and `csv_async` crates would share +// the same `ByteRecord` struct. Unfortunately, they do not and thus we must use generics +// over this trait and materialize the generics for each struct. +pub(crate) trait ByteRecordGeneric { + fn get(&self, index: usize) -> Option<&[u8]>; +} + #[inline] fn to_utf8(bytes: &[u8]) -> Option<&str> { simdutf8::basic::from_utf8(bytes).ok() diff --git a/src/io/csv/write/serialize.rs b/src/io/csv/write/serialize.rs index 6f704d1cc08..46addc378f0 100644 --- a/src/io/csv/write/serialize.rs +++ b/src/io/csv/write/serialize.rs @@ -5,13 +5,15 @@ use crate::temporal_conversions; use crate::types::NativeType; use crate::util::lexical_to_bytes_mut; use crate::{ - array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}, + array::{ + Array, BinaryArray, BooleanArray, DictionaryArray, DictionaryKey, PrimitiveArray, Utf8Array, + }, datatypes::{DataType, TimeUnit}, error::Result, + offset::Offset, }; use super::super::super::iterator::{BufStreamingIterator, StreamingIterator}; -use crate::array::{DictionaryArray, DictionaryKey, Offset}; use csv_core::WriteResult; use std::any::Any; use std::fmt::{Debug, Write}; diff --git a/src/io/ipc/read/array/binary.rs b/src/io/ipc/read/array/binary.rs index 3ee6c4ba288..eea120a5b2f 100644 --- a/src/io/ipc/read/array/binary.rs +++ b/src/io/ipc/read/array/binary.rs @@ -1,10 +1,11 @@ use std::collections::VecDeque; use std::io::{Read, Seek}; -use crate::array::{BinaryArray, Offset}; +use crate::array::BinaryArray; use crate::buffer::Buffer; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::offset::Offset; use super::super::read_basic::*; use 
super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; diff --git a/src/io/ipc/read/array/list.rs b/src/io/ipc/read/array/list.rs index 7870b85a7b1..8824ed86fe1 100644 --- a/src/io/ipc/read/array/list.rs +++ b/src/io/ipc/read/array/list.rs @@ -2,10 +2,11 @@ use std::collections::VecDeque; use std::convert::TryInto; use std::io::{Read, Seek}; -use crate::array::{ListArray, Offset}; +use crate::array::ListArray; use crate::buffer::Buffer; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::offset::Offset; use super::super::super::IpcField; use super::super::deserialize::{read, skip}; diff --git a/src/io/ipc/read/array/utf8.rs b/src/io/ipc/read/array/utf8.rs index f57e51ecc5a..1ff056d6f8c 100644 --- a/src/io/ipc/read/array/utf8.rs +++ b/src/io/ipc/read/array/utf8.rs @@ -1,10 +1,11 @@ use std::collections::VecDeque; use std::io::{Read, Seek}; -use crate::array::{Offset, Utf8Array}; +use crate::array::Utf8Array; use crate::buffer::Buffer; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::offset::Offset; use super::super::read_basic::*; use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; diff --git a/src/io/ipc/write/serialize.rs b/src/io/ipc/write/serialize.rs index 28647bd9111..7737cbca9cd 100644 --- a/src/io/ipc/write/serialize.rs +++ b/src/io/ipc/write/serialize.rs @@ -2,7 +2,8 @@ use arrow_format::ipc; use crate::{ - array::*, bitmap::Bitmap, datatypes::PhysicalType, trusted_len::TrustedLen, types::NativeType, + array::*, bitmap::Bitmap, datatypes::PhysicalType, offset::Offset, trusted_len::TrustedLen, + types::NativeType, }; use super::super::compression; diff --git a/src/io/json/read/deserialize.rs b/src/io/json/read/deserialize.rs index ea5d25a0a5d..1d3997b1da1 100644 --- a/src/io/json/read/deserialize.rs +++ b/src/io/json/read/deserialize.rs @@ -12,6 +12,7 @@ use crate::{ chunk::Chunk, datatypes::{DataType, Field, IntervalUnit, Schema}, error::Error, + offset::Offset, types::{f16, 
NativeType}, }; diff --git a/src/io/json/write/serialize.rs b/src/io/json/write/serialize.rs index aa323109e9b..9bf08fc8968 100644 --- a/src/io/json/write/serialize.rs +++ b/src/io/json/write/serialize.rs @@ -6,6 +6,7 @@ use streaming_iterator::StreamingIterator; use crate::bitmap::utils::ZipValidity; use crate::datatypes::TimeUnit; use crate::io::iterator::BufStreamingIterator; +use crate::offset::Offset; use crate::temporal_conversions::{ date32_to_date, date64_to_date, timestamp_ms_to_datetime, timestamp_ns_to_datetime, timestamp_s_to_datetime, timestamp_us_to_datetime, diff --git a/src/io/json_integration/read/array.rs b/src/io/json_integration/read/array.rs index 90cd709cbf9..42fe220b96f 100644 --- a/src/io/json_integration/read/array.rs +++ b/src/io/json_integration/read/array.rs @@ -10,6 +10,7 @@ use crate::{ datatypes::{DataType, PhysicalType, PrimitiveType, Schema}, error::{Error, Result}, io::ipc::IpcField, + offset::Offset, types::{days_ms, i256, months_days_ns, NativeType}, }; diff --git a/src/io/odbc/write/serialize.rs b/src/io/odbc/write/serialize.rs index 01767e6d52f..7f2fc18aa7c 100644 --- a/src/io/odbc/write/serialize.rs +++ b/src/io/odbc/write/serialize.rs @@ -4,6 +4,7 @@ use crate::array::*; use crate::bitmap::Bitmap; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::offset::Offset; use crate::types::NativeType; use super::super::api; diff --git a/src/io/orc/read/mod.rs b/src/io/orc/read/mod.rs index ca6ce7354c3..4a365078236 100644 --- a/src/io/orc/read/mod.rs +++ b/src/io/orc/read/mod.rs @@ -1,12 +1,11 @@ //! APIs to read from [ORC format](https://orc.apache.org). 
use std::io::Read; -use crate::array::{ - Array, BinaryArray, BooleanArray, Int64Array, Offset, PrimitiveArray, Utf8Array, -}; +use crate::array::{Array, BinaryArray, BooleanArray, Int64Array, PrimitiveArray, Utf8Array}; use crate::bitmap::{Bitmap, MutableBitmap}; use crate::datatypes::{DataType, Field, Schema}; use crate::error::Error; +use crate::offset::Offset; use crate::types::NativeType; use orc_format::proto::stream::Kind; diff --git a/src/io/parquet/read/deserialize/binary/basic.rs b/src/io/parquet/read/deserialize/binary/basic.rs index 8790b43ea45..0a2aa098c45 100644 --- a/src/io/parquet/read/deserialize/binary/basic.rs +++ b/src/io/parquet/read/deserialize/binary/basic.rs @@ -9,11 +9,12 @@ use parquet2::{ }; use crate::{ - array::{Array, BinaryArray, Offset, Utf8Array}, + array::{Array, BinaryArray, Utf8Array}, bitmap::{Bitmap, MutableBitmap}, buffer::Buffer, datatypes::DataType, error::{Error, Result}, + offset::Offset, }; use super::super::utils::{ diff --git a/src/io/parquet/read/deserialize/binary/dictionary.rs b/src/io/parquet/read/deserialize/binary/dictionary.rs index 3000a7ca7d2..5cf3c07d97b 100644 --- a/src/io/parquet/read/deserialize/binary/dictionary.rs +++ b/src/io/parquet/read/deserialize/binary/dictionary.rs @@ -3,11 +3,12 @@ use std::collections::VecDeque; use parquet2::page::DictPage; use crate::{ - array::{Array, BinaryArray, DictionaryArray, DictionaryKey, Offset, Utf8Array}, + array::{Array, BinaryArray, DictionaryArray, DictionaryKey, Utf8Array}, bitmap::MutableBitmap, datatypes::{DataType, PhysicalType}, error::Result, io::parquet::read::deserialize::nested_utils::{InitNested, NestedState}, + offset::Offset, }; use super::super::Pages; diff --git a/src/io/parquet/read/deserialize/binary/nested.rs b/src/io/parquet/read/deserialize/binary/nested.rs index 8c0f5ef419e..2d345140db7 100644 --- a/src/io/parquet/read/deserialize/binary/nested.rs +++ b/src/io/parquet/read/deserialize/binary/nested.rs @@ -7,8 +7,8 @@ use parquet2::{ }; use 
crate::{ - array::Offset, bitmap::MutableBitmap, datatypes::DataType, error::Result, - io::parquet::read::Pages, + bitmap::MutableBitmap, datatypes::DataType, error::Result, io::parquet::read::Pages, + offset::Offset, }; use super::super::utils::MaybeNext; diff --git a/src/io/parquet/read/deserialize/binary/utils.rs b/src/io/parquet/read/deserialize/binary/utils.rs index a47cb967a55..d886c1bfae6 100644 --- a/src/io/parquet/read/deserialize/binary/utils.rs +++ b/src/io/parquet/read/deserialize/binary/utils.rs @@ -1,4 +1,4 @@ -use crate::array::Offset; +use crate::offset::Offset; use super::super::utils::Pushable; diff --git a/src/io/parquet/read/statistics/binary.rs b/src/io/parquet/read/statistics/binary.rs index 1786477e6bb..aeb43a6b3e0 100644 --- a/src/io/parquet/read/statistics/binary.rs +++ b/src/io/parquet/read/statistics/binary.rs @@ -1,7 +1,8 @@ -use crate::array::{MutableArray, MutableBinaryArray, Offset}; use parquet2::statistics::{BinaryStatistics, Statistics as ParquetStatistics}; +use crate::array::{MutableArray, MutableBinaryArray}; use crate::error::Result; +use crate::offset::Offset; pub(super) fn push( from: Option<&dyn ParquetStatistics>, diff --git a/src/io/parquet/read/statistics/utf8.rs b/src/io/parquet/read/statistics/utf8.rs index 7a447e2334f..da9fcb6e111 100644 --- a/src/io/parquet/read/statistics/utf8.rs +++ b/src/io/parquet/read/statistics/utf8.rs @@ -1,7 +1,8 @@ -use crate::array::{MutableArray, MutableUtf8Array, Offset}; use parquet2::statistics::{BinaryStatistics, Statistics as ParquetStatistics}; +use crate::array::{MutableArray, MutableUtf8Array}; use crate::error::Result; +use crate::offset::Offset; pub(super) fn push( from: Option<&dyn ParquetStatistics>, diff --git a/src/io/parquet/write/binary/basic.rs b/src/io/parquet/write/binary/basic.rs index 58156c901e6..f2a5071c14d 100644 --- a/src/io/parquet/write/binary/basic.rs +++ b/src/io/parquet/write/binary/basic.rs @@ -8,10 +8,11 @@ use parquet2::{ use super::super::utils; use 
super::super::WriteOptions; use crate::{ - array::{Array, BinaryArray, Offset}, + array::{Array, BinaryArray}, bitmap::Bitmap, error::{Error, Result}, io::parquet::read::schema::is_nullable, + offset::Offset, }; pub(crate) fn encode_plain( diff --git a/src/io/parquet/write/binary/nested.rs b/src/io/parquet/write/binary/nested.rs index 55f5d2ef247..950ea4190ca 100644 --- a/src/io/parquet/write/binary/nested.rs +++ b/src/io/parquet/write/binary/nested.rs @@ -6,8 +6,9 @@ use super::basic::{build_statistics, encode_plain}; use crate::io::parquet::read::schema::is_nullable; use crate::io::parquet::write::Nested; use crate::{ - array::{Array, BinaryArray, Offset}, + array::{Array, BinaryArray}, error::Result, + offset::Offset, }; pub fn array_to_page( diff --git a/src/io/parquet/write/nested/def.rs b/src/io/parquet/write/nested/def.rs index ea62c1c14dc..eb945a4c683 100644 --- a/src/io/parquet/write/nested/def.rs +++ b/src/io/parquet/write/nested/def.rs @@ -1,4 +1,4 @@ -use crate::{array::Offset, bitmap::Bitmap}; +use crate::{bitmap::Bitmap, offset::Offset}; use super::super::pages::{ListNested, Nested}; use super::rep::num_values; diff --git a/src/io/parquet/write/nested/mod.rs b/src/io/parquet/write/nested/mod.rs index d9f01e3f487..5f6cd4d3524 100644 --- a/src/io/parquet/write/nested/mod.rs +++ b/src/io/parquet/write/nested/mod.rs @@ -3,7 +3,7 @@ mod rep; use parquet2::{encoding::hybrid_rle::encode_u32, read::levels::get_bit_width, write::Version}; -use crate::{array::Offset, error::Result}; +use crate::{error::Result, offset::Offset}; use super::Nested; diff --git a/src/io/parquet/write/pages.rs b/src/io/parquet/write/pages.rs index caaa92866be..2259647d79c 100644 --- a/src/io/parquet/write/pages.rs +++ b/src/io/parquet/write/pages.rs @@ -1,10 +1,11 @@ use parquet2::schema::types::{ParquetType, PrimitiveType as ParquetPrimitiveType}; use parquet2::{page::Page, write::DynIter}; -use crate::array::{ListArray, Offset, StructArray}; +use crate::array::{ListArray, 
StructArray}; use crate::bitmap::Bitmap; use crate::datatypes::PhysicalType; use crate::io::parquet::read::schema::is_nullable; +use crate::offset::Offset; use crate::{ array::Array, error::{Error, Result}, diff --git a/src/io/parquet/write/utf8/basic.rs b/src/io/parquet/write/utf8/basic.rs index c1e7b9321bc..1f1aeaab8fd 100644 --- a/src/io/parquet/write/utf8/basic.rs +++ b/src/io/parquet/write/utf8/basic.rs @@ -9,9 +9,10 @@ use super::super::binary::{encode_delta, ord_binary}; use super::super::utils; use super::super::WriteOptions; use crate::{ - array::{Array, Offset, Utf8Array}, + array::{Array, Utf8Array}, error::{Error, Result}, io::parquet::read::schema::is_nullable, + offset::Offset, }; pub(crate) fn encode_plain( diff --git a/src/io/parquet/write/utf8/nested.rs b/src/io/parquet/write/utf8/nested.rs index 42babd46cd7..2792ef35712 100644 --- a/src/io/parquet/write/utf8/nested.rs +++ b/src/io/parquet/write/utf8/nested.rs @@ -6,8 +6,9 @@ use super::basic::{build_statistics, encode_plain}; use crate::io::parquet::read::schema::is_nullable; use crate::io::parquet::write::Nested; use crate::{ - array::{Array, Offset, Utf8Array}, + array::{Array, Utf8Array}, error::Result, + offset::Offset, }; pub fn array_to_page( diff --git a/src/lib.rs b/src/lib.rs index 64771b954cf..bef2e6e53c1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,6 +24,7 @@ pub mod error; #[cfg_attr(docsrs, doc(cfg(feature = "io_ipc")))] pub mod mmap; +pub mod offset; pub mod scalar; pub mod trusted_len; pub mod types; diff --git a/src/offset.rs b/src/offset.rs new file mode 100644 index 00000000000..edca7dc8b38 --- /dev/null +++ b/src/offset.rs @@ -0,0 +1,2 @@ +//! 
Contains the declaration of [`Offset`] +pub use crate::types::Offset; diff --git a/src/scalar/binary.rs b/src/scalar/binary.rs index 70a9be40374..cfbdbd04b0a 100644 --- a/src/scalar/binary.rs +++ b/src/scalar/binary.rs @@ -1,4 +1,4 @@ -use crate::{array::*, datatypes::DataType}; +use crate::{datatypes::DataType, offset::Offset}; use super::Scalar; diff --git a/src/scalar/list.rs b/src/scalar/list.rs index 438e13f7f55..98ee497d949 100644 --- a/src/scalar/list.rs +++ b/src/scalar/list.rs @@ -1,6 +1,6 @@ use std::any::Any; -use crate::{array::*, datatypes::DataType}; +use crate::{array::*, datatypes::DataType, offset::Offset}; use super::Scalar; diff --git a/src/scalar/utf8.rs b/src/scalar/utf8.rs index f62581eccbc..73ea98b729e 100644 --- a/src/scalar/utf8.rs +++ b/src/scalar/utf8.rs @@ -1,4 +1,4 @@ -use crate::{array::*, datatypes::DataType}; +use crate::{datatypes::DataType, offset::Offset}; use super::Scalar; diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index af643eaf7fd..8eae7213fc8 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -7,8 +7,9 @@ use chrono::{ use crate::error::Result; use crate::{ - array::{Offset, PrimitiveArray, Utf8Array}, + array::{PrimitiveArray, Utf8Array}, error::Error, + offset::Offset, }; use crate::{ datatypes::{DataType, TimeUnit}, diff --git a/src/util/bench_util.rs b/src/util/bench_util.rs index 3ba63ce37be..6ac87ed3f6a 100644 --- a/src/util/bench_util.rs +++ b/src/util/bench_util.rs @@ -3,7 +3,7 @@ use rand::distributions::{Alphanumeric, Distribution, Standard}; use rand::{rngs::StdRng, Rng, SeedableRng}; -use crate::{array::*, types::NativeType}; +use crate::{array::*, offset::Offset, types::NativeType}; /// Returns fixed seedable RNG pub fn seedable_rng() -> StdRng { diff --git a/tests/it/array/equal/utf8.rs b/tests/it/array/equal/utf8.rs index 2be9ebef83c..28e13d4a2ec 100644 --- a/tests/it/array/equal/utf8.rs +++ b/tests/it/array/equal/utf8.rs @@ -1,4 +1,5 @@ use 
arrow2::array::*; +use arrow2::offset::Offset; use super::{binary_cases, test_equal}; diff --git a/tests/it/compute/length.rs b/tests/it/compute/length.rs index 9bb37576956..0a6b5e51e8e 100644 --- a/tests/it/compute/length.rs +++ b/tests/it/compute/length.rs @@ -1,6 +1,7 @@ use arrow2::array::*; use arrow2::compute::length::*; use arrow2::datatypes::*; +use arrow2::offset::Offset; fn length_test_string() { vec![ diff --git a/tests/it/compute/regex_match.rs b/tests/it/compute/regex_match.rs index 141a87ad3e6..66f28d03b9b 100644 --- a/tests/it/compute/regex_match.rs +++ b/tests/it/compute/regex_match.rs @@ -1,6 +1,7 @@ -use arrow2::array::{BooleanArray, Offset, Utf8Array}; +use arrow2::array::{BooleanArray, Utf8Array}; use arrow2::compute::regex_match::*; use arrow2::error::Result; +use arrow2::offset::Offset; fn test_generic, &Utf8Array) -> Result>( lhs: Vec<&str>, diff --git a/tests/it/compute/substring.rs b/tests/it/compute/substring.rs index 365615cd51f..5b76a0ac348 100644 --- a/tests/it/compute/substring.rs +++ b/tests/it/compute/substring.rs @@ -1,4 +1,4 @@ -use arrow2::{array::*, compute::substring::*, error::Result}; +use arrow2::{array::*, compute::substring::*, error::Result, offset::Offset}; fn with_nulls_utf8() -> Result<()> { let cases = vec![ diff --git a/tests/it/compute/utf8.rs b/tests/it/compute/utf8.rs index 864dc0eca27..e9d8613ddeb 100644 --- a/tests/it/compute/utf8.rs +++ b/tests/it/compute/utf8.rs @@ -1,4 +1,4 @@ -use arrow2::{array::*, compute::utf8::*, error::Result}; +use arrow2::{array::*, compute::utf8::*, error::Result, offset::Offset}; fn with_nulls_utf8_lower() -> Result<()> { let cases = vec![ diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index 9097d32fb30..06afa74fc95 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -10,6 +10,7 @@ use arrow2::{ io::parquet::read as p_read, io::parquet::read::statistics::*, io::parquet::write::*, + offset::Offset, types::{days_ms, NativeType}, }; From 
832a02fbcfa3ef934599735f52a8fba35b1770e7 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sun, 4 Dec 2022 07:48:28 +0000 Subject: [PATCH 2/5] Added Offsets and OffsetsBuffer --- benches/iter_list.rs | 5 +- src/array/binary/ffi.rs | 15 +- src/array/binary/mod.rs | 231 +++------ src/array/binary/mutable.rs | 69 +-- src/array/binary/mutable_values.rs | 119 ++--- src/array/dictionary/mutable.rs | 15 +- src/array/growable/binary.rs | 28 +- src/array/growable/list.rs | 77 +-- src/array/growable/utf8.rs | 31 +- src/array/growable/utils.rs | 10 - src/array/list/ffi.rs | 13 +- src/array/list/mod.rs | 118 +---- src/array/list/mutable.rs | 156 ++---- src/array/map/ffi.rs | 11 +- src/array/map/mod.rs | 36 +- src/array/physical_binary.rs | 140 +----- src/array/specification.rs | 17 - src/array/utf8/ffi.rs | 15 +- src/array/utf8/mod.rs | 72 ++- src/array/utf8/mutable.rs | 19 +- src/array/utf8/mutable_values.rs | 45 +- src/compute/aggregate/memory.rs | 2 +- src/compute/cast/binary_to.rs | 22 +- src/compute/cast/mod.rs | 31 +- src/compute/cast/primitive_to.rs | 12 +- src/compute/cast/utf8_to.rs | 27 +- src/compute/length.rs | 1 + src/compute/substring.rs | 14 +- src/compute/take/generic_binary.rs | 56 ++- src/io/avro/read/nested.rs | 37 +- src/io/avro/write/serialize.rs | 2 + src/io/ipc/read/array/binary.rs | 2 +- src/io/ipc/read/array/list.rs | 2 +- src/io/ipc/read/array/map.rs | 2 +- src/io/ipc/read/array/utf8.rs | 2 +- src/io/ipc/write/serialize.rs | 15 +- src/io/json/read/deserialize.rs | 80 ++-- src/io/json/write/serialize.rs | 2 +- src/io/json_integration/read/array.rs | 15 +- src/io/odbc/read/deserialize.rs | 12 +- src/io/odbc/write/serialize.rs | 2 + src/io/orc/read/mod.rs | 37 +- .../parquet/read/deserialize/binary/basic.rs | 20 +- .../read/deserialize/binary/dictionary.rs | 5 +- .../parquet/read/deserialize/binary/utils.rs | 53 +-- src/io/parquet/read/deserialize/mod.rs | 8 +- src/io/parquet/read/deserialize/nested.rs | 2 +- src/io/parquet/read/statistics/list.rs | 
11 +- src/io/parquet/read/statistics/map.rs | 2 +- src/io/parquet/write/binary/basic.rs | 2 +- src/io/parquet/write/pages.rs | 6 +- src/io/parquet/write/utf8/basic.rs | 2 +- src/offset.rs | 445 ++++++++++++++++++ src/types/index.rs | 8 + tests/it/array/binary/mod.rs | 19 +- tests/it/array/binary/mutable.rs | 16 +- tests/it/array/binary/mutable_values.rs | 29 +- tests/it/array/binary/to_mutable.rs | 8 +- tests/it/array/equal/list.rs | 5 +- tests/it/array/list/mod.rs | 10 +- tests/it/array/list/mutable.rs | 6 +- tests/it/array/map/mod.rs | 7 +- tests/it/array/utf8/mod.rs | 76 ++- tests/it/array/utf8/mutable.rs | 15 +- tests/it/array/utf8/mutable_values.rs | 31 +- tests/it/array/utf8/to_mutable.rs | 12 +- tests/it/compute/take.rs | 14 +- tests/it/ffi/data.rs | 6 +- tests/it/io/avro/write.rs | 4 +- tests/it/io/json/write.rs | 2 +- tests/it/io/ndjson/mod.rs | 3 +- tests/it/io/parquet/mod.rs | 17 +- 72 files changed, 1160 insertions(+), 1301 deletions(-) diff --git a/benches/iter_list.rs b/benches/iter_list.rs index ba576cb67b0..f77c9536e6d 100644 --- a/benches/iter_list.rs +++ b/benches/iter_list.rs @@ -16,8 +16,7 @@ fn add_benchmark(c: &mut Criterion) { let values = Buffer::from_iter(0..size as i32); let values = PrimitiveArray::::from_data(DataType::Int32, values, None); - let mut offsets = (0..size as i32).step_by(2).collect::>(); - offsets.push(size as i32); + let offsets = (0..=size as i32).step_by(2).collect::>(); let validity = (0..(offsets.len() - 1)) .map(|i| i % 4 == 0) @@ -26,7 +25,7 @@ fn add_benchmark(c: &mut Criterion) { let data_type = ListArray::::default_datatype(DataType::Int32); let array = ListArray::::from_data( data_type, - offsets.into(), + offsets.try_into().unwrap(), Box::new(values), Some(validity), ); diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs index f592773f56e..6f971c4226f 100644 --- a/src/array/binary/ffi.rs +++ b/src/array/binary/ffi.rs @@ -2,7 +2,7 @@ use crate::{ array::{FromFfi, ToFfi}, bitmap::align, ffi, - 
offset::Offset, + offset::{Offset, OffsetsBuffer}, }; use crate::error::Result; @@ -13,13 +13,13 @@ unsafe impl ToFfi for BinaryArray { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.as_ptr().cast::()), + Some(self.offsets.buffer().as_ptr().cast::()), Some(self.values.as_ptr().cast::()), ] } fn offset(&self) -> Option { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); if let Some(bitmap) = self.validity.as_ref() { if bitmap.offset() == offset { Some(offset) @@ -32,7 +32,7 @@ unsafe impl ToFfi for BinaryArray { } fn to_ffi_aligned(&self) -> Self { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); let validity = self.validity.as_ref().map(|bitmap| { if bitmap.offset() == offset { @@ -59,8 +59,9 @@ impl FromFfi for BinaryArray { let offsets = unsafe { array.buffer::(1) }?; let values = unsafe { array.buffer::(2) }?; - Ok(Self::from_data_unchecked( - data_type, offsets, values, validity, - )) + // assumption that data from FFI is well constructed + let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) }; + + Ok(Self::new(data_type, offsets, values, validity)) } } diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index 612d988215e..b7f72cdbb20 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -6,16 +6,13 @@ use crate::{ buffer::Buffer, datatypes::DataType, error::Error, - offset::Offset, + offset::{Offset, OffsetsBuffer}, trusted_len::TrustedLen, }; use either::Either; -use super::{ - specification::{try_check_offsets, try_check_offsets_bounds}, - Array, GenericBinaryArray, -}; +use super::{specification::try_check_offsets_bounds, Array, GenericBinaryArray}; mod ffi; pub(super) mod fmt; @@ -60,7 +57,7 @@ pub use mutable::*; #[derive(Clone)] pub struct BinaryArray { data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, } @@ -70,23 +67,22 @@ impl BinaryArray { /// /// 
# Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. /// # Implementation - /// This function is `O(N)` - checking monotinicity is `O(N)` + /// This function is `O(1)` pub fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result { - try_check_offsets(&offsets, values.len())?; + try_check_offsets_bounds(offsets.buffer(), values.len())?; if validity .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) + .map_or(false, |validity| validity.len() != offsets.len()) { return Err(Error::oos( "validity mask length must match the number of values", @@ -131,7 +127,7 @@ impl BinaryArray { /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// Returns the element at index `i` @@ -149,8 +145,8 @@ impl BinaryArray { #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { // soundness: the invariant of the function - let start = self.offsets.get_unchecked(i).to_usize(); - let end = self.offsets.get_unchecked(i + 1).to_usize(); + let start = self.offsets.buffer().get_unchecked(i).to_usize(); + let end = self.offsets.buffer().get_unchecked(i + 1).to_usize(); // soundness: the invariant of the struct self.values.get_unchecked(start..end) @@ -170,7 +166,7 @@ impl BinaryArray { /// Returns the offsets of this [`BinaryArray`]. 
#[inline] - pub fn offsets(&self) -> &Buffer { + pub fn offsets(&self) -> &OffsetsBuffer { &self.offsets } @@ -250,90 +246,78 @@ impl BinaryArray { if let Some(bitmap) = self.validity { match bitmap.into_mut() { // Safety: invariants are preserved - Left(bitmap) => Left(unsafe { - BinaryArray::new_unchecked( - self.data_type, - self.offsets, - self.values, - Some(bitmap), - ) - }), + Left(bitmap) => Left(BinaryArray::new( + self.data_type, + self.offsets, + self.values, + Some(bitmap), + )), Right(mutable_bitmap) => match ( self.values.get_mut().map(std::mem::take), - self.offsets.get_mut().map(std::mem::take), + self.offsets.get_mut(), ) { - (None, None) => { - // Safety: invariants are preserved - Left(unsafe { - BinaryArray::new_unchecked( - self.data_type, - self.offsets, - self.values, - Some(mutable_bitmap.into()), - ) - }) - } - (None, Some(offsets)) => { - // Safety: invariants are preserved - Left(unsafe { - BinaryArray::new_unchecked( - self.data_type, - offsets.into(), - self.values, - Some(mutable_bitmap.into()), - ) - }) - } - (Some(mutable_values), None) => { - // Safety: invariants are preserved - Left(unsafe { - BinaryArray::new_unchecked( - self.data_type, - self.offsets, - mutable_values.into(), - Some(mutable_bitmap.into()), - ) - }) - } - (Some(values), Some(offsets)) => Right(unsafe { - MutableBinaryArray::from_data( - self.data_type, - offsets, - values, - Some(mutable_bitmap), - ) - }), + (None, None) => Left(BinaryArray::new( + self.data_type, + self.offsets, + self.values, + Some(mutable_bitmap.into()), + )), + (None, Some(offsets)) => Left(BinaryArray::new( + self.data_type, + offsets.into(), + self.values, + Some(mutable_bitmap.into()), + )), + (Some(mutable_values), None) => Left(BinaryArray::new( + self.data_type, + self.offsets, + mutable_values.into(), + Some(mutable_bitmap.into()), + )), + (Some(values), Some(offsets)) => Right(MutableBinaryArray::from_data( + self.data_type, + offsets, + values, + Some(mutable_bitmap), + )), }, } } 
else { match ( self.values.get_mut().map(std::mem::take), - self.offsets.get_mut().map(std::mem::take), + self.offsets.get_mut(), ) { - (None, None) => Left(unsafe { - BinaryArray::new_unchecked(self.data_type, self.offsets, self.values, None) - }), - (None, Some(offsets)) => Left(unsafe { - BinaryArray::new_unchecked(self.data_type, offsets.into(), self.values, None) - }), - (Some(values), None) => Left(unsafe { - BinaryArray::new_unchecked(self.data_type, self.offsets, values.into(), None) - }), - (Some(values), Some(offsets)) => Right(unsafe { - MutableBinaryArray::from_data(self.data_type, offsets, values, None) - }), + (None, None) => Left(BinaryArray::new( + self.data_type, + self.offsets, + self.values, + None, + )), + (None, Some(offsets)) => Left(BinaryArray::new( + self.data_type, + offsets.into(), + self.values, + None, + )), + (Some(values), None) => Left(BinaryArray::new( + self.data_type, + self.offsets, + values.into(), + None, + )), + (Some(values), Some(offsets)) => Right(MutableBinaryArray::from_data( + self.data_type, + offsets, + values, + None, + )), } } } /// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero. pub fn new_empty(data_type: DataType) -> Self { - Self::new( - data_type, - Buffer::from(vec![O::zero()]), - Buffer::new(), - None, - ) + Self::new(data_type, OffsetsBuffer::new(), Buffer::new(), None) } /// Creates an null [`BinaryArray`], i.e. whose `.null_count() == .len()`. @@ -341,7 +325,7 @@ impl BinaryArray { pub fn new_null(data_type: DataType, length: usize) -> Self { Self::new( data_type, - vec![O::default(); 1 + length].into(), + vec![O::default(); 1 + length].try_into().unwrap(), Buffer::new(), Some(Bitmap::new_zeroed(length)), ) @@ -356,72 +340,16 @@ impl BinaryArray { } } - /// Creates a new [`BinaryArray`] without checking for offsets monotinicity. - /// - /// # Errors - /// This function returns an error iff: - /// * The last offset is not equal to the values' length. 
- /// * the validity's length is not equal to `offsets.len() - 1`. - /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - /// # Implementation - /// This function is `O(1)` - pub unsafe fn try_new_unchecked( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Result { - try_check_offsets_bounds(&offsets, values.len())?; - - if validity - .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) - { - return Err(Error::oos( - "validity mask length must match the number of values", - )); - } - - if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { - return Err(Error::oos( - "BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary", - )); - } - - Ok(Self { - data_type, - offsets, - values, - validity, - }) - } - /// Alias for unwrapping [`Self::try_new`] pub fn new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { Self::try_new(data_type, offsets, values, validity).unwrap() } - /// Alias for unwrapping [`Self::try_new_unchecked`] - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - pub unsafe fn new_unchecked( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Self { - Self::try_new_unchecked(data_type, offsets, values, validity).unwrap() - } - /// Returns a [`BinaryArray`] from an iterator of trusted length. 
/// /// The [`BinaryArray`] is guaranteed to not have a validity @@ -487,23 +415,10 @@ impl BinaryArray { unsafe { Self::try_from_trusted_len_iter_unchecked(iter) } } - /// Alias for [`Self::new_unchecked`] - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - pub unsafe fn from_data_unchecked( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Self { - Self::new_unchecked(data_type, offsets, values, validity) - } - /// Alias for `new` pub fn from_data( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { @@ -558,6 +473,6 @@ unsafe impl GenericBinaryArray for BinaryArray { #[inline] fn offsets(&self) -> &[O] { - self.offsets() + self.offsets().buffer() } } diff --git a/src/array/binary/mutable.rs b/src/array/binary/mutable.rs index 36f31eee5a5..0f8655e33bf 100644 --- a/src/array/binary/mutable.rs +++ b/src/array/binary/mutable.rs @@ -8,7 +8,7 @@ use crate::{ }, datatypes::DataType, error::{Error, Result}, - offset::Offset, + offset::{Offset, Offsets}, trusted_len::TrustedLen, }; @@ -54,15 +54,14 @@ impl MutableBinaryArray { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. /// # Implementation - /// This function is `O(N)` - checking monotinicity is `O(N)` + /// This function is `O(1)` pub fn try_new( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Result { @@ -80,26 +79,6 @@ impl MutableBinaryArray { Ok(Self { values, validity }) } - /// Create a [`MutableBinaryArray`] out of its inner attributes. 
- /// # Safety - /// The caller must ensure that every value between offsets is a valid utf8. - /// # Panics - /// This function panics iff: - /// * The `offsets` and `values` are inconsistent - /// * The validity is not `None` and its length is different from `offsets`'s length minus one. - pub unsafe fn new_unchecked( - data_type: DataType, - offsets: Vec, - values: Vec, - validity: Option, - ) -> Self { - let values = MutableBinaryValuesArray::new_unchecked(data_type, offsets, values); - if let Some(ref validity) = validity { - assert_eq!(values.len(), validity.len()); - } - Self { values, validity } - } - /// Creates a new [`MutableBinaryArray`] from a slice of optional `&[u8]`. // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it. pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { @@ -185,7 +164,7 @@ impl MutableBinaryArray { /// Equivalent to `Self::try_new(...).unwrap()` pub fn from_data( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Self { @@ -200,7 +179,7 @@ impl MutableBinaryArray { } /// returns its offsets. - pub fn offsets(&self) -> &Vec { + pub fn offsets(&self) -> &Offsets { self.values.offsets() } @@ -225,34 +204,24 @@ impl MutableArray for MutableBinaryArray { } fn as_box(&mut self) -> Box { - // Safety: - // `MutableBinaryArray` has the same invariants as `BinaryArray` and thus - // `BinaryArray` can be safely created from `MutableBinaryArray` without checks. 
let (data_type, offsets, values) = std::mem::take(&mut self.values).into_inner(); - unsafe { - BinaryArray::new_unchecked( - data_type, - offsets.into(), - values.into(), - std::mem::take(&mut self.validity).map(|x| x.into()), - ) - } + BinaryArray::new( + data_type, + offsets.into(), + values.into(), + std::mem::take(&mut self.validity).map(|x| x.into()), + ) .boxed() } fn as_arc(&mut self) -> Arc { - // Safety: - // `MutableBinaryArray` has the same invariants as `BinaryArray` and thus - // `BinaryArray` can be safely created from `MutableBinaryArray` without checks. let (data_type, offsets, values) = std::mem::take(&mut self.values).into_inner(); - unsafe { - BinaryArray::new_unchecked( - data_type, - offsets.into(), - values.into(), - std::mem::take(&mut self.validity).map(|x| x.into()), - ) - } + BinaryArray::new( + data_type, + offsets.into(), + values.into(), + std::mem::take(&mut self.validity).map(|x| x.into()), + ) .arced() } @@ -323,7 +292,7 @@ impl MutableBinaryArray { pub unsafe fn from_trusted_len_values_iter_unchecked, I: Iterator>( iterator: I, ) -> Self { - let (offsets, values) = unsafe { trusted_len_values_iter(iterator) }; + let (offsets, values) = trusted_len_values_iter(iterator); Self::from_data(Self::default_data_type(), offsets, values, None) } diff --git a/src/array/binary/mutable_values.rs b/src/array/binary/mutable_values.rs index 1d608b7403f..633c34d07ef 100644 --- a/src/array/binary/mutable_values.rs +++ b/src/array/binary/mutable_values.rs @@ -2,13 +2,13 @@ use std::{iter::FromIterator, sync::Arc}; use crate::{ array::{ - specification::{check_offsets_minimal, try_check_offsets}, - Array, ArrayAccessor, ArrayValuesIter, MutableArray, TryExtend, TryExtendFromSelf, TryPush, + specification::try_check_offsets_bounds, Array, ArrayAccessor, ArrayValuesIter, + MutableArray, TryExtend, TryExtendFromSelf, TryPush, }, bitmap::MutableBitmap, datatypes::DataType, error::{Error, Result}, - offset::Offset, + offset::{Offset, Offsets}, 
trusted_len::TrustedLen, }; @@ -20,38 +20,24 @@ use crate::array::physical_binary::*; #[derive(Debug, Clone)] pub struct MutableBinaryValuesArray { data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, } impl From> for BinaryArray { fn from(other: MutableBinaryValuesArray) -> Self { - // Safety: - // `MutableBinaryValuesArray` has the same invariants as `BinaryArray` and thus - // `BinaryArray` can be safely created from `MutableBinaryValuesArray` without checks. - unsafe { - BinaryArray::::from_data_unchecked( - other.data_type, - other.offsets.into(), - other.values.into(), - None, - ) - } + BinaryArray::::new( + other.data_type, + other.offsets.into(), + other.values.into(), + None, + ) } } impl From> for MutableBinaryArray { fn from(other: MutableBinaryValuesArray) -> Self { - // Safety: - // `MutableBinaryValuesArray` has the same invariants as `MutableBinaryArray` - unsafe { - MutableBinaryArray::::new_unchecked( - other.data_type, - other.offsets, - other.values, - None, - ) - } + MutableBinaryArray::::from_data(other.data_type, other.offsets, other.values, None) } } @@ -66,7 +52,7 @@ impl MutableBinaryValuesArray { pub fn new() -> Self { Self { data_type: Self::default_data_type(), - offsets: vec![O::default()], + offsets: Offsets::new(), values: Vec::::new(), } } @@ -75,13 +61,13 @@ impl MutableBinaryValuesArray { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. 
/// # Implementation - /// This function is `O(N)` - checking monotinicity is `O(N)` - pub fn try_new(data_type: DataType, offsets: Vec, values: Vec) -> Result { - try_check_offsets(&offsets, values.len())?; + /// This function is `O(1)` + pub fn try_new(data_type: DataType, offsets: Offsets, values: Vec) -> Result { + try_check_offsets_bounds(offsets.as_slice(), values.len())?; + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { return Err(Error::oos( "MutableBinaryValuesArray can only be initialized with DataType::Binary or DataType::LargeBinary", @@ -95,31 +81,6 @@ impl MutableBinaryValuesArray { }) } - /// Returns a [`MutableBinaryValuesArray`] created from its internal representation. - /// - /// # Panic - /// This function does not panic iff: - /// * The last offset is equal to the values' length. - /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is equal to either `Binary` or `LargeBinary`. - /// # Safety - /// This function is safe iff: - /// * the offsets are monotonically increasing - /// # Implementation - /// This function is `O(1)` - pub unsafe fn new_unchecked(data_type: DataType, offsets: Vec, values: Vec) -> Self { - check_offsets_minimal(&offsets, values.len()); - - if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { - panic!("MutableBinaryValuesArray can only be initialized with DataType::Binary or DataType::LargeBinary") - } - - Self { - data_type, - offsets, - values, - } - } - /// Returns the default [`DataType`] of this container: [`DataType::Utf8`] or [`DataType::LargeUtf8`] /// depending on the generic [`Offset`]. pub fn default_data_type() -> DataType { @@ -133,12 +94,9 @@ impl MutableBinaryValuesArray { /// Initializes a new [`MutableBinaryValuesArray`] with a pre-allocated capacity of items and values. 
pub fn with_capacities(capacity: usize, values: usize) -> Self { - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); - Self { data_type: Self::default_data_type(), - offsets, + offsets: Offsets::::with_capacity(capacity), values: Vec::::with_capacity(values), } } @@ -151,26 +109,26 @@ impl MutableBinaryValuesArray { /// returns its offsets. #[inline] - pub fn offsets(&self) -> &Vec { + pub fn offsets(&self) -> &Offsets { &self.offsets } /// Reserves `additional` elements and `additional_values` on the values. #[inline] pub fn reserve(&mut self, additional: usize, additional_values: usize) { - self.offsets.reserve(additional + 1); + self.offsets.reserve(additional); self.values.reserve(additional_values); } /// Returns the capacity in number of items pub fn capacity(&self) -> usize { - self.offsets.capacity() - 1 + self.offsets.capacity() } /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// Pushes a new item to the array. @@ -188,7 +146,7 @@ impl MutableBinaryValuesArray { return None; } self.offsets.pop()?; - let start = self.offsets.last()?.to_usize(); + let start = self.offsets.last().to_usize(); let value = self.values.split_off(start); Some(value.to_vec()) } @@ -208,8 +166,8 @@ impl MutableBinaryValuesArray { #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { // soundness: the invariant of the function - let start = self.offsets.get_unchecked(i).to_usize(); - let end = self.offsets.get_unchecked(i + 1).to_usize(); + let start = self.offsets.as_slice().get_unchecked(i).to_usize(); + let end = self.offsets.as_slice().get_unchecked(i + 1).to_usize(); // soundness: the invariant of the struct self.values.get_unchecked(start..end) @@ -227,7 +185,7 @@ impl MutableBinaryValuesArray { } /// Extract the low-end APIs from the [`MutableBinaryValuesArray`]. 
- pub fn into_inner(self) -> (DataType, Vec, Vec) { + pub fn into_inner(self) -> (DataType, Offsets, Vec) { (self.data_type, self.offsets, self.values) } } @@ -242,21 +200,13 @@ impl MutableArray for MutableBinaryValuesArray { } fn as_box(&mut self) -> Box { - // Safety: - // `MutableBinaryValuesArray` has the same invariants as `BinaryArray` and thus - // `BinaryArray` can be safely created from `MutableBinaryValuesArray` without checks. let (data_type, offsets, values) = std::mem::take(self).into_inner(); - unsafe { BinaryArray::from_data_unchecked(data_type, offsets.into(), values.into(), None) } - .boxed() + BinaryArray::new(data_type, offsets.into(), values.into(), None).boxed() } fn as_arc(&mut self) -> Arc { - // Safety: - // `MutableBinaryValuesArray` has the same invariants as `BinaryArray` and thus - // `BinaryArray` can be safely created from `MutableBinaryValuesArray` without checks. let (data_type, offsets, values) = std::mem::take(self).into_inner(); - unsafe { BinaryArray::from_data_unchecked(data_type, offsets.into(), values.into(), None) } - .arced() + BinaryArray::new(data_type, offsets.into(), values.into(), None).arced() } fn data_type(&self) -> &DataType { @@ -288,8 +238,7 @@ impl MutableArray for MutableBinaryValuesArray { impl> FromIterator

for MutableBinaryValuesArray { fn from_iter>(iter: I) -> Self { let (offsets, values) = values_iter(iter.into_iter()); - // soundness: T: AsRef<[u8]> and offsets are monotonically increasing - unsafe { Self::new_unchecked(Self::default_data_type(), offsets, values) } + Self::try_new(Self::default_data_type(), offsets, values).unwrap() } } @@ -349,9 +298,7 @@ impl MutableBinaryValuesArray { I: Iterator, { let (offsets, values) = trusted_len_values_iter(iterator); - - // soundness: offsets are monotonically increasing - Self::new_unchecked(Self::default_data_type(), offsets, values) + Self::try_new(Self::default_data_type(), offsets, values).unwrap() } /// Returns a new [`MutableBinaryValuesArray`] from an iterator. @@ -388,11 +335,7 @@ impl> TryPush for MutableBinaryValuesArray { fn try_push(&mut self, value: T) -> Result<()> { let bytes = value.as_ref(); self.values.extend_from_slice(bytes); - - let size = O::from_usize(self.values.len()).ok_or(Error::Overflow)?; - - self.offsets.push(size); - Ok(()) + self.offsets.try_push_usize(bytes.len()) } } @@ -413,6 +356,6 @@ unsafe impl<'a, O: Offset> ArrayAccessor<'a> for MutableBinaryValuesArray { impl TryExtendFromSelf for MutableBinaryValuesArray { fn try_extend_from_self(&mut self, other: &Self) -> Result<()> { self.values.extend_from_slice(&other.values); - try_extend_offsets(&mut self.offsets, &other.offsets) + self.offsets.try_extend_from_self(&other.offsets) } } diff --git a/src/array/dictionary/mutable.rs b/src/array/dictionary/mutable.rs index 665d6fc9234..444de34bcc4 100644 --- a/src/array/dictionary/mutable.rs +++ b/src/array/dictionary/mutable.rs @@ -157,15 +157,12 @@ impl MutableDictionaryArray { } fn take_into(&mut self) -> DictionaryArray { - // Safety - the invariant of this struct ensures that this is up-held - unsafe { - DictionaryArray::::try_new( - self.data_type.clone(), - std::mem::take(&mut self.keys).into(), - self.values.as_box(), - ) - .unwrap() - } + DictionaryArray::::try_new( + 
self.data_type.clone(), + std::mem::take(&mut self.keys).into(), + self.values.as_box(), + ) + .unwrap() } } diff --git a/src/array/growable/binary.rs b/src/array/growable/binary.rs index fd91590a25d..aebfb1580cd 100644 --- a/src/array/growable/binary.rs +++ b/src/array/growable/binary.rs @@ -4,11 +4,11 @@ use crate::{ array::{Array, BinaryArray}, bitmap::MutableBitmap, datatypes::DataType, - offset::Offset, + offset::{Offset, Offsets}, }; use super::{ - utils::{build_extend_null_bits, extend_offset_values, extend_offsets, ExtendNullBits}, + utils::{build_extend_null_bits, extend_offset_values, ExtendNullBits}, Growable, }; @@ -18,8 +18,7 @@ pub struct GrowableBinary<'a, O: Offset> { data_type: DataType, validity: MutableBitmap, values: Vec, - offsets: Vec, - length: O, // always equal to the last offset at `offsets`. + offsets: Offsets, extend_null_bits: Vec>, } @@ -41,16 +40,11 @@ impl<'a, O: Offset> GrowableBinary<'a, O> { .map(|array| build_extend_null_bits(*array, use_validity)) .collect(); - let mut offsets = Vec::with_capacity(capacity + 1); - let length = O::default(); - offsets.push(length); - Self { arrays, data_type, values: Vec::with_capacity(0), - offsets, - length, + offsets: Offsets::with_capacity(capacity), validity: MutableBitmap::with_capacity(capacity), extend_null_bits, } @@ -74,18 +68,16 @@ impl<'a, O: Offset> Growable<'a> for GrowableBinary<'a, O> { let offsets = array.offsets(); let values = array.values(); - extend_offsets::( - &mut self.offsets, - &mut self.length, - &offsets[start..start + len + 1], - ); + self.offsets + .try_extend_from_slice(offsets, start, len) + .unwrap(); + // values - extend_offset_values::(&mut self.values, offsets, values, start, len); + extend_offset_values::(&mut self.values, offsets.buffer(), values, start, len); } fn extend_validity(&mut self, additional: usize) { - self.offsets - .resize(self.offsets.len() + additional, self.length); + self.offsets.extend_constant(additional); 
self.validity.extend_constant(additional, false); } diff --git a/src/array/growable/list.rs b/src/array/growable/list.rs index 3fcbe3c4539..bc78a2d8e86 100644 --- a/src/array/growable/list.rs +++ b/src/array/growable/list.rs @@ -3,12 +3,12 @@ use std::sync::Arc; use crate::{ array::{Array, ListArray}, bitmap::MutableBitmap, - offset::Offset, + offset::{Offset, Offsets}, }; use super::{ make_growable, - utils::{build_extend_null_bits, extend_offsets, ExtendNullBits}, + utils::{build_extend_null_bits, ExtendNullBits}, Growable, }; @@ -21,37 +21,15 @@ fn extend_offset_values( let array = growable.arrays[index]; let offsets = array.offsets(); - if array.null_count() == 0 { - // offsets - extend_offsets::( - &mut growable.offsets, - &mut growable.last_offset, - &offsets[start..start + len + 1], - ); - - let end = offsets[start + len].to_usize(); - let start = offsets[start].to_usize(); - let len = end - start; - growable.values.extend(index, start, len) - } else { - growable.offsets.reserve(len); - - let new_offsets = &mut growable.offsets; - let inner_values = &mut growable.values; - let last_offset = &mut growable.last_offset; - (start..start + len).for_each(|i| { - if array.is_valid(i) { - let len = offsets[i + 1] - offsets[i]; - // compute the new offset - *last_offset += len; - - // append value - inner_values.extend(index, offsets[i].to_usize(), len.to_usize()); - } - // append offset - new_offsets.push(*last_offset); - }) - } + growable + .offsets + .try_extend_from_slice(offsets, start, len) + .unwrap(); + + let end = offsets.buffer()[start + len].to_usize(); + let start = offsets.buffer()[start].to_usize(); + let len = end - start; + growable.values.extend(index, start, len); } /// Concrete [`Growable`] for the [`ListArray`]. @@ -59,8 +37,7 @@ pub struct GrowableList<'a, O: Offset> { arrays: Vec<&'a ListArray>, validity: MutableBitmap, values: Box + 'a>, - offsets: Vec, - last_offset: O, // always equal to the last offset at `offsets`. 
+ offsets: Offsets, extend_null_bits: Vec>, } @@ -86,16 +63,11 @@ impl<'a, O: Offset> GrowableList<'a, O> { .collect::>(); let values = make_growable(&inner, use_validity, 0); - let mut offsets = Vec::with_capacity(capacity + 1); - let length = O::default(); - offsets.push(length); - Self { arrays, - offsets, + offsets: Offsets::with_capacity(capacity), values, validity: MutableBitmap::with_capacity(capacity), - last_offset: O::default(), extend_null_bits, } } @@ -105,20 +77,12 @@ impl<'a, O: Offset> GrowableList<'a, O> { let offsets = std::mem::take(&mut self.offsets); let values = self.values.as_box(); - #[cfg(debug_assertions)] - { - crate::array::specification::try_check_offsets(&offsets, values.len()).unwrap(); - } - - // Safety - the invariant of this struct ensures that this is up-held - unsafe { - ListArray::::new_unchecked( - self.arrays[0].data_type().clone(), - offsets.into(), - values, - validity.into(), - ) - } + ListArray::::new( + self.arrays[0].data_type().clone(), + offsets.into(), + values, + validity.into(), + ) } } @@ -129,8 +93,7 @@ impl<'a, O: Offset> Growable<'a> for GrowableList<'a, O> { } fn extend_validity(&mut self, additional: usize) { - self.offsets - .resize(self.offsets.len() + additional, self.last_offset); + self.offsets.extend_constant(additional); self.validity.extend_constant(additional, false); } diff --git a/src/array/growable/utf8.rs b/src/array/growable/utf8.rs index eed8ba30159..0aee209378c 100644 --- a/src/array/growable/utf8.rs +++ b/src/array/growable/utf8.rs @@ -2,12 +2,12 @@ use std::sync::Arc; use crate::{ array::{Array, Utf8Array}, - offset::Offset, bitmap::MutableBitmap, + offset::{Offset, Offsets}, }; use super::{ - utils::{build_extend_null_bits, extend_offset_values, extend_offsets, ExtendNullBits}, + utils::{build_extend_null_bits, extend_offset_values, ExtendNullBits}, Growable, }; @@ -16,8 +16,7 @@ pub struct GrowableUtf8<'a, O: Offset> { arrays: Vec<&'a Utf8Array>, validity: MutableBitmap, values: Vec, - 
offsets: Vec, - length: O, // always equal to the last offset at `offsets`. + offsets: Offsets, extend_null_bits: Vec>, } @@ -37,15 +36,10 @@ impl<'a, O: Offset> GrowableUtf8<'a, O> { .map(|array| build_extend_null_bits(*array, use_validity)) .collect(); - let mut offsets = Vec::with_capacity(capacity + 1); - let length = O::default(); - offsets.push(length); - Self { arrays: arrays.to_vec(), values: Vec::with_capacity(0), - offsets, - length, + offsets: Offsets::with_capacity(capacity), validity: MutableBitmap::with_capacity(capacity), extend_null_bits, } @@ -58,7 +52,8 @@ impl<'a, O: Offset> GrowableUtf8<'a, O> { #[cfg(debug_assertions)] { - crate::array::specification::try_check_offsets_and_utf8(&offsets, &values).unwrap(); + crate::array::specification::try_check_offsets_and_utf8(offsets.as_slice(), &values) + .unwrap(); } unsafe { @@ -81,18 +76,16 @@ impl<'a, O: Offset> Growable<'a> for GrowableUtf8<'a, O> { let offsets = array.offsets(); let values = array.values(); - extend_offsets::( - &mut self.offsets, - &mut self.length, - &offsets[start..start + len + 1], - ); + self.offsets + .try_extend_from_slice(offsets, start, len) + .unwrap(); + // values - extend_offset_values::(&mut self.values, offsets, values, start, len); + extend_offset_values::(&mut self.values, offsets.as_slice(), values, start, len); } fn extend_validity(&mut self, additional: usize) { - self.offsets - .resize(self.offsets.len() + additional, self.length); + self.offsets.extend_constant(additional); self.validity.extend_constant(additional, false); } diff --git a/src/array/growable/utils.rs b/src/array/growable/utils.rs index d06c1116d48..06a85cd9ad4 100644 --- a/src/array/growable/utils.rs +++ b/src/array/growable/utils.rs @@ -1,15 +1,5 @@ use crate::{array::Array, bitmap::MutableBitmap, offset::Offset}; -pub(super) fn extend_offsets(buffer: &mut Vec, last_offset: &mut T, offsets: &[T]) { - buffer.reserve(offsets.len() - 1); - offsets.windows(2).for_each(|offsets| { - // compute the new 
offset - let length = offsets[1] - offsets[0]; - *last_offset += length; - buffer.push(*last_offset); - }); -} - // function used to extend nulls from arrays. This function's lifetime is bound to the array // because it reads nulls from it. pub(super) type ExtendNullBits<'a> = Box; diff --git a/src/array/list/ffi.rs b/src/array/list/ffi.rs index 9d0b19a85e0..2b6be75e782 100644 --- a/src/array/list/ffi.rs +++ b/src/array/list/ffi.rs @@ -1,6 +1,6 @@ use crate::{array::FromFfi, bitmap::align, error::Result, ffi}; -use crate::offset::Offset; +use crate::offset::{Offset, OffsetsBuffer}; use super::super::{ffi::ToFfi, Array}; use super::ListArray; @@ -9,7 +9,7 @@ unsafe impl ToFfi for ListArray { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.as_ptr().cast::()), + Some(self.offsets.buffer().as_ptr().cast::()), ] } @@ -18,7 +18,7 @@ unsafe impl ToFfi for ListArray { } fn offset(&self) -> Option { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); if let Some(bitmap) = self.validity.as_ref() { if bitmap.offset() == offset { Some(offset) @@ -31,7 +31,7 @@ unsafe impl ToFfi for ListArray { } fn to_ffi_aligned(&self) -> Self { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); let validity = self.validity.as_ref().map(|bitmap| { if bitmap.offset() == offset { @@ -58,6 +58,9 @@ impl FromFfi for ListArray { let child = unsafe { array.child(0)? 
}; let values = ffi::try_from(child)?; - Ok(Self::from_data(data_type, offsets, values, validity)) + // assumption that data from FFI is well constructed + let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) }; + + Ok(Self::new(data_type, offsets, values, validity)) } } diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index 7740307799d..9a4c82e2a02 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -1,17 +1,12 @@ use crate::{ bitmap::Bitmap, - buffer::Buffer, datatypes::{DataType, Field}, error::Error, - offset::Offset, + offset::{Offset, OffsetsBuffer}, }; use std::sync::Arc; -use super::{ - new_empty_array, - specification::{try_check_offsets, try_check_offsets_bounds}, - Array, -}; +use super::{new_empty_array, specification::try_check_offsets_bounds, Array}; mod ffi; pub(super) mod fmt; @@ -24,7 +19,7 @@ pub use mutable::*; #[derive(Clone)] pub struct ListArray { data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Box, validity: Option, } @@ -34,24 +29,23 @@ impl ListArray { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. 
/// # Implementation - /// This function is `O(N)` - checking monotinicity is `O(N)` + /// This function is `O(1)` pub fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Box, validity: Option, ) -> Result { - try_check_offsets(&offsets, values.len())?; + try_check_offsets_bounds(offsets.buffer(), values.len())?; if validity .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) + .map_or(false, |validity| validity.len() != offsets.len()) { return Err(Error::oos( "validity mask length must match the number of values", @@ -78,16 +72,15 @@ impl ListArray { /// /// # Panics /// This function panics iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. /// # Implementation - /// This function is `O(N)` - checking monotinicity is `O(N)` + /// This function is `O(1)` pub fn new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Box, validity: Option, ) -> Self { @@ -97,7 +90,7 @@ impl ListArray { /// Alias of `new` pub fn from_data( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Box, validity: Option, ) -> Self { @@ -107,7 +100,7 @@ impl ListArray { /// Returns a new empty [`ListArray`]. pub fn new_empty(data_type: DataType) -> Self { let values = new_empty_array(Self::get_child_type(&data_type).clone()); - Self::new(data_type, Buffer::from(vec![O::zero()]), values, None) + Self::new(data_type, OffsetsBuffer::default(), values, None) } /// Returns a new null [`ListArray`]. 
@@ -116,7 +109,7 @@ impl ListArray { let child = Self::get_child_type(&data_type).clone(); Self::new( data_type, - vec![O::default(); 1 + length].into(), + vec![O::zero(); 1 + length].try_into().unwrap(), new_empty_array(child), Some(Bitmap::new_zeroed(length)), ) @@ -133,77 +126,6 @@ impl ListArray { } } -// unsafe construtors -impl ListArray { - /// Creates a new [`ListArray`]. - /// - /// # Errors - /// This function returns an error iff: - /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. - /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. - /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - /// # Implementation - /// This function is `O(1)` - pub unsafe fn try_new_unchecked( - data_type: DataType, - offsets: Buffer, - values: Box, - validity: Option, - ) -> Result { - try_check_offsets_bounds(&offsets, values.len())?; - - if validity - .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) - { - return Err(Error::oos( - "validity mask length must match the number of values", - )); - } - - let child_data_type = Self::try_get_child(&data_type)?.data_type(); - let values_data_type = values.data_type(); - if child_data_type != values_data_type { - return Err(Error::oos( - format!("ListArray's child's DataType must match. However, the expected DataType is {child_data_type:?} while it got {values_data_type:?}."), - )); - } - - Ok(Self { - data_type, - offsets, - values, - validity, - }) - } - - /// Creates a new [`ListArray`]. - /// - /// # Panics - /// This function panics iff: - /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. 
- /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. - /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - /// # Implementation - /// This function is `O(1)` - pub unsafe fn new_unchecked( - data_type: DataType, - offsets: Buffer, - values: Box, - validity: Option, - ) -> Self { - Self::try_new_unchecked(data_type, offsets, values, validity).unwrap() - } -} - impl ListArray { /// Returns a slice of this [`ListArray`]. /// # Panics @@ -259,14 +181,14 @@ impl ListArray { /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// Returns the element at index `i` #[inline] pub fn value(&self, i: usize) -> Box { - let offset = self.offsets[i]; - let offset_1 = self.offsets[i + 1]; + let offset = self.offsets.buffer()[i]; + let offset_1 = self.offsets.buffer()[i + 1]; let length = (offset_1 - offset).to_usize(); // Safety: @@ -280,8 +202,8 @@ impl ListArray { /// Assumes that the `i < self.len`. #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> Box { - let offset = *self.offsets.get_unchecked(i); - let offset_1 = *self.offsets.get_unchecked(i + 1); + let offset = *self.offsets.buffer().get_unchecked(i); + let offset_1 = *self.offsets.buffer().get_unchecked(i + 1); let length = (offset_1 - offset).to_usize(); self.values.slice_unchecked(offset.to_usize(), length) @@ -295,7 +217,7 @@ impl ListArray { /// The offsets [`Buffer`]. 
#[inline] - pub fn offsets(&self) -> &Buffer { + pub fn offsets(&self) -> &OffsetsBuffer { &self.offsets } diff --git a/src/array/list/mutable.rs b/src/array/list/mutable.rs index 97785a01740..98280dc89f7 100644 --- a/src/array/list/mutable.rs +++ b/src/array/list/mutable.rs @@ -2,14 +2,13 @@ use std::sync::Arc; use crate::{ array::{ - physical_binary::{extend_validity, try_extend_offsets}, - specification::try_check_offsets, - Array, MutableArray, TryExtend, TryExtendFromSelf, TryPush, + physical_binary::extend_validity, Array, MutableArray, TryExtend, TryExtendFromSelf, + TryPush, }, bitmap::MutableBitmap, datatypes::{DataType, Field}, error::{Error, Result}, - offset::Offset, + offset::{Offset, Offsets}, trusted_len::TrustedLen, }; @@ -19,7 +18,7 @@ use super::ListArray; #[derive(Debug, Clone)] pub struct MutableListArray { data_type: DataType, - offsets: Vec, + offsets: Offsets, values: M, validity: Option, } @@ -37,8 +36,7 @@ impl MutableListArray { let values = M::default(); let data_type = ListArray::::default_datatype(values.data_type().clone()); - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); + let offsets = Offsets::::with_capacity(capacity); Self { data_type, offsets, @@ -56,16 +54,12 @@ impl Default for MutableListArray { impl From> for ListArray { fn from(mut other: MutableListArray) -> Self { - // Safety: - // MutableListArray has monotonically increasing offsets - unsafe { - ListArray::new_unchecked( - other.data_type, - other.offsets.into(), - other.values.as_box(), - other.validity.map(|x| x.into()), - ) - } + ListArray::new( + other.data_type, + other.offsets.into(), + other.values.as_box(), + other.validity.map(|x| x.into()), + ) } } @@ -113,16 +107,14 @@ where extend_validity(self.len(), &mut self.validity, &other.validity); self.values.try_extend_from_self(&other.values)?; - - try_extend_offsets(&mut self.offsets, &other.offsets) + self.offsets.try_extend_from_self(&other.offsets) } } impl MutableListArray 
{ /// Creates a new [`MutableListArray`] from a [`MutableArray`] and capacity. pub fn new_from(values: M, data_type: DataType, capacity: usize) -> Self { - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); + let offsets = Offsets::::with_capacity(capacity); assert_eq!(values.len(), 0); ListArray::::get_child_field(&data_type); Self { @@ -154,11 +146,11 @@ impl MutableListArray { /// Needs to be called when a valid value was extended to this array. /// This is a relatively low level function, prefer `try_push` when you can. pub fn try_push_valid(&mut self) -> Result<()> { - let size = self.values.len(); - let size = O::from_usize(size).ok_or(Error::Overflow)?; - assert!(size >= *self.offsets.last().unwrap()); + let total_length = self.values.len(); + let offset = self.offsets.last().to_usize(); + let length = total_length.checked_sub(offset).ok_or(Error::Overflow)?; - self.offsets.push(size); + self.offsets.try_push_usize(length)?; if let Some(validity) = &mut self.validity { validity.push(true) } @@ -167,7 +159,7 @@ impl MutableListArray { #[inline] fn push_null(&mut self) { - self.offsets.push(self.last_offset()); + self.offsets.extend_constant(1); match &mut self.validity { Some(validity) => validity.push(false), None => self.init_validity(), @@ -176,79 +168,30 @@ impl MutableListArray { /// Expand this array, using elements from the underlying backing array. /// Assumes the expansion begins at the highest previous offset, or zero if - /// this [MutableListArray] is currently empty. + /// this [`MutableListArray`] is currently empty. /// /// Panics if: /// - the new offsets are not in monotonic increasing order. /// - any new offset is not in bounds of the backing array. /// - the passed iterator has no upper bound. 
#[allow(dead_code)] - pub(crate) fn extend_offsets(&mut self, expansion: II) - where - II: TrustedLen>, - { - let current_len = self.offsets.len(); - let (_, upper) = expansion.size_hint(); - let upper = upper.expect("iterator must have upper bound"); - if current_len == 0 && upper > 0 { - self.offsets.push(O::zero()); - } - // safety: checked below - unsafe { self.unsafe_extend_offsets(expansion) }; - if self.offsets.len() > current_len { - // check all inserted offsets - try_check_offsets(&self.offsets[current_len..], self.values.len()) - .expect("invalid offsets"); - } - // else expansion is empty, and this is trivially safe. - } - - /// Expand this array, using elements from the underlying backing array. - /// Assumes the expansion begins at the highest previous offset, or zero if - /// this [MutableListArray] is currently empty. - /// - /// # Safety - /// - /// Assumes that `offsets` are in order, and do not overrun the underlying - /// `values` backing array. - /// - /// Also assumes the expansion begins at the highest previous offset, or - /// zero if the array is currently empty. - /// - /// Panics if the passed iterator has no upper bound. 
- #[allow(dead_code)] - pub(crate) unsafe fn unsafe_extend_offsets(&mut self, expansion: II) + pub(crate) fn try_extend_from_lengths(&mut self, iterator: II) -> Result<()> where - II: TrustedLen>, + II: TrustedLen> + Clone, { - let (_, upper) = expansion.size_hint(); - let upper = upper.expect("iterator must have upper bound"); - let final_size = self.len() + upper; - self.offsets.reserve(upper); - - for item in expansion { - match item { - Some(offset) => { - self.offsets.push(offset); - if let Some(validity) = &mut self.validity { - validity.push(true); - } - } - None => self.push_null(), - } - - if let Some(validity) = &mut self.validity { - if validity.capacity() < final_size { - validity.reserve(final_size - validity.capacity()); - } - } + self.offsets + .try_extend_from_lengths(iterator.clone().map(|x| x.unwrap_or_default()))?; + if let Some(validity) = &mut self.validity { + validity.extend_from_trusted_len_iter(iterator.map(|x| x.is_some())) } + assert_eq!(self.offsets.last().to_usize(), self.values.len()); + Ok(()) } /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// The values @@ -257,7 +200,7 @@ impl MutableListArray { } /// The offsets - pub fn offsets(&self) -> &Vec { + pub fn offsets(&self) -> &Offsets { &self.offsets } @@ -266,13 +209,8 @@ impl MutableListArray { &self.values } - #[inline] - fn last_offset(&self) -> O { - *self.offsets.last().unwrap() - } - fn init_validity(&mut self) { - let len = self.offsets.len() - 1; + let len = self.offsets.len(); let mut validity = MutableBitmap::with_capacity(self.offsets.capacity()); validity.extend_constant(len, true); @@ -320,29 +258,23 @@ impl MutableArray for MutableListArray Box { - // Safety: - // MutableListArray has monotonically increasing offsets - Box::new(unsafe { - ListArray::new_unchecked( - self.data_type.clone(), - std::mem::take(&mut self.offsets).into(), - self.values.as_box(), - std::mem::take(&mut 
self.validity).map(|x| x.into()), - ) - }) + ListArray::new( + self.data_type.clone(), + std::mem::take(&mut self.offsets).into(), + self.values.as_box(), + std::mem::take(&mut self.validity).map(|x| x.into()), + ) + .boxed() } fn as_arc(&mut self) -> Arc { - // Safety: - // MutableListArray has monotonically increasing offsets - Arc::new(unsafe { - ListArray::new_unchecked( - self.data_type.clone(), - std::mem::take(&mut self.offsets).into(), - self.values.as_box(), - std::mem::take(&mut self.validity).map(|x| x.into()), - ) - }) + ListArray::new( + self.data_type.clone(), + std::mem::take(&mut self.offsets).into(), + self.values.as_box(), + std::mem::take(&mut self.validity).map(|x| x.into()), + ) + .arced() } fn data_type(&self) -> &DataType { diff --git a/src/array/map/ffi.rs b/src/array/map/ffi.rs index bbf3846999a..09920419c21 100644 --- a/src/array/map/ffi.rs +++ b/src/array/map/ffi.rs @@ -1,4 +1,4 @@ -use crate::{array::FromFfi, bitmap::align, error::Result, ffi}; +use crate::{array::FromFfi, bitmap::align, error::Result, ffi, offset::OffsetsBuffer}; use super::super::{ffi::ToFfi, Array}; use super::MapArray; @@ -7,7 +7,7 @@ unsafe impl ToFfi for MapArray { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.as_ptr().cast::()), + Some(self.offsets.buffer().as_ptr().cast::()), ] } @@ -16,7 +16,7 @@ unsafe impl ToFfi for MapArray { } fn offset(&self) -> Option { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); if let Some(bitmap) = self.validity.as_ref() { if bitmap.offset() == offset { Some(offset) @@ -29,7 +29,7 @@ unsafe impl ToFfi for MapArray { } fn to_ffi_aligned(&self) -> Self { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); let validity = self.validity.as_ref().map(|bitmap| { if bitmap.offset() == offset { @@ -56,6 +56,9 @@ impl FromFfi for MapArray { let child = array.child(0)?; let values = ffi::try_from(child)?; + // assumption 
that data from FFI is well constructed + let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) }; + Self::try_new(data_type, offsets, values, validity) } } diff --git a/src/array/map/mod.rs b/src/array/map/mod.rs index 51ca56a8459..d087a99ac3f 100644 --- a/src/array/map/mod.rs +++ b/src/array/map/mod.rs @@ -1,11 +1,11 @@ use crate::{ bitmap::Bitmap, - buffer::Buffer, datatypes::{DataType, Field}, error::Error, + offset::OffsetsBuffer, }; -use super::{new_empty_array, specification::try_check_offsets, Array}; +use super::{new_empty_array, specification::try_check_offsets_bounds, Array}; mod ffi; mod fmt; @@ -16,8 +16,8 @@ pub use iterator::*; #[derive(Clone)] pub struct MapArray { data_type: DataType, - // invariant: field.len() == offsets.len() - 1 - offsets: Buffer, + // invariant: field.len() == offsets.len() + offsets: OffsetsBuffer, field: Box, // invariant: offsets.len() - 1 == Bitmap::len() validity: Option, @@ -27,18 +27,17 @@ impl MapArray { /// Returns a new [`MapArray`]. /// # Errors /// This function errors iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the field' length /// * The `data_type`'s physical type is not [`crate::datatypes::PhysicalType::Map`] /// * The fields' `data_type` is not equal to the inner field of `data_type` /// * The validity is not `None` and its length is different from `offsets.len() - 1`. 
pub fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, field: Box, validity: Option, ) -> Result { - try_check_offsets(&offsets, field.len())?; + try_check_offsets_bounds(offsets.buffer(), field.len())?; let inner_field = Self::try_get_field(&data_type)?; if let DataType::Struct(inner) = inner_field.data_type() { @@ -60,7 +59,7 @@ impl MapArray { if validity .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) + .map_or(false, |validity| validity.len() != offsets.len()) { return Err(Error::oos( "validity mask length must match the number of values", @@ -77,13 +76,12 @@ impl MapArray { /// Creates a new [`MapArray`]. /// # Panics - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the field' length. /// * The `data_type`'s physical type is not [`crate::datatypes::PhysicalType::Map`], /// * The validity is not `None` and its length is different from `offsets.len() - 1`. pub fn new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, field: Box, validity: Option, ) -> Self { @@ -93,7 +91,7 @@ impl MapArray { /// Alias for `new` pub fn from_data( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, field: Box, validity: Option, ) -> Self { @@ -105,7 +103,7 @@ impl MapArray { let field = new_empty_array(Self::get_field(&data_type).data_type().clone()); Self::new( data_type, - vec![0i32; 1 + length].into(), + vec![0i32; 1 + length].try_into().unwrap(), field, Some(Bitmap::new_zeroed(length)), ) @@ -114,7 +112,7 @@ impl MapArray { /// Returns a new empty [`MapArray`]. pub fn new_empty(data_type: DataType) -> Self { let field = new_empty_array(Self::get_field(&data_type).data_type().clone()); - Self::new(data_type, Buffer::from(vec![0i32]), field, None) + Self::new(data_type, OffsetsBuffer::default(), field, None) } /// Returns this [`MapArray`] with a new validity. 
@@ -197,12 +195,12 @@ impl MapArray { /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// returns the offsets #[inline] - pub fn offsets(&self) -> &Buffer { + pub fn offsets(&self) -> &OffsetsBuffer { &self.offsets } @@ -215,8 +213,8 @@ impl MapArray { /// Returns the element at index `i`. #[inline] pub fn value(&self, i: usize) -> Box { - let offset = self.offsets[i]; - let offset_1 = self.offsets[i + 1]; + let offset = self.offsets.buffer()[i]; + let offset_1 = self.offsets.buffer()[i + 1]; let length = (offset_1 - offset) as usize; // Safety: @@ -230,8 +228,8 @@ impl MapArray { /// Assumes that the `i < self.len`. #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> Box { - let offset = *self.offsets.get_unchecked(i); - let offset_1 = *self.offsets.get_unchecked(i + 1); + let offset = *self.offsets.buffer().get_unchecked(i); + let offset_1 = *self.offsets.buffer().get_unchecked(i + 1); let length = (offset_1 - offset) as usize; self.field.slice_unchecked(offset as usize, length) diff --git a/src/array/physical_binary.rs b/src/array/physical_binary.rs index 825ba01e5d0..adbf62d6c27 100644 --- a/src/array/physical_binary.rs +++ b/src/array/physical_binary.rs @@ -1,6 +1,5 @@ use crate::bitmap::MutableBitmap; -use crate::error::Error; -use crate::offset::Offset; +use crate::offset::{Offset, Offsets}; /// # Safety /// The caller must ensure that `iterator` is `TrustedLen`. @@ -8,7 +7,7 @@ use crate::offset::Offset; #[allow(clippy::type_complexity)] pub(crate) unsafe fn try_trusted_len_unzip( iterator: I, -) -> std::result::Result<(Option, Vec, Vec), E> +) -> std::result::Result<(Option, Offsets, Vec), E> where O: Offset, P: AsRef<[u8]>, @@ -45,7 +44,7 @@ where ); offsets.set_len(len + 1); - Ok((null.into(), offsets, values)) + Ok((null.into(), Offsets::new_unchecked(offsets), values)) } /// Creates [`MutableBitmap`] and two [`Vec`]s from an iterator of `Option`. 
@@ -56,7 +55,7 @@ where #[inline] pub(crate) unsafe fn trusted_len_unzip( iterator: I, -) -> (Option, Vec, Vec) +) -> (Option, Offsets, Vec) where O: Offset, P: AsRef<[u8]>, @@ -65,12 +64,10 @@ where let (_, upper) = iterator.size_hint(); let len = upper.expect("trusted_len_unzip requires an upper limit"); - let mut offsets = Vec::::with_capacity(len + 1); + let mut offsets = Offsets::::with_capacity(len); let mut values = Vec::::new(); let mut validity = MutableBitmap::new(); - offsets.push(O::default()); - extend_from_trusted_len_iter(&mut offsets, &mut values, &mut validity, iterator); let validity = if validity.unset_bits() > 0 { @@ -87,7 +84,7 @@ where /// # Safety /// The caller must ensure that `iterator` is [`TrustedLen`]. #[inline] -pub(crate) unsafe fn trusted_len_values_iter(iterator: I) -> (Vec, Vec) +pub(crate) unsafe fn trusted_len_values_iter(iterator: I) -> (Offsets, Vec) where O: Offset, P: AsRef<[u8]>, @@ -96,11 +93,9 @@ where let (_, upper) = iterator.size_hint(); let len = upper.expect("trusted_len_unzip requires an upper limit"); - let mut offsets = Vec::::with_capacity(len + 1); + let mut offsets = Offsets::::with_capacity(len); let mut values = Vec::::new(); - offsets.push(O::default()); - extend_from_trusted_len_values_iter(&mut offsets, &mut values, iterator); (offsets, values) @@ -112,7 +107,7 @@ where // The caller must ensure the `iterator` is [`TrustedLen`] #[inline] pub(crate) unsafe fn extend_from_trusted_len_values_iter( - offsets: &mut Vec, + offsets: &mut Offsets, values: &mut Vec, iterator: I, ) where @@ -120,42 +115,13 @@ pub(crate) unsafe fn extend_from_trusted_len_values_iter( P: AsRef<[u8]>, I: Iterator, { - let (_, upper) = iterator.size_hint(); - let additional = upper.expect("extend_from_trusted_len_values_iter requires an upper limit"); - - offsets.reserve(additional); - - // Read in the last offset, will be used to increment and store - // new values later on - let mut length = *offsets.last().unwrap(); - - // Get a 
mutable pointer to the `offsets`, and move the pointer - // to the position, where a new value will be written - let mut dst = offsets.as_mut_ptr(); - dst = dst.add(offsets.len()); - - for item in iterator { + let lengths = iterator.map(|item| { let s = item.as_ref(); - - // Calculate the new offset value - length += O::from_usize(s.len()).unwrap(); - // Push new entries for both `values` and `offsets` buffer values.extend_from_slice(s); - std::ptr::write(dst, length); - - // Move to the next position in offset buffer - dst = dst.add(1); - } - - debug_assert_eq!( - dst.offset_from(offsets.as_ptr()) as usize, - offsets.len() + additional, - "TrustedLen iterator's length was not accurately reported" - ); - - // We make sure to set the new length for the `offsets` buffer - offsets.set_len(offsets.len() + additional); + s.len() + }); + offsets.try_extend_from_lengths(lengths).unwrap(); } // Populates `offsets` and `values` [`Vec`]s with information extracted @@ -163,7 +129,7 @@ pub(crate) unsafe fn extend_from_trusted_len_values_iter( // the return value indicates how many items were added. 
#[inline] pub(crate) fn extend_from_values_iter( - offsets: &mut Vec, + offsets: &mut Offsets, values: &mut Vec, iterator: I, ) -> usize @@ -176,18 +142,12 @@ where offsets.reserve(size_hint); - // Read in the last offset, will be used to increment and store - // new values later on - let mut length = *offsets.last().unwrap(); let start_index = offsets.len(); for item in iterator { - let s = item.as_ref(); - // Calculate the new offset value - length += O::from_usize(s.len()).unwrap(); - - values.extend_from_slice(s); - offsets.push(length); + let bytes = item.as_ref(); + values.extend_from_slice(bytes); + offsets.try_push_usize(bytes.len()).unwrap(); } offsets.len() - start_index } @@ -199,7 +159,7 @@ where // The caller must ensure that `iterator` is [`TrustedLen`] #[inline] pub(crate) unsafe fn extend_from_trusted_len_iter( - offsets: &mut Vec, + offsets: &mut Offsets, values: &mut Vec, validity: &mut MutableBitmap, iterator: I, @@ -214,51 +174,24 @@ pub(crate) unsafe fn extend_from_trusted_len_iter( offsets.reserve(additional); validity.reserve(additional); - // Read in the last offset, will be used to increment and store - // new values later on - let mut length = *offsets.last().unwrap(); - - // Get a mutable pointer to the `offsets`, and move the pointer - // to the position, where a new value will be written - let mut dst = offsets.as_mut_ptr(); - dst = dst.add(offsets.len()); - - for item in iterator { + let lengths = iterator.map(|item| { if let Some(item) = item { let bytes = item.as_ref(); - - // Calculate new offset value - length += O::from_usize(bytes.len()).unwrap(); - - // Push new values for `values` and `validity` buffer values.extend_from_slice(bytes); validity.push_unchecked(true); + bytes.len() } else { - // If `None`, update only `validity` validity.push_unchecked(false); + 0 } - - // Push new offset or old offset depending on the `item` - std::ptr::write(dst, length); - - // Move to the next position in offset buffer - dst = dst.add(1); - } 
- - debug_assert_eq!( - dst.offset_from(offsets.as_ptr()) as usize, - offsets.len() + additional, - "TrustedLen iterator's length was not accurately reported" - ); - - // We make sure to set the new length for the `offsets` buffer - offsets.set_len(offsets.len() + additional); + }); + offsets.try_extend_from_lengths(lengths).unwrap(); } /// Creates two [`Vec`]s from an iterator of `&[u8]`. /// The first buffer corresponds to a offset buffer, the second to a values buffer. #[inline] -pub(crate) fn values_iter(iterator: I) -> (Vec, Vec) +pub(crate) fn values_iter(iterator: I) -> (Offsets, Vec) where O: Offset, P: AsRef<[u8]>, @@ -266,40 +199,17 @@ where { let (lower, _) = iterator.size_hint(); - let mut offsets = Vec::::with_capacity(lower + 1); + let mut offsets = Offsets::::with_capacity(lower); let mut values = Vec::::new(); - let mut length = O::default(); - offsets.push(length); - for item in iterator { let s = item.as_ref(); - length += O::from_usize(s.len()).unwrap(); values.extend_from_slice(s); - - offsets.push(length) + offsets.try_push_usize(s.len()).unwrap(); } (offsets, values) } -/// Extends `offsets` with all offsets from `other` -#[inline] -pub(crate) fn try_extend_offsets(offsets: &mut Vec, other: &[O]) -> Result<(), Error> -where - O: Offset, -{ - let lengths = other.windows(2).map(|w| w[1] - w[0]); - let mut last = *offsets.last().unwrap(); - - offsets.reserve(other.len() - 1); - for length in lengths { - let r = last.checked_add(&length).ok_or(Error::Overflow)?; - last += length; - offsets.push(r) - } - Ok(()) -} - /// Extends `validity` with all items from `other` pub(crate) fn extend_validity( length: usize, diff --git a/src/array/specification.rs b/src/array/specification.rs index 7b1e0d86640..521459a4b76 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -13,23 +13,6 @@ pub fn try_check_offsets_bounds(offsets: &[O], values_len: usize) -> } } -pub fn check_offsets_minimal(offsets: &[O], values_len: usize) -> usize { - 
assert!( - !offsets.is_empty(), - "The length of the offset buffer must be larger than 1" - ); - let len = offsets.len() - 1; - - let last_offset = offsets[len]; - let last_offset = last_offset.to_usize(); - - assert_eq!( - values_len, last_offset, - "The length of the values must be equal to the last offset value" - ); - len -} - /// # Panics iff: /// * the `offsets` is not monotonically increasing, or /// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or diff --git a/src/array/utf8/ffi.rs b/src/array/utf8/ffi.rs index 2152dfade45..3611678da57 100644 --- a/src/array/utf8/ffi.rs +++ b/src/array/utf8/ffi.rs @@ -3,7 +3,7 @@ use crate::{ bitmap::align, error::Result, ffi, - offset::Offset, + offset::{Offset, OffsetsBuffer}, }; use super::Utf8Array; @@ -12,13 +12,13 @@ unsafe impl ToFfi for Utf8Array { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.as_ptr().cast::()), + Some(self.offsets.buffer().as_ptr().cast::()), Some(self.values.as_ptr().cast::()), ] } fn offset(&self) -> Option { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); if let Some(bitmap) = self.validity.as_ref() { if bitmap.offset() == offset { Some(offset) @@ -31,7 +31,7 @@ unsafe impl ToFfi for Utf8Array { } fn to_ffi_aligned(&self) -> Self { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); let validity = self.validity.as_ref().map(|bitmap| { if bitmap.offset() == offset { @@ -57,8 +57,9 @@ impl FromFfi for Utf8Array { let offsets = unsafe { array.buffer::(1) }?; let values = unsafe { array.buffer::(2)? 
}; - Ok(Self::from_data_unchecked( - data_type, offsets, values, validity, - )) + // assumption that data from FFI is well constructed + let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) }; + + Ok(Self::new_unchecked(data_type, offsets, values, validity)) } } diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index f011183ce6e..330e56cd9df 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -6,7 +6,7 @@ use crate::{ buffer::Buffer, datatypes::DataType, error::{Error, Result}, - offset::Offset, + offset::{Offset, OffsetsBuffer}, trusted_len::TrustedLen, }; @@ -69,7 +69,7 @@ impl> AsRef<[u8]> for StrAsBytes { #[derive(Clone)] pub struct Utf8Array { data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, } @@ -80,23 +80,22 @@ impl Utf8Array { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. 
/// * The `values` between two consecutive `offsets` are not valid utf8 /// # Implementation - /// This function is `O(N)` - checking monotinicity and utf8 is `O(N)` + /// This function is `O(N)` - checking utf8 is `O(N)` pub fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result { - try_check_offsets_and_utf8(&offsets, &values)?; + try_check_offsets_and_utf8(offsets.buffer(), &values)?; if validity .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) + .map_or(false, |validity| validity.len() != offsets.len()) { return Err(Error::oos( "validity mask length must match the number of values", @@ -145,7 +144,7 @@ impl Utf8Array { /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// Returns the value of the element at index `i`, ignoring the array's validity. @@ -163,8 +162,8 @@ impl Utf8Array { #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &str { // soundness: the invariant of the function - let start = self.offsets.get_unchecked(i).to_usize(); - let end = self.offsets.get_unchecked(i + 1).to_usize(); + let start = self.offsets.buffer().get_unchecked(i).to_usize(); + let end = self.offsets.buffer().get_unchecked(i + 1).to_usize(); // soundness: the invariant of the struct let slice = self.values.get_unchecked(start..end); @@ -187,7 +186,7 @@ impl Utf8Array { /// Returns the offsets of this [`Utf8Array`]. 
#[inline] - pub fn offsets(&self) -> &Buffer { + pub fn offsets(&self) -> &OffsetsBuffer { &self.offsets } @@ -278,7 +277,7 @@ impl Utf8Array { }), Right(mutable_bitmap) => match ( self.values.get_mut().map(std::mem::take), - self.offsets.get_mut().map(std::mem::take), + self.offsets.get_mut(), ) { (None, None) => { // Safety: invariants are preserved @@ -326,7 +325,7 @@ impl Utf8Array { } else { match ( self.values.get_mut().map(std::mem::take), - self.offsets.get_mut().map(std::mem::take), + self.offsets.get_mut(), ) { (None, None) => Left(unsafe { Utf8Array::new_unchecked(self.data_type, self.offsets, self.values, None) @@ -349,14 +348,7 @@ impl Utf8Array { /// The array is guaranteed to have no elements nor validity. #[inline] pub fn new_empty(data_type: DataType) -> Self { - unsafe { - Self::from_data_unchecked( - data_type, - Buffer::from(vec![O::zero()]), - Buffer::new(), - None, - ) - } + unsafe { Self::from_data_unchecked(data_type, OffsetsBuffer::new(), Buffer::new(), None) } } /// Returns a new [`Utf8Array`] whose all slots are null / `None`. @@ -364,7 +356,7 @@ impl Utf8Array { pub fn new_null(data_type: DataType, length: usize) -> Self { Self::new( data_type, - vec![O::default(); 1 + length].into(), + vec![O::default(); 1 + length].try_into().unwrap(), Buffer::new(), Some(Bitmap::new_zeroed(length)), ) @@ -384,25 +376,24 @@ impl Utf8Array { /// # Errors /// This function returns an error iff: /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. 
/// # Safety /// This function is unsound iff: - /// * the offsets are not monotonically increasing /// * The `values` between two consecutive `offsets` are not valid utf8 /// # Implementation /// This function is `O(1)` pub unsafe fn try_new_unchecked( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result { - try_check_offsets_bounds(&offsets, values.len())?; + try_check_offsets_bounds(offsets.buffer(), values.len())?; if validity .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) + .map_or(false, |validity| validity.len() != offsets.len()) { return Err(Error::oos( "validity mask length must match the number of values", @@ -426,16 +417,15 @@ impl Utf8Array { /// Creates a new [`Utf8Array`]. /// # Panics /// This function panics iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. /// * The `values` between two consecutive `offsets` are not valid utf8 /// # Implementation - /// This function is `O(N)` - checking monotinicity and utf8 is `O(N)` + /// This function is `O(N)` - checking utf8 is `O(N)` pub fn new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { @@ -447,7 +437,7 @@ impl Utf8Array { /// # Errors /// This function returns an error iff: /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. 
/// # Safety /// This function is unsound iff: @@ -457,7 +447,7 @@ impl Utf8Array { /// This function is `O(1)` pub unsafe fn new_unchecked( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { @@ -530,7 +520,7 @@ impl Utf8Array { /// Alias for `new` pub fn from_data( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { @@ -540,11 +530,10 @@ impl Utf8Array { /// Alias for [`Self::new_unchecked`] /// # Safety /// This function is unsafe iff: - /// * the offsets are not monotonically increasing /// * The `values` between two consecutive `offsets` are not valid utf8 pub unsafe fn from_data_unchecked( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { @@ -600,7 +589,7 @@ unsafe impl GenericBinaryArray for Utf8Array { #[inline] fn offsets(&self) -> &[O] { - self.offsets() + self.offsets().buffer() } } @@ -611,11 +600,6 @@ impl Default for Utf8Array { } else { DataType::Utf8 }; - Utf8Array::new( - data_type, - vec![O::from_usize(0).unwrap()].into(), - Default::default(), - None, - ) + Utf8Array::new(data_type, Default::default(), Default::default(), None) } } diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index 4dc9b1304b8..cb66f056dd1 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -9,7 +9,7 @@ use crate::{ }, datatypes::DataType, error::{Error, Result}, - offset::Offset, + offset::{Offset, Offsets}, trusted_len::TrustedLen, }; @@ -53,16 +53,15 @@ impl MutableUtf8Array { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. 
/// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. /// * The `values` between two consecutive `offsets` are not valid utf8 /// # Implementation - /// This function is `O(N)` - checking monotinicity and utf8 is `O(N)` + /// This function is `O(N)` - checking utf8 is `O(N)` pub fn try_new( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Result { @@ -89,7 +88,7 @@ impl MutableUtf8Array { /// * The validity is not `None` and its length is different from `offsets`'s length minus one. pub unsafe fn new_unchecked( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Self { @@ -105,7 +104,7 @@ impl MutableUtf8Array { /// The caller must ensure that every value between offsets is a valid utf8. pub unsafe fn from_data_unchecked( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Self { @@ -120,7 +119,7 @@ impl MutableUtf8Array { /// * The validity is not `None` and its length is different from `offsets`'s length minus one. pub fn from_data( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Self { @@ -231,7 +230,7 @@ impl MutableUtf8Array { } /// Extract the low-end APIs from the [`MutableUtf8Array`]. - pub fn into_data(self) -> (DataType, Vec, Vec, Option) { + pub fn into_data(self) -> (DataType, Offsets, Vec, Option) { let (data_type, offsets, values) = self.values.into_inner(); (data_type, offsets, values, self.validity) } @@ -249,7 +248,7 @@ impl MutableUtf8Array { } /// returns its offsets. 
- pub fn offsets(&self) -> &Vec { + pub fn offsets(&self) -> &Offsets { self.values.offsets() } } diff --git a/src/array/utf8/mutable_values.rs b/src/array/utf8/mutable_values.rs index 0da7ff8ff7c..354633cfff1 100644 --- a/src/array/utf8/mutable_values.rs +++ b/src/array/utf8/mutable_values.rs @@ -2,13 +2,13 @@ use std::{iter::FromIterator, sync::Arc}; use crate::{ array::{ - specification::{check_offsets_minimal, try_check_offsets_and_utf8}, + specification::{try_check_offsets_and_utf8, try_check_offsets_bounds}, Array, ArrayValuesIter, MutableArray, TryExtend, TryExtendFromSelf, TryPush, }, bitmap::MutableBitmap, datatypes::DataType, error::{Error, Result}, - offset::Offset, + offset::{Offset, Offsets}, trusted_len::TrustedLen, }; @@ -20,7 +20,7 @@ use crate::array::physical_binary::*; #[derive(Debug, Clone)] pub struct MutableUtf8ValuesArray { data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, } @@ -66,7 +66,7 @@ impl MutableUtf8ValuesArray { pub fn new() -> Self { Self { data_type: Self::default_data_type(), - offsets: vec![O::default()], + offsets: Offsets::new(), values: Vec::::new(), } } @@ -75,14 +75,13 @@ impl MutableUtf8ValuesArray { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. 
/// * The `values` between two consecutive `offsets` are not valid utf8 /// # Implementation - /// This function is `O(N)` - checking monotinicity and utf8 is `O(N)` - pub fn try_new(data_type: DataType, offsets: Vec, values: Vec) -> Result { - try_check_offsets_and_utf8(&offsets, &values)?; + /// This function is `O(N)` - checking utf8 is `O(N)` + pub fn try_new(data_type: DataType, offsets: Offsets, values: Vec) -> Result { + try_check_offsets_and_utf8(offsets.as_slice(), &values)?; if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { return Err(Error::oos( "MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8", @@ -108,8 +107,9 @@ impl MutableUtf8ValuesArray { /// * The `values` between two consecutive `offsets` are not valid utf8 /// # Implementation /// This function is `O(1)` - pub unsafe fn new_unchecked(data_type: DataType, offsets: Vec, values: Vec) -> Self { - check_offsets_minimal(&offsets, values.len()); + pub unsafe fn new_unchecked(data_type: DataType, offsets: Offsets, values: Vec) -> Self { + try_check_offsets_bounds(offsets.as_slice(), values.len()) + .expect("The length of the values must be equal to the last offset value"); if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { panic!("MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8") @@ -135,12 +135,9 @@ impl MutableUtf8ValuesArray { /// Initializes a new [`MutableUtf8ValuesArray`] with a pre-allocated capacity of items and values. pub fn with_capacities(capacity: usize, values: usize) -> Self { - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); - Self { data_type: Self::default_data_type(), - offsets, + offsets: Offsets::::with_capacity(capacity), values: Vec::::with_capacity(values), } } @@ -153,7 +150,7 @@ impl MutableUtf8ValuesArray { /// returns its offsets. 
#[inline] - pub fn offsets(&self) -> &Vec { + pub fn offsets(&self) -> &Offsets { &self.offsets } @@ -172,7 +169,7 @@ impl MutableUtf8ValuesArray { /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// Pushes a new item to the array. @@ -190,7 +187,7 @@ impl MutableUtf8ValuesArray { return None; } self.offsets.pop()?; - let start = self.offsets.last()?.to_usize(); + let start = self.offsets.last().to_usize(); let value = self.values.split_off(start); // Safety: utf8 is validated on initialization Some(unsafe { String::from_utf8_unchecked(value) }) @@ -211,8 +208,8 @@ impl MutableUtf8ValuesArray { #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &str { // soundness: the invariant of the function - let start = self.offsets.get_unchecked(i).to_usize(); - let end = self.offsets.get_unchecked(i + 1).to_usize(); + let start = self.offsets.as_slice().get_unchecked(i).to_usize(); + let end = self.offsets.as_slice().get_unchecked(i + 1).to_usize(); // soundness: the invariant of the struct let slice = self.values.get_unchecked(start..end); @@ -233,7 +230,7 @@ impl MutableUtf8ValuesArray { } /// Extract the low-end APIs from the [`MutableUtf8ValuesArray`]. 
- pub fn into_inner(self) -> (DataType, Vec, Vec) { + pub fn into_inner(self) -> (DataType, Offsets, Vec) { (self.data_type, self.offsets, self.values) } } @@ -401,17 +398,13 @@ impl> TryPush for MutableUtf8ValuesArray { fn try_push(&mut self, value: T) -> Result<()> { let bytes = value.as_ref().as_bytes(); self.values.extend_from_slice(bytes); - - let size = O::from_usize(self.values.len()).ok_or(Error::Overflow)?; - - self.offsets.push(size); - Ok(()) + self.offsets.try_push_usize(bytes.len()) } } impl TryExtendFromSelf for MutableUtf8ValuesArray { fn try_extend_from_self(&mut self, other: &Self) -> Result<()> { self.values.extend_from_slice(&other.values); - try_extend_offsets(&mut self.offsets, &other.offsets) + self.offsets.try_extend_from_self(&other.offsets) } } diff --git a/src/compute/aggregate/memory.rs b/src/compute/aggregate/memory.rs index 5b2e0e07e2d..90a2bc3a762 100644 --- a/src/compute/aggregate/memory.rs +++ b/src/compute/aggregate/memory.rs @@ -9,7 +9,7 @@ fn validity_size(validity: Option<&Bitmap>) -> usize { macro_rules! 
dyn_binary { ($array:expr, $ty:ty, $o:ty) => {{ let array = $array.as_any().downcast_ref::<$ty>().unwrap(); - let offsets = array.offsets(); + let offsets = array.offsets().buffer(); // in case of Binary/Utf8/List the offsets are sliced, // not the values buffer diff --git a/src/compute/cast/binary_to.rs b/src/compute/cast/binary_to.rs index d4b63c1e73d..98cf4105b4b 100644 --- a/src/compute/cast/binary_to.rs +++ b/src/compute/cast/binary_to.rs @@ -1,6 +1,4 @@ -use std::convert::TryFrom; - -use crate::error::{Error, Result}; +use crate::error::Result; use crate::offset::Offset; use crate::{array::*, datatypes::DataType, types::NativeType}; @@ -9,11 +7,9 @@ use super::CastOptions; /// Conversion of binary pub fn binary_to_large_binary(from: &BinaryArray, to_data_type: DataType) -> BinaryArray { let values = from.values().clone(); - let offsets = from.offsets().iter().map(|x| *x as i64).collect::>(); - // todo: use `new_unchecked` since all invariants are preserved BinaryArray::::new( to_data_type, - offsets.into(), + from.offsets().into(), values, from.validity().cloned(), ) @@ -25,13 +21,10 @@ pub fn binary_large_to_binary( to_data_type: DataType, ) -> Result> { let values = from.values().clone(); - let _ = i32::try_from(*from.offsets().last().unwrap()).map_err(Error::from_external_error)?; - - let offsets = from.offsets().iter().map(|x| *x as i32).collect::>(); - // todo: use `new_unchecked` since all invariants are preserved + let offsets = from.offsets().try_into()?; Ok(BinaryArray::::new( to_data_type, - offsets.into(), + offsets, values, from.validity().cloned(), )) @@ -58,12 +51,7 @@ pub fn binary_to_large_utf8( to_data_type: DataType, ) -> Result> { let values = from.values().clone(); - let offsets = from - .offsets() - .iter() - .map(|x| *x as i64) - .collect::>() - .into(); + let offsets = from.offsets().into(); Utf8Array::::try_new(to_data_type, offsets, values, from.validity().cloned()) } diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs 
index eda7bda78a0..d4d47380752 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -18,7 +18,7 @@ use crate::{ array::*, datatypes::*, error::{Error, Result}, - offset::Offset, + offset::{Offset, Offsets}, }; /// options defining how Cast kernels behave @@ -324,28 +324,18 @@ fn cast_list( } fn cast_list_to_large_list(array: &ListArray, to_type: &DataType) -> ListArray { - let offets = array - .offsets() - .iter() - .map(|x| *x as i64) - .collect::>() - .into(); + let offsets = array.offsets().into(); ListArray::::new( to_type.clone(), - offets, + offsets, array.values().clone(), array.validity().cloned(), ) } fn cast_large_to_list(array: &ListArray, to_type: &DataType) -> ListArray { - let offsets = array - .offsets() - .iter() - .map(|x| *x as i32) - .collect::>() - .into(); + let offsets = array.offsets().try_into().expect("Conver me to error"); ListArray::::new( to_type.clone(), @@ -366,14 +356,15 @@ fn cast_fixed_size_list_to_list( options, )?; - let offsets = (0..(fixed.len() + 1)) + let offsets = (0..=fixed.len()) .map(|ix| (ix * fixed.size()) as i32) - .collect::>() - .into(); + .collect::>(); + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; Ok(ListArray::::new( to_type.clone(), - offsets, + offsets.into(), new_values, fixed.validity().cloned(), )) @@ -385,7 +376,7 @@ fn cast_list_to_fixed_size_list( size: usize, options: CastOptions, ) -> Result { - let offsets = list.offsets().iter(); + let offsets = list.offsets().buffer().iter(); let expected = (0..list.len()).map(|ix| (ix * size) as i32); match offsets @@ -478,6 +469,8 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu let values = cast(array, &to.data_type, options)?; // create offsets, where if array.len() = 2, we have [0,1,2] let offsets = (0..=array.len() as i32).collect::>(); + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; 
let list_array = ListArray::::new(to_type.clone(), offsets.into(), values, None); diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 6c5c7bb8753..4feb5aaba6c 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -4,7 +4,7 @@ use num_traits::{AsPrimitive, Float, ToPrimitive}; use crate::datatypes::IntervalUnit; use crate::error::Result; -use crate::offset::Offset; +use crate::offset::{Offset, Offsets}; use crate::types::{days_ms, f16, months_days_ns}; use crate::{ array::*, @@ -42,7 +42,9 @@ pub fn primitive_to_binary( } values.set_len(offset); values.shrink_to_fit(); - BinaryArray::::from_data_unchecked( + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; + BinaryArray::::new( BinaryArray::::default_data_type(), offsets.into(), values.into(), @@ -104,11 +106,13 @@ pub fn primitive_to_utf8( let len = lexical_core::write_unchecked(*x, bytes).len(); offset += len; - offsets.push(O::from_usize(offset as usize).unwrap()); + offsets.push(O::from_usize(offset).unwrap()); } values.set_len(offset); values.shrink_to_fit(); - Utf8Array::::from_data_unchecked( + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; + Utf8Array::::new_unchecked( Utf8Array::::default_data_type(), offsets.into(), values.into(), diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index 2625497ed67..165c24a1025 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -1,11 +1,9 @@ -use std::convert::TryFrom; - use chrono::Datelike; use crate::{ array::*, datatypes::DataType, - error::{Error, Result}, + error::Result, offset::Offset, temporal_conversions::{ utf8_to_naive_timestamp_ns as utf8_to_naive_timestamp_ns_, @@ -150,13 +148,9 @@ pub fn utf8_to_large_utf8(from: &Utf8Array) -> Utf8Array { let data_type = Utf8Array::::default_data_type(); let validity = 
from.validity().cloned(); let values = from.values().clone(); - let offsets = from - .offsets() - .iter() - .map(|x| *x as i64) - .collect::>() - .into(); - // Safety: sound because `offsets` fulfills the same invariants as `from.offsets()` + + let offsets = from.offsets().into(); + // Safety: sound because `values` fulfills the same invariants as `from.values()` unsafe { Utf8Array::::from_data_unchecked(data_type, offsets, values, validity) } } @@ -165,22 +159,17 @@ pub fn utf8_large_to_utf8(from: &Utf8Array) -> Result> { let data_type = Utf8Array::::default_data_type(); let validity = from.validity().cloned(); let values = from.values().clone(); - let _ = i32::try_from(*from.offsets().last().unwrap()).map_err(Error::from_external_error)?; + let offsets = from.offsets().try_into()?; - let offsets = from - .offsets() - .iter() - .map(|x| *x as i32) - .collect::>() - .into(); - // Safety: sound because `offsets` fulfills the same invariants as `from.offsets()` + // Safety: sound because `values` fulfills the same invariants as `from.values()` Ok(unsafe { Utf8Array::::from_data_unchecked(data_type, offsets, values, validity) }) } /// Conversion to binary pub fn utf8_to_binary(from: &Utf8Array, to_data_type: DataType) -> BinaryArray { + // Safety: erasure of an invariant is always safe unsafe { - BinaryArray::::new_unchecked( + BinaryArray::::new( to_data_type, from.offsets().clone(), from.values().clone(), diff --git a/src/compute/length.rs b/src/compute/length.rs index 8cea7232894..9dc7e0b1c12 100644 --- a/src/compute/length.rs +++ b/src/compute/length.rs @@ -32,6 +32,7 @@ where { let values = array .offsets() + .buffer() .windows(2) .map(|offset| op(offset[1] - offset[0])) .collect::>(); diff --git a/src/compute/substring.rs b/src/compute/substring.rs index 1edab3a0cb1..95618c6a007 100644 --- a/src/compute/substring.rs +++ b/src/compute/substring.rs @@ -21,7 +21,7 @@ use crate::{ array::*, datatypes::DataType, error::{Error, Result}, - offset::Offset, + 
offset::{Offset, Offsets}, }; fn utf8_substring(array: &Utf8Array, start: O, length: &Option) -> Utf8Array { @@ -77,13 +77,10 @@ fn binary_substring( let offsets = array.offsets(); let values = array.values(); - let mut new_offsets = Vec::::with_capacity(array.len() + 1); + let mut new_offsets = Offsets::::with_capacity(array.len()); let mut new_values = Vec::::new(); // we have no way to estimate how much this will be. - let mut length_so_far = O::zero(); - new_offsets.push(length_so_far); - - offsets.windows(2).for_each(|windows| { + offsets.buffer().windows(2).for_each(|windows| { let length_i: O = windows[1] - windows[0]; // compute where we should start slicing this entry @@ -99,8 +96,9 @@ fn binary_substring( .unwrap_or(length_i) // .max(0) is not needed as it is guaranteed .min(windows[1] - start); // so we do not go beyond this entry - length_so_far += length; - new_offsets.push(length_so_far); + new_offsets + .try_push(length) + .expect("Substring is always smaller than original - overflow never happens"); // we need usize for ranges let start = start.to_usize(); diff --git a/src/compute/take/generic_binary.rs b/src/compute/take/generic_binary.rs index 3656aebc771..4fc4d01138d 100644 --- a/src/compute/take/generic_binary.rs +++ b/src/compute/take/generic_binary.rs @@ -2,17 +2,22 @@ use crate::{ array::{GenericBinaryArray, PrimitiveArray}, bitmap::{Bitmap, MutableBitmap}, buffer::Buffer, - offset::Offset, + offset::{Offset, Offsets, OffsetsBuffer}, }; use super::Index; -pub fn take_values(length: O, starts: &[O], offsets: &[O], values: &[u8]) -> Buffer { +pub fn take_values( + length: O, + starts: &[O], + offsets: &OffsetsBuffer, + values: &[u8], +) -> Buffer { let new_len = length.to_usize(); let mut buffer = Vec::with_capacity(new_len); starts .iter() - .zip(offsets.windows(2)) + .zip(offsets.buffer().windows(2)) .for_each(|(start_, window)| { let start = start_.to_usize(); let end = (*start_ + (window[1] - window[0])).to_usize(); @@ -23,12 +28,13 @@ pub 
fn take_values(length: O, starts: &[O], offsets: &[O], values: &[ // take implementation when neither values nor indices contain nulls pub fn take_no_validity( - offsets: &[O], + offsets: &OffsetsBuffer, values: &[u8], indices: &[I], -) -> (Buffer, Buffer, Option) { - let mut length = O::default(); +) -> (OffsetsBuffer, Buffer, Option) { + let mut length = O::zero(); let mut buffer = Vec::::new(); + let offsets = offsets.buffer(); let offsets = indices.iter().map(|index| { let index = index.to_usize(); let start = offsets[index]; @@ -40,10 +46,11 @@ pub fn take_no_validity( buffer.extend_from_slice(&values[_start..end]); length }); - let offsets = std::iter::once(O::default()) + let offsets = std::iter::once(O::zero()) .chain(offsets) - .collect::>() - .into(); + .collect::>(); + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }.into(); (offsets, buffer.into(), None) } @@ -52,7 +59,7 @@ pub fn take_no_validity( pub fn take_values_validity>( values: &A, indices: &[I], -) -> (Buffer, Buffer, Option) { +) -> (OffsetsBuffer, Buffer, Option) { let validity_values = values.validity().unwrap(); let validity = indices .iter() @@ -75,20 +82,24 @@ pub fn take_values_validity>( let offsets = std::iter::once(O::default()) .chain(offsets) .collect::>(); + // Safety: by construction offsets are monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }.into(); - let buffer = take_values(length, starts.as_slice(), offsets.as_slice(), values_values); + let buffer = take_values(length, starts.as_slice(), &offsets, values_values); - (offsets.into(), buffer, validity.into()) + (offsets, buffer, validity.into()) } // take implementation when only indices contain nulls pub fn take_indices_validity( - offsets: &[O], + offsets: &OffsetsBuffer, values: &[u8], indices: &PrimitiveArray, -) -> (Buffer, Buffer, Option) { +) -> (OffsetsBuffer, Buffer, Option) { let mut length = O::default(); + let offsets = 
offsets.buffer(); + let mut starts = Vec::::with_capacity(indices.len()); let offsets = indices.values().iter().map(|index| { let index = index.to_usize(); @@ -105,18 +116,19 @@ pub fn take_indices_validity( let offsets = std::iter::once(O::default()) .chain(offsets) .collect::>(); - let starts: Buffer = starts.into(); + // Safety: by construction offsets are monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }.into(); - let buffer = take_values(length, starts.as_slice(), offsets.as_slice(), values); + let buffer = take_values(length, &starts, &offsets, values); - (offsets.into(), buffer, indices.validity().cloned()) + (offsets, buffer, indices.validity().cloned()) } // take implementation when both indices and values contain nulls pub fn take_values_indices_validity>( values: &A, indices: &PrimitiveArray, -) -> (Buffer, Buffer, Option) { +) -> (OffsetsBuffer, Buffer, Option) { let mut length = O::default(); let mut validity = MutableBitmap::with_capacity(indices.len()); @@ -148,10 +160,10 @@ pub fn take_values_indices_validity>(); + // Safety: by construction offsets are monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }.into(); - let starts: Buffer = starts.into(); - - let buffer = take_values(length, starts.as_slice(), offsets.as_slice(), values_values); + let buffer = take_values(length, &starts, &offsets, values_values); - (offsets.into(), buffer, validity.into()) + (offsets, buffer, validity.into()) } diff --git a/src/io/avro/read/nested.rs b/src/io/avro/read/nested.rs index 04d9bcf43b6..7886bd0b81b 100644 --- a/src/io/avro/read/nested.rs +++ b/src/io/avro/read/nested.rs @@ -2,26 +2,24 @@ use crate::array::*; use crate::bitmap::*; use crate::datatypes::*; use crate::error::*; -use crate::offset::Offset; +use crate::offset::{Offset, Offsets}; /// Auxiliary struct #[derive(Debug)] pub struct DynMutableListArray { data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Box, validity: 
Option, } impl DynMutableListArray { pub fn new_from(values: Box, data_type: DataType, capacity: usize) -> Self { - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); assert_eq!(values.len(), 0); ListArray::::get_child_field(&data_type); Self { data_type, - offsets, + offsets: Offsets::::with_capacity(capacity), values, validity: None, } @@ -34,11 +32,11 @@ impl DynMutableListArray { #[inline] pub fn try_push_valid(&mut self) -> Result<()> { - let size = self.values.len(); - let size = O::from_usize(size).ok_or(Error::Overflow)?; - assert!(size >= *self.offsets.last().unwrap()); + let total_length = self.values.len(); + let offset = self.offsets.last().to_usize(); + let length = total_length.checked_sub(offset).ok_or(Error::Overflow)?; - self.offsets.push(size); + self.offsets.try_push_usize(length)?; if let Some(validity) = &mut self.validity { validity.push(true) } @@ -47,20 +45,15 @@ impl DynMutableListArray { #[inline] fn push_null(&mut self) { - self.offsets.push(self.last_offset()); + self.offsets.extend_constant(1); match &mut self.validity { Some(validity) => validity.push(false), None => self.init_validity(), } } - #[inline] - fn last_offset(&self) -> O { - *self.offsets.last().unwrap() - } - fn init_validity(&mut self) { - let len = self.offsets.len() - 1; + let len = self.offsets.len(); let mut validity = MutableBitmap::new(); validity.extend_constant(len, true); @@ -71,7 +64,7 @@ impl DynMutableListArray { impl MutableArray for DynMutableListArray { fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } fn validity(&self) -> Option<&MutableBitmap> { @@ -79,21 +72,23 @@ impl MutableArray for DynMutableListArray { } fn as_box(&mut self) -> Box { - Box::new(ListArray::new( + ListArray::new( self.data_type.clone(), std::mem::take(&mut self.offsets).into(), self.values.as_box(), std::mem::take(&mut self.validity).map(|x| x.into()), - )) + ) + .boxed() } fn as_arc(&mut self) -> std::sync::Arc { - 
std::sync::Arc::new(ListArray::new( + ListArray::new( self.data_type.clone(), std::mem::take(&mut self.offsets).into(), self.values.as_box(), std::mem::take(&mut self.validity).map(|x| x.into()), - )) + ) + .arced() } fn data_type(&self) -> &DataType { diff --git a/src/io/avro/write/serialize.rs b/src/io/avro/write/serialize.rs index 5310400ffb8..32dc18cc8eb 100644 --- a/src/io/avro/write/serialize.rs +++ b/src/io/avro/write/serialize.rs @@ -100,6 +100,7 @@ fn list_required<'a, O: Offset>(array: &'a ListArray, schema: &AvroSchema) -> let mut inner = new_serializer(array.values().as_ref(), schema); let lengths = array .offsets() + .buffer() .windows(2) .map(|w| (w[1] - w[0]).to_usize() as i64); @@ -125,6 +126,7 @@ fn list_optional<'a, O: Offset>(array: &'a ListArray, schema: &AvroSchema) -> let mut inner = new_serializer(array.values().as_ref(), schema); let lengths = array .offsets() + .buffer() .windows(2) .map(|w| (w[1] - w[0]).to_usize() as i64); let lengths = ZipValidity::new_with_validity(lengths, array.validity()); diff --git a/src/io/ipc/read/array/binary.rs b/src/io/ipc/read/array/binary.rs index eea120a5b2f..68c5b40d078 100644 --- a/src/io/ipc/read/array/binary.rs +++ b/src/io/ipc/read/array/binary.rs @@ -69,7 +69,7 @@ pub fn read_binary( scratch, )?; - BinaryArray::::try_new(data_type, offsets, values, validity) + BinaryArray::::try_new(data_type, offsets.try_into()?, values, validity) } pub fn skip_binary( diff --git a/src/io/ipc/read/array/list.rs b/src/io/ipc/read/array/list.rs index 8824ed86fe1..1b45b10730d 100644 --- a/src/io/ipc/read/array/list.rs +++ b/src/io/ipc/read/array/list.rs @@ -85,7 +85,7 @@ where version, scratch, )?; - ListArray::try_new(data_type, offsets, values, validity) + ListArray::try_new(data_type, offsets.try_into()?, values, validity) } pub fn skip_list( diff --git a/src/io/ipc/read/array/map.rs b/src/io/ipc/read/array/map.rs index b0803ca5f2f..4acec42ef6b 100644 --- a/src/io/ipc/read/array/map.rs +++ 
b/src/io/ipc/read/array/map.rs @@ -80,7 +80,7 @@ pub fn read_map( version, scratch, )?; - MapArray::try_new(data_type, offsets, field, validity) + MapArray::try_new(data_type, offsets.try_into()?, field, validity) } pub fn skip_map( diff --git a/src/io/ipc/read/array/utf8.rs b/src/io/ipc/read/array/utf8.rs index 1ff056d6f8c..398184e3e55 100644 --- a/src/io/ipc/read/array/utf8.rs +++ b/src/io/ipc/read/array/utf8.rs @@ -70,7 +70,7 @@ pub fn read_utf8( scratch, )?; - Utf8Array::::try_new(data_type, offsets, values, validity) + Utf8Array::::try_new(data_type, offsets.try_into()?, values, validity) } pub fn skip_utf8( diff --git a/src/io/ipc/write/serialize.rs b/src/io/ipc/write/serialize.rs index 7737cbca9cd..03e716ce83a 100644 --- a/src/io/ipc/write/serialize.rs +++ b/src/io/ipc/write/serialize.rs @@ -2,7 +2,11 @@ use arrow_format::ipc; use crate::{ - array::*, bitmap::Bitmap, datatypes::PhysicalType, offset::Offset, trusted_len::TrustedLen, + array::*, + bitmap::Bitmap, + datatypes::PhysicalType, + offset::{Offset, OffsetsBuffer}, + trusted_len::TrustedLen, types::NativeType, }; @@ -66,7 +70,7 @@ fn write_boolean( #[allow(clippy::too_many_arguments)] fn write_generic_binary( validity: Option<&Bitmap>, - offsets: &[O], + offsets: &OffsetsBuffer, values: &[u8], buffers: &mut Vec, arrow_data: &mut Vec, @@ -74,6 +78,7 @@ fn write_generic_binary( is_little_endian: bool, compression: Option, ) { + let offsets = offsets.buffer(); write_bitmap( validity, offsets.len() - 1, @@ -182,7 +187,7 @@ fn write_list( is_little_endian: bool, compression: Option, ) { - let offsets = array.offsets(); + let offsets = array.offsets().buffer(); let validity = array.validity(); write_bitmap( @@ -196,7 +201,7 @@ fn write_list( let first = *offsets.first().unwrap(); let last = *offsets.last().unwrap(); - if first == O::default() { + if first == O::zero() { write_buffer( offsets, buffers, @@ -310,7 +315,7 @@ fn write_map( is_little_endian: bool, compression: Option, ) { - let offsets = 
array.offsets(); + let offsets = array.offsets().buffer(); let validity = array.validity(); write_bitmap( diff --git a/src/io/json/read/deserialize.rs b/src/io/json/read/deserialize.rs index 1d3997b1da1..73eac81d4f9 100644 --- a/src/io/json/read/deserialize.rs +++ b/src/io/json/read/deserialize.rs @@ -12,7 +12,7 @@ use crate::{ chunk::Chunk, datatypes::{DataType, Field, IntervalUnit, Schema}, error::Error, - offset::Offset, + offset::{Offset, Offsets}, types::{f16, NativeType}, }; @@ -227,24 +227,19 @@ fn deserialize_list<'a, O: Offset, A: Borrow>>( let child = ListArray::::get_child_type(&data_type); let mut validity = MutableBitmap::with_capacity(rows.len()); - let mut offsets = Vec::::with_capacity(rows.len() + 1); + let mut offsets = Offsets::::with_capacity(rows.len()); let mut inner = vec![]; - offsets.push(O::zero()); - rows.iter().fold(O::zero(), |mut length, row| { - match row.borrow() { - Value::Array(value) => { - inner.extend(value.iter()); - validity.push(true); - // todo make this an Err - length += O::from_usize(value.len()).expect("List offset is too large :/"); - offsets.push(length); - length - } - _ => { - validity.push(false); - offsets.push(length); - length - } + rows.iter().for_each(|row| match row.borrow() { + Value::Array(value) => { + inner.extend(value.iter()); + validity.push(true); + offsets + .try_push_usize(value.len()) + .expect("List offset is too large :/"); + } + _ => { + validity.push(false); + offsets.extend_constant(1); } }); @@ -259,39 +254,25 @@ fn deserialize_list_into<'a, O: Offset, A: Borrow>>( target: &mut MutableListArray>, rows: &[A], ) { - let start = { - let empty = vec![]; - let inner: Vec<_> = rows - .iter() - .flat_map(|row| match row.borrow() { - Value::Array(value) => value.iter(), - _ => empty.iter(), - }) - .collect(); - - let child = target.mut_values(); - let start_len = child.len(); - deserialize_into(child, &inner); + let empty = vec![]; + let inner: Vec<_> = rows + .iter() + .flat_map(|row| match 
row.borrow() { + Value::Array(value) => value.iter(), + _ => empty.iter(), + }) + .collect(); - // todo make this an Err - O::from_usize(start_len).expect("Child list size too large") - }; + deserialize_into(target.mut_values(), &inner); - let mut position = start; - let arrays = rows.iter().map(|row| { - match row.borrow() { - Value::Array(value) => { - // todo make this an Err - position += O::from_usize(value.len()).expect("List offset is too large :/"); - Some(position) - } - _ => None, - } + let lengths = rows.iter().map(|row| match row.borrow() { + Value::Array(value) => Some(value.len()), + _ => None, }); - // though this will always be safe, we cannot use unsafe_extend_offsets here - // due to `#![forbid(unsafe_code)]` on the io module - target.extend_offsets(arrays); + target + .try_extend_from_lengths(lengths) + .expect("Offsets overflow"); } fn deserialize_fixed_size_list_into<'a, A: Borrow>>( @@ -302,10 +283,7 @@ fn deserialize_fixed_size_list_into<'a, A: Borrow>>( match row.borrow() { Value::Array(value) => { if value.len() == target.size() { - { - let child = target.mut_values(); - deserialize_into(child, value); - } + deserialize_into(target.mut_values(), value); // unless alignment is already off, the if above should // prevent this from ever happening. 
target.try_push_valid().expect("unaligned backing array"); diff --git a/src/io/json/write/serialize.rs b/src/io/json/write/serialize.rs index 9bf08fc8968..ec1b0d5ee49 100644 --- a/src/io/json/write/serialize.rs +++ b/src/io/json/write/serialize.rs @@ -141,7 +141,7 @@ fn list_serializer<'a, O: Offset>( let mut serializer = new_serializer(array.values().as_ref()); Box::new(BufStreamingIterator::new( - ZipValidity::new_with_validity(array.offsets().windows(2), array.validity()), + ZipValidity::new_with_validity(array.offsets().buffer().windows(2), array.validity()), move |offset, buf| { if let Some(offset) = offset { let length = (offset[1] - offset[0]).to_usize(); diff --git a/src/io/json_integration/read/array.rs b/src/io/json_integration/read/array.rs index 42fe220b96f..67af89a18b2 100644 --- a/src/io/json_integration/read/array.rs +++ b/src/io/json_integration/read/array.rs @@ -190,7 +190,7 @@ fn to_binary(json_col: &ArrowJsonColumn, data_type: DataType) -> Box< .iter() .flat_map(|value| value.as_str().map(|x| hex::decode(x).unwrap()).unwrap()) .collect(); - Box::new(BinaryArray::new(data_type, offsets, values, validity)) + BinaryArray::new(data_type, offsets.try_into().unwrap(), values, validity).boxed() } fn to_utf8(json_col: &ArrowJsonColumn, data_type: DataType) -> Box { @@ -203,7 +203,7 @@ fn to_utf8(json_col: &ArrowJsonColumn, data_type: DataType) -> Box( @@ -223,9 +223,7 @@ fn to_list( dictionaries, )?; let offsets = to_offsets::(json_col.offset.as_ref()); - Ok(Box::new(ListArray::::new( - data_type, offsets, values, validity, - ))) + Ok(ListArray::::new(data_type, offsets.try_into()?, values, validity).boxed()) } fn to_map( @@ -245,7 +243,12 @@ fn to_map( dictionaries, )?; let offsets = to_offsets::(json_col.offset.as_ref()); - Ok(Box::new(MapArray::new(data_type, offsets, field, validity))) + Ok(Box::new(MapArray::new( + data_type, + offsets.try_into().unwrap(), + field, + validity, + ))) } fn to_dictionary( diff --git a/src/io/odbc/read/deserialize.rs 
b/src/io/odbc/read/deserialize.rs index b98596850c2..7ebf79b8b9a 100644 --- a/src/io/odbc/read/deserialize.rs +++ b/src/io/odbc/read/deserialize.rs @@ -6,6 +6,7 @@ use crate::array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}; use crate::bitmap::{Bitmap, MutableBitmap}; use crate::buffer::Buffer; use crate::datatypes::{DataType, TimeUnit}; +use crate::offset::{Offsets, OffsetsBuffer}; use crate::types::NativeType; use super::super::api::buffers::AnyColumnView; @@ -118,22 +119,23 @@ fn bool_optional(data_type: DataType, values: &[Bit], indicators: &[isize]) -> B fn binary_generic<'a>( iter: impl Iterator>, -) -> (Buffer, Buffer, Option) { +) -> (OffsetsBuffer, Buffer, Option) { let length = iter.size_hint().0; let mut validity = MutableBitmap::with_capacity(length); let mut values = Vec::::with_capacity(0); - let mut offsets = Vec::with_capacity(length + 1); - offsets.push(0i32); - + let mut offsets = Offsets::::with_capacity(length); for item in iter { if let Some(item) = item { values.extend_from_slice(item); + offsets + .try_push_usize(item.len()) + .expect("List to contain less than i32::MAX items."); validity.push(true); } else { + offsets.extend_constant(1); validity.push(false); } - offsets.push(values.len() as i32) } (offsets.into(), values.into(), validity.into()) diff --git a/src/io/odbc/write/serialize.rs b/src/io/odbc/write/serialize.rs index 7f2fc18aa7c..f92326ba89c 100644 --- a/src/io/odbc/write/serialize.rs +++ b/src/io/odbc/write/serialize.rs @@ -160,6 +160,7 @@ fn fixed_binary(array: &FixedSizeBinaryArray, writer: &mut BinColumnWriter) { fn binary(array: &BinaryArray, writer: &mut BinColumnWriter) { let max_len = array .offsets() + .buffer() .windows(2) .map(|x| (x[1] - x[0]).to_usize()) .max() @@ -171,6 +172,7 @@ fn binary(array: &BinaryArray, writer: &mut BinColumnWriter) { fn utf8(array: &Utf8Array, writer: &mut TextColumnWriter) { let max_len = array .offsets() + .buffer() .windows(2) .map(|x| (x[1] - x[0]).to_usize()) .max() 
diff --git a/src/io/orc/read/mod.rs b/src/io/orc/read/mod.rs index 4a365078236..3fe4abb7f63 100644 --- a/src/io/orc/read/mod.rs +++ b/src/io/orc/read/mod.rs @@ -5,7 +5,7 @@ use crate::array::{Array, BinaryArray, BooleanArray, Int64Array, PrimitiveArray, use crate::bitmap::{Bitmap, MutableBitmap}; use crate::datatypes::{DataType, Field, Schema}; use crate::error::Error; -use crate::offset::Offset; +use crate::offset::{Offset, Offsets}; use crate::types::NativeType; use orc_format::proto::stream::Kind; @@ -250,23 +250,19 @@ where #[inline] fn try_extend, I: Iterator>( - offsets: &mut Vec, - length: &mut O, + offsets: &mut Offsets, iter: I, -) -> Result<(), orc_format::error::Error> { +) -> Result<(), Error> { for item in iter { - let item: O = item - .try_into() - .map_err(|_| orc_format::error::Error::OutOfSpec)?; - *length += item; - offsets.push(*length) + let length: O = item.try_into().map_err(|_| Error::Overflow)?; + offsets.try_push(length)? } Ok(()) } fn deserialize_binary_generic>( column: &Column, -) -> Result<(Vec, Vec, Option), Error> { +) -> Result<(Offsets, Vec, Option), Error> { let num_rows = column.number_of_rows(); let mut scratch = vec![]; @@ -274,9 +270,7 @@ fn deserialize_binary_generic>( let lengths = column.get_stream(Kind::Length, scratch)?; - let mut offsets = Vec::with_capacity(num_rows + 1); - let mut length = O::default(); - offsets.push(length); + let mut offsets = Offsets::with_capacity(num_rows); if let Some(validity) = &validity { let mut iter = decode::UnsignedRleV2Iter::new(lengths, validity.len() - validity.unset_bits(), vec![]); @@ -286,34 +280,35 @@ fn deserialize_binary_generic>( .next() .transpose()? 
.ok_or(orc_format::error::Error::OutOfSpec)?; - let item: O = item + let length: O = item .try_into() .map_err(|_| Error::ExternalFormat("value uncastable".to_string()))?; - length += item; + offsets.try_push(length)?; + } else { + offsets.extend_constant(1) } - offsets.push(length); } let (lengths, _) = iter.into_inner(); scratch = std::mem::take(&mut lengths.into_inner()); } else { let mut iter = decode::UnsignedRleV2RunIter::new(lengths, num_rows, vec![]); iter.try_for_each(|run| { - run.and_then(|run| match run { + run.map_err(Error::from).and_then(|run| match run { decode::UnsignedRleV2Run::Direct(values_iter) => { - try_extend(&mut offsets, &mut length, values_iter) + try_extend(&mut offsets, values_iter) } decode::UnsignedRleV2Run::Delta(values_iter) => { - try_extend(&mut offsets, &mut length, values_iter) + try_extend(&mut offsets, values_iter) } decode::UnsignedRleV2Run::ShortRepeat(values_iter) => { - try_extend(&mut offsets, &mut length, values_iter) + try_extend(&mut offsets, values_iter) } }) })?; let (lengths, _) = iter.into_inner(); scratch = std::mem::take(&mut lengths.into_inner()); } - let length = length.to_usize(); + let length = offsets.last().to_usize(); let mut values = vec![0; length]; let mut data = column.get_stream(Kind::Data, scratch)?; diff --git a/src/io/parquet/read/deserialize/binary/basic.rs b/src/io/parquet/read/deserialize/binary/basic.rs index 0a2aa098c45..14c805158f8 100644 --- a/src/io/parquet/read/deserialize/binary/basic.rs +++ b/src/io/parquet/read/deserialize/binary/basic.rs @@ -14,7 +14,7 @@ use crate::{ buffer::Buffer, datatypes::DataType, error::{Error, Result}, - offset::Offset, + offset::{Offset, OffsetsBuffer}, }; use super::super::utils::{ @@ -228,7 +228,7 @@ impl<'a> utils::PageState<'a> for State<'a> { pub trait TraitBinaryArray: Array + 'static { fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result @@ -239,7 +239,7 @@ pub trait 
TraitBinaryArray: Array + 'static { impl TraitBinaryArray for BinaryArray { fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result { @@ -250,7 +250,7 @@ impl TraitBinaryArray for BinaryArray { impl TraitBinaryArray for Utf8Array { fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result { @@ -373,22 +373,18 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { let Binary { offsets, values: values_, - last_offset, } = values; - let offset = *last_offset; + let last_offset = *offsets.last(); extend_from_decoder( validity, page_validity, Some(additional), offsets, - page_values.lengths.by_ref().map(|x| { - *last_offset += O::from_usize(x).unwrap(); - *last_offset - }), + page_values.lengths.by_ref(), ); - let length = *last_offset - offset; + let length = *offsets.last() - last_offset; let (consumed, remaining) = page_values.values.split_at(length.to_usize()); page_values.values = remaining; @@ -486,7 +482,7 @@ pub(super) fn finish>( ) -> Result { A::try_new( data_type.clone(), - values.offsets.0.into(), + values.offsets.into(), values.values.into(), validity.into(), ) diff --git a/src/io/parquet/read/deserialize/binary/dictionary.rs b/src/io/parquet/read/deserialize/binary/dictionary.rs index 5cf3c07d97b..6f883528ef8 100644 --- a/src/io/parquet/read/deserialize/binary/dictionary.rs +++ b/src/io/parquet/read/deserialize/binary/dictionary.rs @@ -67,11 +67,10 @@ fn read_dict(data_type: DataType, dict: &DictPage) -> Box match data_type.to_physical_type() { PhysicalType::Utf8 | PhysicalType::LargeUtf8 => { - Utf8Array::::new(data_type, data.offsets.0.into(), data.values.into(), None).boxed() + Utf8Array::::new(data_type, data.offsets.into(), data.values.into(), None).boxed() } PhysicalType::Binary | PhysicalType::LargeBinary => { - BinaryArray::::new(data_type, data.offsets.0.into(), data.values.into(), None) - .boxed() + 
BinaryArray::::new(data_type, data.offsets.into(), data.values.into(), None).boxed() } _ => unreachable!(), } diff --git a/src/io/parquet/read/deserialize/binary/utils.rs b/src/io/parquet/read/deserialize/binary/utils.rs index d886c1bfae6..ddf7abc2a06 100644 --- a/src/io/parquet/read/deserialize/binary/utils.rs +++ b/src/io/parquet/read/deserialize/binary/utils.rs @@ -1,4 +1,4 @@ -use crate::offset::Offset; +use crate::offset::{Offset, Offsets}; use super::super::utils::Pushable; @@ -7,70 +7,51 @@ use super::super::utils::Pushable; pub struct Binary { pub offsets: Offsets, pub values: Vec, - pub last_offset: O, } -#[derive(Debug)] -pub struct Offsets(pub Vec); - -impl Offsets { - #[inline] - pub fn extend_lengths>(&mut self, lengths: I) { - let mut last_offset = *self.0.last().unwrap(); - self.0.extend(lengths.map(|length| { - last_offset += O::from_usize(length).unwrap(); - last_offset - })); - } -} - -impl Pushable for Offsets { +impl Pushable for Offsets { fn reserve(&mut self, additional: usize) { - self.0.reserve(additional) + self.reserve(additional) } #[inline] fn len(&self) -> usize { - self.0.len() - 1 + self.len() } #[inline] - fn push(&mut self, value: O) { - self.0.push(value) + fn push(&mut self, value: usize) { + self.try_push_usize(value).unwrap() } #[inline] fn push_null(&mut self) { - self.0.push(*self.0.last().unwrap()) + self.extend_constant(1); } #[inline] - fn extend_constant(&mut self, additional: usize, value: O) { - self.0.extend_constant(additional, value) + fn extend_constant(&mut self, additional: usize, _: usize) { + self.extend_constant(additional) } } impl Binary { #[inline] pub fn with_capacity(capacity: usize) -> Self { - let mut offsets = Vec::with_capacity(1 + capacity); - offsets.push(O::default()); Self { - offsets: Offsets(offsets), + offsets: Offsets::with_capacity(capacity), values: Vec::with_capacity(capacity * 24), - last_offset: O::default(), } } #[inline] pub fn push(&mut self, v: &[u8]) { self.values.extend(v); - 
self.last_offset += O::from_usize(v.len()).unwrap(); - self.offsets.push(self.last_offset) + self.offsets.try_push_usize(v.len()).unwrap() } #[inline] pub fn extend_constant(&mut self, additional: usize) { - self.offsets.extend_constant(additional, self.last_offset); + self.offsets.extend_constant(additional); } #[inline] @@ -80,10 +61,10 @@ impl Binary { #[inline] pub fn extend_lengths>(&mut self, lengths: I, values: &mut &[u8]) { - let current_offset = self.last_offset; - self.offsets.extend_lengths(lengths); - self.last_offset = *self.offsets.0.last().unwrap(); // guaranteed to have one - let length = self.last_offset.to_usize() - current_offset.to_usize(); + let current_offset = *self.offsets.last(); + self.offsets.try_extend_from_lengths(lengths).unwrap(); + let new_offset = *self.offsets.last(); + let length = new_offset.to_usize() - current_offset.to_usize(); let (consumed, remaining) = values.split_at(length); *values = remaining; self.values.extend_from_slice(consumed); @@ -93,7 +74,7 @@ impl Binary { impl<'a, O: Offset> Pushable<&'a [u8]> for Binary { #[inline] fn reserve(&mut self, additional: usize) { - let avg_len = self.values.len() / std::cmp::max(self.last_offset.to_usize(), 1); + let avg_len = self.values.len() / std::cmp::max(self.offsets.last().to_usize(), 1); self.values.reserve(additional * avg_len); self.offsets.reserve(additional); } diff --git a/src/io/parquet/read/deserialize/mod.rs b/src/io/parquet/read/deserialize/mod.rs index b16d1a6e83d..d3baa7879be 100644 --- a/src/io/parquet/read/deserialize/mod.rs +++ b/src/io/parquet/read/deserialize/mod.rs @@ -18,6 +18,7 @@ use crate::{ array::{Array, DictionaryKey, FixedSizeListArray, ListArray}, datatypes::{DataType, Field, IntervalUnit}, error::Result, + offset::Offsets, }; use self::nested_utils::{InitNested, NestedArrayIter, NestedState}; @@ -53,6 +54,11 @@ fn create_list( offsets.push(values.len() as i64); let offsets = offsets.iter().map(|x| *x as i32).collect::>(); + + let offsets: Offsets 
= offsets + .try_into() + .expect("i64 offsets do not fit in i32 offsets"); + Box::new(ListArray::::new( data_type, offsets.into(), @@ -65,7 +71,7 @@ fn create_list( Box::new(ListArray::::new( data_type, - offsets.into(), + offsets.try_into().expect("List too large"), values, validity.and_then(|x| x.into()), )) diff --git a/src/io/parquet/read/deserialize/nested.rs b/src/io/parquet/read/deserialize/nested.rs index 389d0479fec..065190fbb7e 100644 --- a/src/io/parquet/read/deserialize/nested.rs +++ b/src/io/parquet/read/deserialize/nested.rs @@ -295,7 +295,7 @@ where let (nested, inner) = x?; let array = MapArray::new( field.data_type().clone(), - vec![0, inner.len() as i32].into(), + vec![0, inner.len() as i32].try_into().unwrap(), inner, None, ); diff --git a/src/io/parquet/read/statistics/list.rs b/src/io/parquet/read/statistics/list.rs index 9d0adbcb0cc..047b5e07700 100644 --- a/src/io/parquet/read/statistics/list.rs +++ b/src/io/parquet/read/statistics/list.rs @@ -1,6 +1,7 @@ use crate::array::*; use crate::datatypes::DataType; use crate::error::Result; +use crate::offset::Offsets; use super::make_mutable; @@ -40,19 +41,21 @@ impl MutableArray for DynMutableListArray { match self.data_type.to_logical_type() { DataType::List(_) => { - let offsets = (0..=inner.len() as i32).collect::>().into(); + let offsets = + Offsets::try_from_lengths(std::iter::repeat(1).take(inner.len())).unwrap(); Box::new(ListArray::::new( self.data_type.clone(), - offsets, + offsets.into(), inner, None, )) } DataType::LargeList(_) => { - let offsets = (0..=inner.len() as i64).collect::>().into(); + let offsets = + Offsets::try_from_lengths(std::iter::repeat(1).take(inner.len())).unwrap(); Box::new(ListArray::::new( self.data_type.clone(), - offsets, + offsets.into(), inner, None, )) diff --git a/src/io/parquet/read/statistics/map.rs b/src/io/parquet/read/statistics/map.rs index c31dfb9d9e7..db3678510f8 100644 --- a/src/io/parquet/read/statistics/map.rs +++ 
b/src/io/parquet/read/statistics/map.rs @@ -40,7 +40,7 @@ impl MutableArray for DynMutableMapArray { fn as_box(&mut self) -> Box { Box::new(MapArray::new( self.data_type.clone(), - vec![0, self.inner.len() as i32].into(), + vec![0, self.inner.len() as i32].try_into().unwrap(), self.inner.as_box(), None, )) diff --git a/src/io/parquet/write/binary/basic.rs b/src/io/parquet/write/binary/basic.rs index f2a5071c14d..55886d0f630 100644 --- a/src/io/parquet/write/binary/basic.rs +++ b/src/io/parquet/write/binary/basic.rs @@ -64,7 +64,7 @@ pub fn array_to_page( Encoding::Plain => encode_plain(array, is_optional, &mut buffer), Encoding::DeltaLengthByteArray => encode_delta( array.values(), - array.offsets(), + array.offsets().buffer(), array.validity(), is_optional, &mut buffer, diff --git a/src/io/parquet/write/pages.rs b/src/io/parquet/write/pages.rs index 2259647d79c..e3e1eec410f 100644 --- a/src/io/parquet/write/pages.rs +++ b/src/io/parquet/write/pages.rs @@ -106,7 +106,7 @@ fn to_nested_recursive<'a>( }; parents.push(Nested::List(ListNested::new( - array.offsets(), + array.offsets().buffer(), array.validity(), is_optional, ))); @@ -129,7 +129,7 @@ fn to_nested_recursive<'a>( }; parents.push(Nested::LargeList(ListNested::new( - array.offsets(), + array.offsets().buffer(), array.validity(), is_optional, ))); @@ -418,7 +418,7 @@ mod tests { let array = ListArray::new( DataType::List(Box::new(Field::new("l", array.data_type().clone(), true))), - vec![0i32, 2, 4].into(), + vec![0i32, 2, 4].try_into().unwrap(), Box::new(array), None, ); diff --git a/src/io/parquet/write/utf8/basic.rs b/src/io/parquet/write/utf8/basic.rs index 1f1aeaab8fd..744d3bce7b1 100644 --- a/src/io/parquet/write/utf8/basic.rs +++ b/src/io/parquet/write/utf8/basic.rs @@ -63,7 +63,7 @@ pub fn array_to_page( Encoding::Plain => encode_plain(array, is_optional, &mut buffer), Encoding::DeltaLengthByteArray => encode_delta( array.values(), - array.offsets(), + array.offsets().buffer(), array.validity(), 
is_optional, &mut buffer, diff --git a/src/offset.rs b/src/offset.rs index edca7dc8b38..717214e7d9f 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -1,2 +1,447 @@ //! Contains the declaration of [`Offset`] +use std::hint::unreachable_unchecked; + +use crate::buffer::Buffer; +use crate::error::Error; pub use crate::types::Offset; + +/// A wrapper type of [`Vec`] representing the invariants of Arrow's offsets. +/// It is guaranteed to (sound to assume that): +/// * every element is `>= 0` +/// * element at position `i` is >= than element at position `i-1`. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Offsets(Vec); + +impl Default for Offsets { + #[inline] + fn default() -> Self { + Self::new() + } +} + +impl TryFrom> for Offsets { + type Error = Error; + + #[inline] + fn try_from(offsets: Vec) -> Result { + try_check_offsets(&offsets)?; + Ok(Self(offsets)) + } +} + +impl TryFrom> for OffsetsBuffer { + type Error = Error; + + #[inline] + fn try_from(offsets: Buffer) -> Result { + try_check_offsets(&offsets)?; + Ok(Self(offsets)) + } +} + +impl TryFrom> for OffsetsBuffer { + type Error = Error; + + #[inline] + fn try_from(offsets: Vec) -> Result { + try_check_offsets(&offsets)?; + Ok(Self(offsets.into())) + } +} + +impl From> for OffsetsBuffer { + #[inline] + fn from(offsets: Offsets) -> Self { + Self(offsets.0.into()) + } +} + +impl Offsets { + /// Returns an empty [`Offsets`] (i.e. with a single element, the zero) + #[inline] + pub fn new() -> Self { + Self(vec![O::zero()]) + } + + /// Creates a new [`Offsets`] from an iterator of lengths + #[inline] + pub fn try_from_iter>(iter: I) -> Result { + let iterator = iter.into_iter(); + let (lower, _) = iterator.size_hint(); + let mut offsets = Self::with_capacity(lower); + for item in iterator { + offsets.try_push_usize(item)? + } + Ok(offsets) + } + + /// Returns a new [`Offsets`] with a capacity, allocating at least `capacity + 1` entries. 
+ pub fn with_capacity(capacity: usize) -> Self { + let mut offsets = Vec::with_capacity(capacity + 1); + offsets.push(O::zero()); + Self(offsets) + } + + /// Returns the capacity of [`Offsets`]. + pub fn capacity(&self) -> usize { + self.0.capacity() - 1 + } + + /// Reserves `additional` entries. + pub fn reserve(&mut self, additional: usize) { + self.0.reserve(additional); + } + + /// Shrinks the capacity of self to fit. + pub fn shrink_to_fit(&mut self) { + self.0.shrink_to_fit(); + } + + /// Pushes a new element with a given length. + /// # Error + /// This function errors iff the new last item is larger than what `O` supports. + /// # Panic + /// This function asserts that `length >= 0`. + #[inline] + pub fn try_push(&mut self, length: O) -> Result<(), Error> { + let old_length = self.last(); + assert!(length >= O::zero()); + let new_length = old_length.checked_add(&length).ok_or(Error::Overflow)?; + self.0.push(new_length); + Ok(()) + } + + /// Pushes a new element with a given length. + /// # Error + /// This function errors iff the new last item is larger than what `O` supports. + /// # Implementation + /// This function: + /// * checks that this length does not overflow + #[inline] + pub fn try_push_usize(&mut self, length: usize) -> Result<(), Error> { + let length = O::from_usize(length).ok_or(Error::Overflow)?; + + let old_length = self.last(); + let new_length = old_length.checked_add(&length).ok_or(Error::Overflow)?; + self.0.push(new_length); + Ok(()) + } + + /// Returns [`Offsets`] assuming that `offsets` fulfills its invariants + /// # Safety + /// This is safe iff the invariants of this struct are guaranteed in `offsets`. + #[inline] + pub unsafe fn new_unchecked(offsets: Vec) -> Self { + Self(offsets) + } + + /// Returns the last offset of this container. 
+ #[inline] + pub fn last(&self) -> &O { + match self.0.last() { + Some(element) => element, + None => unsafe { unreachable_unchecked() }, + } + } + + /// Returns the length of this container + #[inline] + pub fn len(&self) -> usize { + self.0.len() - 1 + } + + /// Returns the offsets stored in this container as a slice + #[inline] + pub fn as_slice(&self) -> &[O] { + self.0.as_slice() + } + + /// Pops the last element + #[inline] + pub fn pop(&mut self) -> Option { + if self.len() == 0 { + None + } else { + self.0.pop() + } + } + + /// Extends itself with `additional` elements equal to the last offset. + /// This is useful to extend offsets with empty values, e.g. for null slots. + #[inline] + pub fn extend_constant(&mut self, additional: usize) { + let offset = *self.last(); + if additional == 1 { + self.0.push(offset) + } else { + self.0.resize(self.len() + additional, offset) + } + } + + /// Try to create a new [`Offsets`] from a sequence of `lengths` + /// # Errors + /// This function errors iff this operation overflows for the maximum value of `O`. + #[inline] + pub fn try_from_lengths>(lengths: I) -> Result { + let mut self_ = Self::with_capacity(lengths.size_hint().0); + self_.try_extend_from_lengths(lengths)?; + Ok(self_) + } + + /// Tries to extend itself from an iterator of lengths + /// # Errors + /// This function errors iff this operation overflows for the maximum value of `O`. 
+ #[inline] + pub fn try_extend_from_lengths>( + &mut self, + lengths: I, + ) -> Result<(), Error> { + let mut total_length = 0; + let mut offset = *self.last(); + let original_offset = offset.to_usize(); + + let lengths = lengths.map(|length| { + total_length += length; + O::from_as_usize(length) + }); + + let offsets = lengths.map(|length| { + offset += length; // this may overflow, checked below + offset + }); + self.0.extend(offsets); + + let last_offset = original_offset + .checked_add(total_length) + .ok_or(Error::Overflow)?; + O::from_usize(last_offset).ok_or(Error::Overflow)?; + Ok(()) + } + + /// Extends itself from another [`Offsets`] + /// # Errors + /// This function errors iff this operation overflows for the maximum value of `O`. + pub fn try_extend_from_self(&mut self, other: &Self) -> Result<(), Error> { + let mut length = *self.last(); + let other_length = *other.last(); + // check if the operation would overflow + length.checked_add(&other_length).ok_or(Error::Overflow)?; + + let lengths = other.as_slice().windows(2).map(|w| w[1] - w[0]); + let offsets = lengths.map(|new_length| { + length += new_length; + length + }); + self.0.extend(offsets); + Ok(()) + } + + /// Extends itself from another [`Offsets`] sliced by `start, length` + /// # Errors + /// This function errors iff this operation overflows for the maximum value of `O`. 
+ pub fn try_extend_from_slice( + &mut self, + other: &OffsetsBuffer, + start: usize, + length: usize, + ) -> Result<(), Error> { + if length == 0 { + return Ok(()); + } + let other = &other.0[start..start + length + 1]; + let other_length = other.last().expect("Length to be non-zero"); + let mut length = *self.last(); + // check if the operation would overflow + length.checked_add(other_length).ok_or(Error::Overflow)?; + + let lengths = other.windows(2).map(|w| w[1] - w[0]); + let offsets = lengths.map(|new_length| { + length += new_length; + length + }); + self.0.extend(offsets); + Ok(()) + } + + /// Returns the inner [`Vec`]. + #[inline] + pub fn into_inner(self) -> Vec { + self.0 + } +} + +/// Checks that `offsets` is monotonically increasing. +fn try_check_offsets(offsets: &[O]) -> Result<(), Error> { + // this code is carefully constructed to auto-vectorize, don't change naively! + match offsets.first() { + None => Err(Error::oos("offsets must have at least one element")), + Some(first) => { + if *first < O::zero() { + return Err(Error::oos("offsets must be larger than 0")); + } + let mut previous = *first; + let mut any_invalid = false; + + // This loop will auto-vectorize because there is not any break, + // an invalid value will be returned once the whole offsets buffer is processed. + for offset in offsets { + if previous > *offset { + any_invalid = true + } + previous = *offset; + } + + if any_invalid { + Err(Error::oos("offsets must be monotonically increasing")) + } else { + Ok(()) + } + } + } +} + +/// A wrapper type of [`Buffer`] that is guaranteed to: +/// * Always contain an element +/// * element at position `i` is >= than element at position `i-1`. +#[derive(Clone, PartialEq, Debug)] +pub struct OffsetsBuffer(Buffer); + +impl Default for OffsetsBuffer { + #[inline] + fn default() -> Self { + Self(vec![O::zero()].into()) + } +} + +impl OffsetsBuffer { + /// # Safety + /// This is safe iff the invariants of this struct are guaranteed in `offsets`. 
+ #[inline] + pub unsafe fn new_unchecked(offsets: Buffer) -> Self { + Self(offsets) + } + + /// Returns an empty [`OffsetsBuffer`] (i.e. with a single element, the zero) + #[inline] + pub fn new() -> Self { + Self(vec![O::zero()].into()) + } + + /// Copy-on-write API to convert [`OffsetsBuffer`] into [`Offsets`]. + #[inline] + pub fn get_mut(&mut self) -> Option> { + self.0 + .get_mut() + .map(|x| { + let mut new = vec![O::zero()]; + std::mem::swap(x, &mut new); + new + }) + // Safety: Offsets and OffsetsBuffer share invariants + .map(|offsets| unsafe { Offsets::new_unchecked(offsets) }) + } + + /// Returns a reference to its internal [`Buffer`]. + #[inline] + pub fn buffer(&self) -> &Buffer { + &self.0 + } + + /// Returns the length of this container + #[inline] + pub fn len(&self) -> usize { + self.0.len() - 1 + } + + /// Returns the offsets stored in this buffer as a slice + #[inline] + pub fn as_slice(&self) -> &[O] { + self.0.as_slice() + } + + /// Returns the last offset of this container, which is guaranteed to exist. + #[inline] + pub fn last(&self) -> &O { + match self.0.last() { + Some(element) => element, + None => unsafe { unreachable_unchecked() }, + } + } + + /// Returns a new [`OffsetsBuffer`] that is a slice of this buffer starting at `offset`. + /// Doing so allows the same memory region to be shared between buffers. + /// # Safety + /// The caller must ensure `offset + length <= self.len()` + #[inline] + pub unsafe fn slice_unchecked(self, offset: usize, length: usize) -> Self { + Self(self.0.slice_unchecked(offset, length)) + } + + /// Returns the inner [`Buffer`]. 
+ #[inline] + pub fn into_inner(self) -> Buffer { + self.0 + } +} + +impl From<&OffsetsBuffer> for OffsetsBuffer { + fn from(offsets: &OffsetsBuffer) -> Self { + // this conversion is lossless and upholds all invariants + Self( + offsets + .buffer() + .iter() + .map(|x| *x as i64) + .collect::>() + .into(), + ) + } +} + +impl TryFrom<&OffsetsBuffer> for OffsetsBuffer { + type Error = Error; + + fn try_from(offsets: &OffsetsBuffer) -> Result { + i32::try_from(*offsets.last()).map_err(|_| Error::Overflow)?; + + // this conversion is lossless and upholds all invariants + Ok(Self( + offsets + .buffer() + .iter() + .map(|x| *x as i32) + .collect::>() + .into(), + )) + } +} + +impl From> for Offsets { + fn from(offsets: Offsets) -> Self { + // this conversion is lossless and upholds all invariants + Self( + offsets + .as_slice() + .iter() + .map(|x| *x as i64) + .collect::>(), + ) + } +} + +impl TryFrom> for Offsets { + type Error = Error; + + fn try_from(offsets: Offsets) -> Result { + i32::try_from(*offsets.last()).map_err(|_| Error::Overflow)?; + + // this conversion is lossless and upholds all invariants + Ok(Self( + offsets + .as_slice() + .iter() + .map(|x| *x as i32) + .collect::>(), + )) + } +} diff --git a/src/types/index.rs b/src/types/index.rs index b44b3957e79..264720fbe7a 100644 --- a/src/types/index.rs +++ b/src/types/index.rs @@ -21,6 +21,9 @@ pub trait Index: /// Convert itself from [`usize`]. fn from_usize(index: usize) -> Option; + /// Convert itself from [`usize`] using an `as` cast, without checking for overflow. + fn from_as_usize(index: usize) -> Self; + /// An iterator from (inclusive) `start` to (exclusive) `end`. fn range(start: usize, end: usize) -> Option> { let start = Self::from_usize(start); @@ -44,6 +47,11 @@ macro_rules! 
index { fn from_usize(value: usize) -> Option { Self::try_from(value).ok() } + + #[inline] + fn from_as_usize(value: usize) -> Self { + value as $t + } } }; } diff --git a/tests/it/array/binary/mod.rs b/tests/it/array/binary/mod.rs index 03cee86be1b..7609058afa5 100644 --- a/tests/it/array/binary/mod.rs +++ b/tests/it/array/binary/mod.rs @@ -3,6 +3,7 @@ use arrow2::{ bitmap::Bitmap, buffer::Buffer, datatypes::DataType, + offset::OffsetsBuffer, }; mod mutable; @@ -98,7 +99,7 @@ fn with_validity() { #[test] #[should_panic] fn wrong_offsets() { - let offsets = Buffer::from(vec![0, 5, 4]); // invalid offsets + let offsets = vec![0, 5, 4].try_into().unwrap(); // invalid offsets let values = Buffer::from(b"abbbbb".to_vec()); BinaryArray::::from_data(DataType::Binary, offsets, values, None); } @@ -106,7 +107,7 @@ fn wrong_offsets() { #[test] #[should_panic] fn wrong_data_type() { - let offsets = Buffer::from(vec![0, 4]); + let offsets = vec![0, 4].try_into().unwrap(); let values = Buffer::from(b"abbb".to_vec()); BinaryArray::::from_data(DataType::Int8, offsets, values, None); } @@ -114,7 +115,7 @@ fn wrong_data_type() { #[test] #[should_panic] fn value_with_wrong_offsets_panics() { - let offsets = Buffer::from(vec![0, 10, 11, 4]); + let offsets = vec![0, 10, 11, 4].try_into().unwrap(); let values = Buffer::from(b"abbb".to_vec()); // the 10-11 is not checked let array = BinaryArray::::from_data(DataType::Binary, offsets, values, None); @@ -127,7 +128,7 @@ fn value_with_wrong_offsets_panics() { #[test] #[should_panic] fn index_out_of_bounds_panics() { - let offsets = Buffer::from(vec![0, 1, 2, 4]); + let offsets = vec![0, 1, 2, 4].try_into().unwrap(); let values = Buffer::from(b"abbb".to_vec()); let array = BinaryArray::::from_data(DataType::Utf8, offsets, values, None); @@ -137,7 +138,7 @@ fn index_out_of_bounds_panics() { #[test] #[should_panic] fn value_unchecked_with_wrong_offsets_panics() { - let offsets = Buffer::from(vec![0, 10, 11, 4]); + let offsets = vec![0, 10, 
11, 4].try_into().unwrap(); let values = Buffer::from(b"abbb".to_vec()); // the 10-11 is not checked let array = BinaryArray::::from_data(DataType::Binary, offsets, values, None); @@ -157,7 +158,7 @@ fn debug() { #[test] fn into_mut_1() { - let offsets = Buffer::from(vec![0, 1]); + let offsets = vec![0, 1].try_into().unwrap(); let values = Buffer::from(b"a".to_vec()); let a = values.clone(); // cloned values assert_eq!(a, values); @@ -167,7 +168,7 @@ fn into_mut_1() { #[test] fn into_mut_2() { - let offsets = Buffer::from(vec![0, 1]); + let offsets: OffsetsBuffer = vec![0, 1].try_into().unwrap(); let values = Buffer::from(b"a".to_vec()); let a = offsets.clone(); // cloned offsets assert_eq!(a, offsets); @@ -177,7 +178,7 @@ fn into_mut_2() { #[test] fn into_mut_3() { - let offsets = Buffer::from(vec![0, 1]); + let offsets = vec![0, 1].try_into().unwrap(); let values = Buffer::from(b"a".to_vec()); let validity = Some([true].into()); let a = validity.clone(); // cloned validity @@ -188,7 +189,7 @@ fn into_mut_3() { #[test] fn into_mut_4() { - let offsets = Buffer::from(vec![0, 1]); + let offsets = vec![0, 1].try_into().unwrap(); let values = Buffer::from(b"a".to_vec()); let validity = Some([true].into()); let array = BinaryArray::::new(DataType::Binary, offsets, values, validity); diff --git a/tests/it/array/binary/mutable.rs b/tests/it/array/binary/mutable.rs index 8968d1cb15c..012e79d5e4a 100644 --- a/tests/it/array/binary/mutable.rs +++ b/tests/it/array/binary/mutable.rs @@ -10,12 +10,12 @@ fn new() { let a = MutableBinaryArray::::with_capacity(2); assert_eq!(a.len(), 0); - assert!(a.offsets().capacity() >= 3); + assert!(a.offsets().capacity() >= 2); assert_eq!(a.values().capacity(), 0); let a = MutableBinaryArray::::with_capacities(2, 60); assert_eq!(a.len(), 0); - assert!(a.offsets().capacity() >= 3); + assert!(a.offsets().capacity() >= 2); assert!(a.values().capacity() >= 60); } @@ -24,12 +24,12 @@ fn from_iter() { let iter = (0..3u8).map(|x| Some(vec![x; x as 
usize])); let a: MutableBinaryArray = iter.clone().collect(); assert_eq!(a.values().deref(), &[1u8, 2, 2]); - assert_eq!(a.offsets().deref(), &[0, 0, 1, 3]); + assert_eq!(a.offsets().as_slice(), &[0, 0, 1, 3]); assert_eq!(a.validity(), None); let a = unsafe { MutableBinaryArray::::from_trusted_len_iter_unchecked(iter) }; assert_eq!(a.values().deref(), &[1u8, 2, 2]); - assert_eq!(a.offsets().deref(), &[0, 0, 1, 3]); + assert_eq!(a.offsets().as_slice(), &[0, 0, 1, 3]); assert_eq!(a.validity(), None); } @@ -38,12 +38,12 @@ fn from_trusted_len_iter() { let data = vec![vec![0; 0], vec![1; 1], vec![2; 2]]; let a: MutableBinaryArray = data.iter().cloned().map(Some).collect(); assert_eq!(a.values().deref(), &[1u8, 2, 2]); - assert_eq!(a.offsets().deref(), &[0, 0, 1, 3]); + assert_eq!(a.offsets().as_slice(), &[0, 0, 1, 3]); assert_eq!(a.validity(), None); let a = MutableBinaryArray::::from_trusted_len_iter(data.iter().cloned().map(Some)); assert_eq!(a.values().deref(), &[1u8, 2, 2]); - assert_eq!(a.offsets().deref(), &[0, 0, 1, 3]); + assert_eq!(a.offsets().as_slice(), &[0, 0, 1, 3]); assert_eq!(a.validity(), None); let a = MutableBinaryArray::::try_from_trusted_len_iter::( @@ -51,12 +51,12 @@ fn from_trusted_len_iter() { ) .unwrap(); assert_eq!(a.values().deref(), &[1u8, 2, 2]); - assert_eq!(a.offsets().deref(), &[0, 0, 1, 3]); + assert_eq!(a.offsets().as_slice(), &[0, 0, 1, 3]); assert_eq!(a.validity(), None); let a = MutableBinaryArray::::from_trusted_len_values_iter(data.iter().cloned()); assert_eq!(a.values().deref(), &[1u8, 2, 2]); - assert_eq!(a.offsets().deref(), &[0, 0, 1, 3]); + assert_eq!(a.offsets().as_slice(), &[0, 0, 1, 3]); assert_eq!(a.validity(), None); } diff --git a/tests/it/array/binary/mutable_values.rs b/tests/it/array/binary/mutable_values.rs index af02b1d54b3..0bf532bc21c 100644 --- a/tests/it/array/binary/mutable_values.rs +++ b/tests/it/array/binary/mutable_values.rs @@ -7,35 +7,28 @@ fn capacity() { let mut b = 
MutableBinaryValuesArray::::with_capacity(100); assert_eq!(b.values().capacity(), 0); - assert!(b.offsets().capacity() >= 101); + assert!(b.offsets().capacity() >= 100); b.shrink_to_fit(); - assert!(b.offsets().capacity() < 101); -} - -#[test] -fn offsets_must_be_monotonic_increasing() { - let offsets = vec![0, 5, 4]; - let values = b"abbbbb".to_vec(); - assert!(MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).is_err()); + assert!(b.offsets().capacity() < 100); } #[test] fn offsets_must_be_in_bounds() { - let offsets = vec![0, 10]; + let offsets = vec![0, 10].try_into().unwrap(); let values = b"abbbbb".to_vec(); assert!(MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).is_err()); } #[test] fn data_type_must_be_consistent() { - let offsets = vec![0, 4]; + let offsets = vec![0, 4].try_into().unwrap(); let values = b"abbb".to_vec(); assert!(MutableBinaryValuesArray::::try_new(DataType::Int32, offsets, values).is_err()); } #[test] fn as_box() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).unwrap(); @@ -44,7 +37,7 @@ fn as_box() { #[test] fn as_arc() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).unwrap(); @@ -53,13 +46,13 @@ fn as_arc() { #[test] fn extend_trusted_len() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).unwrap(); b.extend_trusted_len(vec!["a", "b"].into_iter()); - let offsets = vec![0, 2, 3, 4]; + let offsets = vec![0, 2, 3, 4].try_into().unwrap(); let values = b"abab".to_vec(); assert_eq!( b.as_box(), @@ -73,7 +66,7 @@ fn extend_trusted_len() { fn from_trusted_len() { let mut b = 
MutableBinaryValuesArray::::from_trusted_len_iter(vec!["a", "b"].into_iter()); - let offsets = vec![0, 1, 2]; + let offsets = vec![0, 1, 2].try_into().unwrap(); let values = b"ab".to_vec(); assert_eq!( b.as_box(), @@ -85,7 +78,7 @@ fn from_trusted_len() { #[test] fn extend_from_iter() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).unwrap(); @@ -94,7 +87,7 @@ fn extend_from_iter() { let a = b.clone(); b.extend_trusted_len(a.iter()); - let offsets = vec![0, 2, 3, 4, 6, 7, 8]; + let offsets = vec![0, 2, 3, 4, 6, 7, 8].try_into().unwrap(); let values = b"abababab".to_vec(); assert_eq!( b.as_box(), diff --git a/tests/it/array/binary/to_mutable.rs b/tests/it/array/binary/to_mutable.rs index b553d85d050..1773c83a362 100644 --- a/tests/it/array/binary/to_mutable.rs +++ b/tests/it/array/binary/to_mutable.rs @@ -12,7 +12,7 @@ fn shared_validity() { let validity = Bitmap::from([true]); let array = BinaryArray::::new( DataType::Binary, - vec![0, 1].into(), + vec![0, 1].try_into().unwrap(), b"a".to_vec().into(), Some(validity.clone()), ); @@ -25,7 +25,7 @@ fn shared_values() { let values: Buffer = b"a".to_vec().into(); let array = BinaryArray::::new( DataType::Binary, - vec![0, 1].into(), + vec![0, 1].try_into().unwrap(), values.clone(), Some(Bitmap::from([true])), ); @@ -39,7 +39,7 @@ fn shared_offsets_values() { let values: Buffer = b"a".to_vec().into(); let array = BinaryArray::::new( DataType::Binary, - offsets.clone(), + offsets.clone().try_into().unwrap(), values.clone(), Some(Bitmap::from([true])), ); @@ -52,7 +52,7 @@ fn shared_offsets() { let offsets: Buffer = vec![0, 1].into(); let array = BinaryArray::::new( DataType::Binary, - offsets.clone(), + offsets.clone().try_into().unwrap(), b"a".to_vec().into(), Some(Bitmap::from([true])), ); diff --git a/tests/it/array/equal/list.rs b/tests/it/array/equal/list.rs index 
17427946383..67a458017b4 100644 --- a/tests/it/array/equal/list.rs +++ b/tests/it/array/equal/list.rs @@ -1,6 +1,5 @@ use arrow2::array::{Int32Array, ListArray, MutableListArray, MutablePrimitiveArray, TryExtend}; use arrow2::bitmap::Bitmap; -use arrow2::buffer::Buffer; use arrow2::datatypes::DataType; use super::test_equal; @@ -67,7 +66,7 @@ fn test_list_offsets() { #[test] fn test_bla() { - let offsets = Buffer::from(vec![0, 3, 3, 6]); + let offsets = vec![0, 3, 3, 6].try_into().unwrap(); let data_type = ListArray::::default_datatype(DataType::Int32); let values = Box::new(Int32Array::from([ Some(1), @@ -81,7 +80,7 @@ fn test_bla() { let lhs = ListArray::::from_data(data_type, offsets, values, Some(validity)); let lhs = lhs.slice(1, 2); - let offsets = Buffer::from(vec![0, 0, 3]); + let offsets = vec![0, 0, 3].try_into().unwrap(); let data_type = ListArray::::default_datatype(DataType::Int32); let values = Box::new(Int32Array::from([Some(4), None, Some(6)])); let validity = Bitmap::from([false, true]); diff --git a/tests/it/array/list/mod.rs b/tests/it/array/list/mod.rs index ef07860edb2..cf8aec30f9f 100644 --- a/tests/it/array/list/mod.rs +++ b/tests/it/array/list/mod.rs @@ -12,7 +12,7 @@ fn debug() { let data_type = ListArray::::default_datatype(DataType::Int32); let array = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 2, 3, 5]), + vec![0, 2, 2, 3, 5].try_into().unwrap(), Box::new(values), None, ); @@ -29,7 +29,7 @@ fn test_nested_panic() { let data_type = ListArray::::default_datatype(DataType::Int32); let array = ListArray::::from_data( data_type.clone(), - Buffer::from(vec![0, 2, 2, 3, 5]), + vec![0, 2, 2, 3, 5].try_into().unwrap(), Box::new(values), None, ); @@ -38,7 +38,7 @@ fn test_nested_panic() { // the nested structure of the child data let _ = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 4]), + vec![0, 2, 4].try_into().unwrap(), Box::new(array), None, ); @@ -52,7 +52,7 @@ fn test_nested_display() { let data_type = 
ListArray::::default_datatype(DataType::Int32); let array = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 4, 7, 7, 8, 10]), + vec![0, 2, 4, 7, 7, 8, 10].try_into().unwrap(), Box::new(values), None, ); @@ -60,7 +60,7 @@ fn test_nested_display() { let data_type = ListArray::::default_datatype(array.data_type().clone()); let nested = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 5, 6]), + vec![0, 2, 5, 6].try_into().unwrap(), Box::new(array), None, ); diff --git a/tests/it/array/list/mutable.rs b/tests/it/array/list/mutable.rs index 63cf3f8c1e5..4d7483fe31a 100644 --- a/tests/it/array/list/mutable.rs +++ b/tests/it/array/list/mutable.rs @@ -21,7 +21,7 @@ fn basics() { let data_type = ListArray::::default_datatype(DataType::Int32); let expected = ListArray::::from_data( data_type, - Buffer::from(vec![0, 3, 3, 6]), + vec![0, 3, 3, 6].try_into().unwrap(), Box::new(values), Some(Bitmap::from([true, false, true])), ); @@ -32,7 +32,7 @@ fn basics() { fn with_capacity() { let array = MutableListArray::>::with_capacity(10); assert!(array.offsets().capacity() >= 10); - assert_eq!(array.offsets().len(), 1); + assert_eq!(array.offsets().len(), 0); assert_eq!(array.values().values().capacity(), 0); assert_eq!(array.validity(), None); } @@ -45,7 +45,7 @@ fn push() { .unwrap(); assert_eq!(array.len(), 1); assert_eq!(array.values().values().as_ref(), [1, 2, 3]); - assert_eq!(array.offsets().as_ref(), [0, 3]); + assert_eq!(array.offsets().as_slice(), [0, 3]); assert_eq!(array.validity(), None); } diff --git a/tests/it/array/map/mod.rs b/tests/it/array/map/mod.rs index 38fde84367e..f58936dc3f6 100644 --- a/tests/it/array/map/mod.rs +++ b/tests/it/array/map/mod.rs @@ -20,7 +20,12 @@ fn basics() { None, ); - let array = MapArray::new(data_type, vec![0, 1, 2].into(), Box::new(field), None); + let array = MapArray::new( + data_type, + vec![0, 1, 2].try_into().unwrap(), + Box::new(field), + None, + ); assert_eq!( array.value(0), diff --git 
a/tests/it/array/utf8/mod.rs b/tests/it/array/utf8/mod.rs index 8c9b41e8b92..e60a9e28b83 100644 --- a/tests/it/array/utf8/mod.rs +++ b/tests/it/array/utf8/mod.rs @@ -1,4 +1,7 @@ -use arrow2::{array::*, bitmap::Bitmap, buffer::Buffer, datatypes::DataType, error::Result}; +use arrow2::{ + array::*, bitmap::Bitmap, buffer::Buffer, datatypes::DataType, error::Result, + offset::OffsetsBuffer, +}; mod mutable; mod mutable_values; @@ -60,8 +63,8 @@ fn from() { fn from_slice() { let b = Utf8Array::::from_slice(["a", "b", "cc"]); - let offsets = Buffer::from(vec![0, 1, 2, 4]); - let values = Buffer::from(b"abcc".to_vec()); + let offsets = vec![0, 1, 2, 4].try_into().unwrap(); + let values = b"abcc".to_vec().into(); assert_eq!( b, Utf8Array::::from_data(DataType::Utf8, offsets, values, None) @@ -72,8 +75,8 @@ fn from_slice() { fn from_iter_values() { let b = Utf8Array::::from_iter_values(["a", "b", "cc"].iter()); - let offsets = Buffer::from(vec![0, 1, 2, 4]); - let values = Buffer::from(b"abcc".to_vec()); + let offsets = vec![0, 1, 2, 4].try_into().unwrap(); + let values = b"abcc".to_vec().into(); assert_eq!( b, Utf8Array::::from_data(DataType::Utf8, offsets, values, None) @@ -85,8 +88,8 @@ fn from_trusted_len_iter() { let b = Utf8Array::::from_trusted_len_iter(vec![Some("a"), Some("b"), Some("cc")].into_iter()); - let offsets = Buffer::from(vec![0, 1, 2, 4]); - let values = Buffer::from(b"abcc".to_vec()); + let offsets = vec![0, 1, 2, 4].try_into().unwrap(); + let values = b"abcc".to_vec().into(); assert_eq!( b, Utf8Array::::from_data(DataType::Utf8, offsets, values, None) @@ -102,8 +105,8 @@ fn try_from_trusted_len_iter() { ) .unwrap(); - let offsets = Buffer::from(vec![0, 1, 2, 4]); - let values = Buffer::from(b"abcc".to_vec()); + let offsets = vec![0, 1, 2, 4].try_into().unwrap(); + let values = b"abcc".to_vec().into(); assert_eq!( b, Utf8Array::::from_data(DataType::Utf8, offsets, values, None) @@ -112,59 +115,38 @@ fn try_from_trusted_len_iter() { #[test] fn 
not_utf8() { - let offsets = Buffer::from(vec![0, 4]); - let values = Buffer::from(vec![0, 159, 146, 150]); // invalid utf8 + let offsets = vec![0, 4].try_into().unwrap(); + let values = vec![0, 159, 146, 150].into(); // invalid utf8 assert!(Utf8Array::::try_new(DataType::Utf8, offsets, values, None).is_err()); } #[test] fn not_utf8_individually() { - let offsets = Buffer::from(vec![0, 1, 2]); - let values = Buffer::from(vec![207, 128]); // each is invalid utf8, but together is valid - assert!(Utf8Array::::try_new(DataType::Utf8, offsets, values, None).is_err()); -} - -#[test] -fn wrong_offsets() { - let offsets = Buffer::from(vec![0, 5, 4]); // invalid offsets - let values = Buffer::from(b"abbbbb".to_vec()); + let offsets = vec![0, 1, 2].try_into().unwrap(); + let values = vec![207, 128].into(); // each is invalid utf8, but together is valid assert!(Utf8Array::::try_new(DataType::Utf8, offsets, values, None).is_err()); } #[test] fn wrong_data_type() { - let offsets = Buffer::from(vec![0, 4]); - let values = Buffer::from(b"abbb".to_vec()); + let offsets = vec![0, 4].try_into().unwrap(); + let values = b"abbb".to_vec().into(); assert!(Utf8Array::::try_new(DataType::Int32, offsets, values, None).is_err()); } #[test] fn out_of_bounds_offsets_panics() { // the 10 is out of bounds - let offsets = Buffer::from(vec![0, 10, 11]); - let values = Buffer::from(b"abbb".to_vec()); - assert!(Utf8Array::::try_new(DataType::Utf8, offsets, values, None).is_err()); -} - -#[test] -fn decreasing_offset_and_ascii_panics() { - let offsets = Buffer::from(vec![0, 2, 1]); - let values = Buffer::from(b"abbb".to_vec()); - assert!(Utf8Array::::try_new(DataType::Utf8, offsets, values, None).is_err()); -} - -#[test] -fn decreasing_offset_and_utf8_panics() { - let offsets = Buffer::from(vec![0, 2, 4, 2]); // not increasing - let values = Buffer::from(vec![207, 128, 207, 128, 207, 128]); // valid utf8 + let offsets = vec![0, 10, 11].try_into().unwrap(); + let values = b"abbb".to_vec().into(); 
assert!(Utf8Array::::try_new(DataType::Utf8, offsets, values, None).is_err()); } #[test] #[should_panic] fn index_out_of_bounds_panics() { - let offsets = Buffer::from(vec![0, 1, 2, 4]); - let values = Buffer::from(b"abbb".to_vec()); + let offsets = vec![0, 1, 2, 4].try_into().unwrap(); + let values = b"abbb".to_vec().into(); let array = Utf8Array::::from_data(DataType::Utf8, offsets, values, None); array.value(3); @@ -179,7 +161,7 @@ fn debug() { #[test] fn into_mut_1() { - let offsets = Buffer::from(vec![0, 1]); + let offsets = vec![0, 1].try_into().unwrap(); let values = Buffer::from(b"a".to_vec()); let a = values.clone(); // cloned values assert_eq!(a, values); @@ -189,8 +171,8 @@ fn into_mut_1() { #[test] fn into_mut_2() { - let offsets = Buffer::from(vec![0, 1]); - let values = Buffer::from(b"a".to_vec()); + let offsets: OffsetsBuffer = vec![0, 1].try_into().unwrap(); + let values = b"a".to_vec().into(); let a = offsets.clone(); // cloned offsets assert_eq!(a, offsets); let array = Utf8Array::::from_data(DataType::Utf8, offsets, values, None); @@ -199,8 +181,8 @@ fn into_mut_2() { #[test] fn into_mut_3() { - let offsets = Buffer::from(vec![0, 1]); - let values = Buffer::from(b"a".to_vec()); + let offsets = vec![0, 1].try_into().unwrap(); + let values = b"a".to_vec().into(); let validity = Some([true].into()); let a = validity.clone(); // cloned validity assert_eq!(a, validity); @@ -210,8 +192,8 @@ fn into_mut_3() { #[test] fn into_mut_4() { - let offsets = Buffer::from(vec![0, 1]); - let values = Buffer::from(b"a".to_vec()); + let offsets = vec![0, 1].try_into().unwrap(); + let values = b"a".to_vec().into(); let validity = Some([true].into()); let array = Utf8Array::::new(DataType::Utf8, offsets, values, validity); assert!(array.into_mut().is_right()); diff --git a/tests/it/array/utf8/mutable.rs b/tests/it/array/utf8/mutable.rs index 57c188fb808..faa4868a58f 100644 --- a/tests/it/array/utf8/mutable.rs +++ b/tests/it/array/utf8/mutable.rs @@ -7,7 +7,7 @@ fn 
capacities() { let b = MutableUtf8Array::::with_capacities(1, 10); assert!(b.values().capacity() >= 10); - assert!(b.offsets().capacity() >= 2); + assert!(b.offsets().capacity() >= 1); } #[test] @@ -69,24 +69,15 @@ fn pop_all_some() { #[test] #[should_panic] fn not_utf8() { - let offsets = vec![0, 4]; + let offsets = vec![0, 4].try_into().unwrap(); let values = vec![0, 159, 146, 150]; // invalid utf8 MutableUtf8Array::::from_data(DataType::Utf8, offsets, values, None); } -/// Safety guarantee -#[test] -#[should_panic] -fn wrong_offsets() { - let offsets = vec![0, 5, 4]; // invalid offsets - let values = vec![0, 1, 2, 3, 4, 5]; - MutableUtf8Array::::from_data(DataType::Utf8, offsets, values, None); -} - #[test] #[should_panic] fn wrong_data_type() { - let offsets = vec![0, 4]; // invalid offsets + let offsets = vec![0, 4].try_into().unwrap(); let values = vec![1, 2, 3, 4]; MutableUtf8Array::::from_data(DataType::Int8, offsets, values, None); } diff --git a/tests/it/array/utf8/mutable_values.rs b/tests/it/array/utf8/mutable_values.rs index 1ad783b607d..6bf04726a36 100644 --- a/tests/it/array/utf8/mutable_values.rs +++ b/tests/it/array/utf8/mutable_values.rs @@ -7,35 +7,28 @@ fn capacity() { let mut b = MutableUtf8ValuesArray::::with_capacity(100); assert_eq!(b.values().capacity(), 0); - assert!(b.offsets().capacity() >= 101); + assert!(b.offsets().capacity() >= 100); b.shrink_to_fit(); - assert!(b.offsets().capacity() < 101); -} - -#[test] -fn offsets_must_be_monotonic_increasing() { - let offsets = vec![0, 5, 4]; - let values = b"abbbbb".to_vec(); - assert!(MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).is_err()); + assert!(b.offsets().capacity() < 100); } #[test] fn offsets_must_be_in_bounds() { - let offsets = vec![0, 10]; + let offsets = vec![0, 10].try_into().unwrap(); let values = b"abbbbb".to_vec(); assert!(MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).is_err()); } #[test] fn data_type_must_be_consistent() { - let 
offsets = vec![0, 4]; + let offsets = vec![0, 4].try_into().unwrap(); let values = b"abbb".to_vec(); assert!(MutableUtf8ValuesArray::::try_new(DataType::Int32, offsets, values).is_err()); } #[test] fn must_be_utf8() { - let offsets = vec![0, 4]; + let offsets = vec![0, 4].try_into().unwrap(); let values = vec![0, 159, 146, 150]; assert!(std::str::from_utf8(&values).is_err()); assert!(MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).is_err()); @@ -43,7 +36,7 @@ fn must_be_utf8() { #[test] fn as_box() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).unwrap(); let _ = b.as_box(); @@ -51,7 +44,7 @@ fn as_box() { #[test] fn as_arc() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).unwrap(); let _ = b.as_arc(); @@ -59,12 +52,12 @@ fn as_arc() { #[test] fn extend_trusted_len() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).unwrap(); b.extend_trusted_len(vec!["a", "b"].into_iter()); - let offsets = vec![0, 2, 3, 4]; + let offsets = vec![0, 2, 3, 4].try_into().unwrap(); let values = b"abab".to_vec(); assert_eq!( b.as_box(), @@ -78,7 +71,7 @@ fn extend_trusted_len() { fn from_trusted_len() { let mut b = MutableUtf8ValuesArray::::from_trusted_len_iter(vec!["a", "b"].into_iter()); - let offsets = vec![0, 1, 2]; + let offsets = vec![0, 1, 2].try_into().unwrap(); let values = b"ab".to_vec(); assert_eq!( b.as_box(), @@ -90,7 +83,7 @@ fn from_trusted_len() { #[test] fn extend_from_iter() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableUtf8ValuesArray::::try_new(DataType::Utf8, 
offsets, values).unwrap(); b.extend_trusted_len(vec!["a", "b"].into_iter()); @@ -98,7 +91,7 @@ fn extend_from_iter() { let a = b.clone(); b.extend_trusted_len(a.iter()); - let offsets = vec![0, 2, 3, 4, 6, 7, 8]; + let offsets = vec![0, 2, 3, 4, 6, 7, 8].try_into().unwrap(); let values = b"abababab".to_vec(); assert_eq!( b.as_box(), diff --git a/tests/it/array/utf8/to_mutable.rs b/tests/it/array/utf8/to_mutable.rs index c4c822b62d8..97ee0fb2055 100644 --- a/tests/it/array/utf8/to_mutable.rs +++ b/tests/it/array/utf8/to_mutable.rs @@ -1,4 +1,6 @@ -use arrow2::{array::Utf8Array, bitmap::Bitmap, buffer::Buffer, datatypes::DataType}; +use arrow2::{ + array::Utf8Array, bitmap::Bitmap, buffer::Buffer, datatypes::DataType, offset::OffsetsBuffer, +}; #[test] fn not_shared() { @@ -12,7 +14,7 @@ fn shared_validity() { let validity = Bitmap::from([true]); let array = Utf8Array::::new( DataType::Utf8, - vec![0, 1].into(), + vec![0, 1].try_into().unwrap(), b"a".to_vec().into(), Some(validity.clone()), ); @@ -25,7 +27,7 @@ fn shared_values() { let values: Buffer = b"a".to_vec().into(); let array = Utf8Array::::new( DataType::Utf8, - vec![0, 1].into(), + vec![0, 1].try_into().unwrap(), values.clone(), Some(Bitmap::from([true])), ); @@ -35,7 +37,7 @@ fn shared_values() { #[test] #[allow(clippy::redundant_clone)] fn shared_offsets_values() { - let offsets: Buffer = vec![0, 1].into(); + let offsets: OffsetsBuffer = vec![0, 1].try_into().unwrap(); let values: Buffer = b"a".to_vec().into(); let array = Utf8Array::::new( DataType::Utf8, @@ -49,7 +51,7 @@ fn shared_offsets_values() { #[test] #[allow(clippy::redundant_clone)] fn shared_offsets() { - let offsets: Buffer = vec![0, 1].into(); + let offsets: OffsetsBuffer = vec![0, 1].try_into().unwrap(); let array = Utf8Array::::new( DataType::Utf8, offsets.clone(), diff --git a/tests/it/compute/take.rs b/tests/it/compute/take.rs index 0e1719fb4eb..75b55d76f53 100644 --- a/tests/it/compute/take.rs +++ b/tests/it/compute/take.rs @@ -176,7 
+176,7 @@ fn list_with_no_none() { let data_type = ListArray::::default_datatype(DataType::Int32); let array = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 2, 6, 9, 10]), + vec![0, 2, 2, 6, 9, 10].try_into().unwrap(), Box::new(values), None, ); @@ -189,7 +189,7 @@ fn list_with_no_none() { let expected_type = ListArray::::default_datatype(DataType::Int32); let expected = ListArray::::from_data( expected_type, - Buffer::from(vec![0, 1, 1, 4]), + vec![0, 1, 1, 4].try_into().unwrap(), Box::new(expected_values), None, ); @@ -208,7 +208,7 @@ fn list_with_none() { let data_type = ListArray::::default_datatype(DataType::Int32); let array = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 2, 6, 9, 10]), + vec![0, 2, 2, 6, 9, 10].try_into().unwrap(), Box::new(values), Some(validity), ); @@ -267,7 +267,7 @@ fn test_nested() { let data_type = ListArray::::default_datatype(DataType::Int32); let array = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 4, 7, 7, 8, 10]), + vec![0, 2, 4, 7, 7, 8, 10].try_into().unwrap(), Box::new(values), None, ); @@ -275,7 +275,7 @@ fn test_nested() { let data_type = ListArray::::default_datatype(array.data_type().clone()); let nested = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 5, 6]), + vec![0, 2, 5, 6].try_into().unwrap(), Box::new(array), None, ); @@ -290,7 +290,7 @@ fn test_nested() { let expected_data_type = ListArray::::default_datatype(DataType::Int32); let expected_array = ListArray::::from_data( expected_data_type, - Buffer::from(vec![0, 2, 4, 7, 7, 8]), + vec![0, 2, 4, 7, 7, 8].try_into().unwrap(), Box::new(expected_values), None, ); @@ -298,7 +298,7 @@ fn test_nested() { let expected_data_type = ListArray::::default_datatype(expected_array.data_type().clone()); let expected = ListArray::::from_data( expected_data_type, - Buffer::from(vec![0, 2, 5]), + vec![0, 2, 5].try_into().unwrap(), Box::new(expected_array), None, ); diff --git a/tests/it/ffi/data.rs b/tests/it/ffi/data.rs 
index bc87eda68c8..165c7107926 100644 --- a/tests/it/ffi/data.rs +++ b/tests/it/ffi/data.rs @@ -115,7 +115,7 @@ fn utf8_sliced() -> Result<()> { let bitmap = Bitmap::from([true, false, false, true]).slice(1, 3); let data = Utf8Array::::try_new( DataType::Utf8, - vec![0, 1, 1, 2].into(), + vec![0, 1, 1, 2].try_into().unwrap(), b"ab".to_vec().into(), Some(bitmap), )?; @@ -146,7 +146,7 @@ fn binary_sliced() -> Result<()> { let bitmap = Bitmap::from([true, false, false, true]).slice(1, 3); let data = BinaryArray::::try_new( DataType::Binary, - vec![0, 1, 1, 2].into(), + vec![0, 1, 1, 2].try_into().unwrap(), b"ab".to_vec().into(), Some(bitmap), )?; @@ -213,7 +213,7 @@ fn list_sliced() -> Result<()> { let array = ListArray::::try_new( DataType::List(Box::new(Field::new("a", DataType::Int32, true))), - vec![0, 1, 1, 2].into(), + vec![0, 1, 1, 2].try_into().unwrap(), Box::new(PrimitiveArray::::from_vec(vec![1, 2])), Some(bitmap), )?; diff --git a/tests/it/io/avro/write.rs b/tests/it/io/avro/write.rs index c47b1b9782e..7cff7740fbb 100644 --- a/tests/it/io/avro/write.rs +++ b/tests/it/io/avro/write.rs @@ -86,7 +86,7 @@ pub(super) fn data() -> Chunk> { ])), Box::new(ListArray::::new( list_dt, - vec![0, 2, 5].into(), + vec![0, 2, 5].try_into().unwrap(), Box::new(PrimitiveArray::::from([ None, Some(1), @@ -98,7 +98,7 @@ pub(super) fn data() -> Chunk> { )), Box::new(ListArray::::new( list_dt1, - vec![0, 2, 2].into(), + vec![0, 2, 2].try_into().unwrap(), Box::new(PrimitiveArray::::from([None, Some(1)])), Some([true, false].into()), )), diff --git a/tests/it/io/json/write.rs b/tests/it/io/json/write.rs index 8555f0081c6..3b0695cf631 100644 --- a/tests/it/io/json/write.rs +++ b/tests/it/io/json/write.rs @@ -321,7 +321,7 @@ fn list_of_struct() -> Result<()> { // [{"c11": 5, "c12": {"c121": "g"}}] let c1 = ListArray::::from_data( c1_datatype, - Buffer::from(vec![0, 2, 2, 3]), + Buffer::from(vec![0, 2, 2, 3]).try_into().unwrap(), Box::new(s), Some(Bitmap::from_u8_slice([0b00000101], 
3)), ); diff --git a/tests/it/io/ndjson/mod.rs b/tests/it/io/ndjson/mod.rs index 632d3015383..b2e2b2fc895 100644 --- a/tests/it/io/ndjson/mod.rs +++ b/tests/it/io/ndjson/mod.rs @@ -2,7 +2,6 @@ mod read; use arrow2::array::*; use arrow2::bitmap::Bitmap; -use arrow2::buffer::Buffer; use arrow2::datatypes::*; use arrow2::error::Result; use arrow2::io::ndjson::write as ndjson_write; @@ -286,7 +285,7 @@ fn case_nested_list() -> (String, Box) { ); let expected = ListArray::from_data( a_list_data_type, - Buffer::from(vec![0i32, 2, 3, 6, 6, 6]), + vec![0i32, 2, 3, 6, 6, 6].try_into().unwrap(), a_struct.boxed(), Some(Bitmap::from_u8_slice([0b00010111], 5)), ); diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index 06afa74fc95..4790479cc2f 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -3,7 +3,6 @@ use std::io::{Cursor, Read, Seek}; use arrow2::{ array::*, bitmap::Bitmap, - buffer::Buffer, chunk::Chunk, datatypes::*, error::Result, @@ -75,7 +74,7 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { // [["a", "b", None, "c"]] let a = ListArray::::new( DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), - vec![0, 4].into(), + vec![0, 4].try_into().unwrap(), Utf8Array::::from([Some("a"), Some("b"), None, Some("c")]).boxed(), None, ); @@ -91,7 +90,7 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { } pub fn pyarrow_nested_nullable(column: &str) -> Box { - let offsets = Buffer::from(vec![0, 2, 2, 5, 8, 8, 11, 11, 12]); + let offsets = vec![0, 2, 2, 5, 8, 8, 11, 11, 12].try_into().unwrap(); let values = match column { "list_int64" => { @@ -582,7 +581,7 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { array.data_type().clone(), nullable, ))), - vec![0, array.len() as i32].into(), + vec![0, array.len() as i32].try_into().unwrap(), array, None, ) @@ -685,7 +684,7 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { array.data_type().clone(), true, ))), - vec![0, array.len() 
as i32].into(), + vec![0, array.len() as i32].try_into().unwrap(), array, None, ) @@ -990,7 +989,7 @@ pub fn pyarrow_map(column: &str) -> Box { ]); MapArray::try_new( DataType::Map(Box::new(Field::new("entries", dt.clone(), false)), false), - vec![0, 2].into(), + vec![0, 2].try_into().unwrap(), StructArray::try_new( dt, vec![ @@ -1015,7 +1014,7 @@ pub fn pyarrow_map(column: &str) -> Box { ]); MapArray::try_new( DataType::Map(Box::new(Field::new("entries", dt.clone(), false)), false), - vec![0, 2].into(), + vec![0, 2].try_into().unwrap(), StructArray::try_new( dt, vec![ @@ -1047,7 +1046,7 @@ pub fn pyarrow_map_statistics(column: &str) -> Statistics { Box::new(Field::new("items", DataType::Struct(fields.clone()), false)), false, ), - vec![0, arrays[0].len() as i32].into(), + vec![0, arrays[0].len() as i32].try_into().unwrap(), StructArray::new(DataType::Struct(fields), arrays, None).boxed(), None, ) @@ -1511,7 +1510,7 @@ fn nested_dict_data(data_type: DataType) -> Result<(Schema, Chunk values.data_type().clone(), false, ))), - vec![0i32, 0, 0, 2, 3].into(), + vec![0i32, 0, 0, 2, 3].try_into().unwrap(), values.boxed(), Some([true, false, true, true].into()), )?; From 68f0aff0286a7fabdbba96cb08ea35e19f2d5198 Mon Sep 17 00:00:00 2001 From: "Jorge C. 
Leitao" Date: Tue, 6 Dec 2022 07:29:53 +0000 Subject: [PATCH 3/5] More improvements --- src/array/binary/mod.rs | 5 +- src/array/binary/mutable_values.rs | 5 +- src/array/growable/utf8.rs | 3 +- src/array/list/mod.rs | 24 +++-- src/array/map/mod.rs | 21 ++--- src/array/specification.rs | 136 +++++++++++++---------------- src/array/utf8/mod.rs | 9 +- src/array/utf8/mutable_values.rs | 9 +- src/offset.rs | 43 +++++++++ 9 files changed, 135 insertions(+), 120 deletions(-) diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index b7f72cdbb20..a11469b220d 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -78,7 +78,7 @@ impl BinaryArray { values: Buffer, validity: Option, ) -> Result { - try_check_offsets_bounds(offsets.buffer(), values.len())?; + try_check_offsets_bounds(&offsets, values.len())?; if validity .as_ref() @@ -145,8 +145,7 @@ impl BinaryArray { #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { // soundness: the invariant of the function - let start = self.offsets.buffer().get_unchecked(i).to_usize(); - let end = self.offsets.buffer().get_unchecked(i + 1).to_usize(); + let (start, end) = self.offsets.start_end_unchecked(i); // soundness: the invariant of the struct self.values.get_unchecked(start..end) diff --git a/src/array/binary/mutable_values.rs b/src/array/binary/mutable_values.rs index 633c34d07ef..e52516877dd 100644 --- a/src/array/binary/mutable_values.rs +++ b/src/array/binary/mutable_values.rs @@ -66,7 +66,7 @@ impl MutableBinaryValuesArray { /// # Implementation /// This function is `O(1)` pub fn try_new(data_type: DataType, offsets: Offsets, values: Vec) -> Result { - try_check_offsets_bounds(offsets.as_slice(), values.len())?; + try_check_offsets_bounds(&offsets, values.len())?; if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { return Err(Error::oos( @@ -166,8 +166,7 @@ impl MutableBinaryValuesArray { #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] 
{ // soundness: the invariant of the function - let start = self.offsets.as_slice().get_unchecked(i).to_usize(); - let end = self.offsets.as_slice().get_unchecked(i + 1).to_usize(); + let (start, end) = self.offsets.start_end(i); // soundness: the invariant of the struct self.values.get_unchecked(start..end) diff --git a/src/array/growable/utf8.rs b/src/array/growable/utf8.rs index 0aee209378c..5e901577901 100644 --- a/src/array/growable/utf8.rs +++ b/src/array/growable/utf8.rs @@ -52,8 +52,7 @@ impl<'a, O: Offset> GrowableUtf8<'a, O> { #[cfg(debug_assertions)] { - crate::array::specification::try_check_offsets_and_utf8(offsets.as_slice(), &values) - .unwrap(); + crate::array::specification::try_check_utf8(&offsets, &values).unwrap(); } unsafe { diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index 9a4c82e2a02..7c7e96c9c04 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -41,7 +41,7 @@ impl ListArray { values: Box, validity: Option, ) -> Result { - try_check_offsets_bounds(offsets.buffer(), values.len())?; + try_check_offsets_bounds(&offsets, values.len())?; if validity .as_ref() @@ -185,16 +185,13 @@ impl ListArray { } /// Returns the element at index `i` + /// # Panic + /// Panics iff `i >= self.len()` #[inline] pub fn value(&self, i: usize) -> Box { - let offset = self.offsets.buffer()[i]; - let offset_1 = self.offsets.buffer()[i + 1]; - let length = (offset_1 - offset).to_usize(); - - // Safety: - // One of the invariants of the struct - // is that offsets are in bounds - unsafe { self.values.slice_unchecked(offset.to_usize(), length) } + assert!(i < self.len()); + // Safety: invariant of this function + unsafe { self.value_unchecked(i) } } /// Returns the element at index `i` as &str @@ -202,11 +199,12 @@ impl ListArray { /// Assumes that the `i < self.len`. 
#[inline] pub unsafe fn value_unchecked(&self, i: usize) -> Box { - let offset = *self.offsets.buffer().get_unchecked(i); - let offset_1 = *self.offsets.buffer().get_unchecked(i + 1); - let length = (offset_1 - offset).to_usize(); + // safety: the invariant of the function + let (start, end) = self.offsets.start_end_unchecked(i); + let length = end - start; - self.values.slice_unchecked(offset.to_usize(), length) + // safety: the invariant of the struct + self.values.slice_unchecked(start, length) } /// The optional validity. diff --git a/src/array/map/mod.rs b/src/array/map/mod.rs index d087a99ac3f..fe8adb1deab 100644 --- a/src/array/map/mod.rs +++ b/src/array/map/mod.rs @@ -37,7 +37,7 @@ impl MapArray { field: Box, validity: Option, ) -> Result { - try_check_offsets_bounds(offsets.buffer(), field.len())?; + try_check_offsets_bounds(&offsets, field.len())?; let inner_field = Self::try_get_field(&data_type)?; if let DataType::Struct(inner) = inner_field.data_type() { @@ -213,14 +213,8 @@ impl MapArray { /// Returns the element at index `i`. #[inline] pub fn value(&self, i: usize) -> Box { - let offset = self.offsets.buffer()[i]; - let offset_1 = self.offsets.buffer()[i + 1]; - let length = (offset_1 - offset) as usize; - - // Safety: - // One of the invariants of the struct - // is that offsets are in bounds - unsafe { self.field.slice_unchecked(offset as usize, length) } + assert!(i < self.len()); + unsafe { self.value_unchecked(i) } } /// Returns the element at index `i`. @@ -228,11 +222,12 @@ impl MapArray { /// Assumes that the `i < self.len`. 
#[inline] pub unsafe fn value_unchecked(&self, i: usize) -> Box { - let offset = *self.offsets.buffer().get_unchecked(i); - let offset_1 = *self.offsets.buffer().get_unchecked(i + 1); - let length = (offset_1 - offset) as usize; + // soundness: the invariant of the function + let (start, end) = self.offsets.start_end_unchecked(i); + let length = end - start; - self.field.slice_unchecked(offset as usize, length) + // soundness: the invariant of the struct + self.field.slice_unchecked(start, length) } } diff --git a/src/array/specification.rs b/src/array/specification.rs index 521459a4b76..3a53f9c1d54 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -1,96 +1,80 @@ use crate::error::{Error, Result}; -use crate::types::Offset; +use crate::offset::{Offset, Offsets, OffsetsBuffer}; -pub fn try_check_offsets_bounds(offsets: &[O], values_len: usize) -> Result { - if let Some(last_offset) = offsets.last() { - if last_offset.to_usize() > values_len { - Err(Error::oos("offsets must not exceed the values length")) - } else { - Ok(last_offset.to_usize()) - } +/// Helper trait to support `Offset` and `OffsetBuffer` +pub(crate) trait OffsetsContainer { + fn last(&self) -> usize; + fn starts(&self) -> &[O]; +} + +impl OffsetsContainer for OffsetsBuffer { + #[inline] + fn last(&self) -> usize { + self.last().to_usize() + } + + #[inline] + fn starts(&self) -> &[O] { + let last = self.buffer().len() - 1; + unsafe { self.buffer().get_unchecked(0..last) } + } +} + +impl OffsetsContainer for Offsets { + #[inline] + fn last(&self) -> usize { + self.last().to_usize() + } + + #[inline] + fn starts(&self) -> &[O] { + let offsets = self.as_slice(); + let last = offsets.len() - 1; + unsafe { offsets.get_unchecked(0..last) } + } +} + +pub(crate) fn try_check_offsets_bounds>( + offsets: &C, + values_len: usize, +) -> Result<()> { + if offsets.last() > values_len { + Err(Error::oos("offsets must not exceed the values length")) } else { - Err(Error::oos("offsets must 
have at least one element")) + Ok(()) } } -/// # Panics iff: -/// * the `offsets` is not monotonically increasing, or -/// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or +/// # Error /// * any offset is larger or equal to `values_len`. -pub fn try_check_offsets_and_utf8(offsets: &[O], values: &[u8]) -> Result<()> { +/// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or +pub(crate) fn try_check_utf8>( + offsets: &C, + values: &[u8], +) -> Result<()> { + try_check_offsets_bounds(offsets, values.len())?; + if values.is_ascii() { - try_check_offsets(offsets, values.len()) + Ok(()) } else { simdutf8::basic::from_utf8(values)?; - for window in offsets.windows(2) { - let start = window[0].to_usize(); - let end = window[1].to_usize(); + for start in offsets.starts() { + let start = start.to_usize(); - // check monotonicity - if start > end { - return Err(Error::oos("offsets must be monotonically increasing")); - } - - let first = values.get(start); + // Safety: `try_check_offsets_bounds` just checked for bounds + let b = *unsafe { values.get_unchecked(start) }; - if let Some(&b) = first { - // A valid code-point iff it does not start with 0b10xxxxxx - // Bit-magic taken from `std::str::is_char_boundary` - if (b as i8) < -0x40 { - return Err(Error::oos("Non-valid char boundary detected")); - } + // A valid code-point iff it does not start with 0b10xxxxxx + // Bit-magic taken from `std::str::is_char_boundary` + if (b as i8) < -0x40 { + return Err(Error::oos("Non-valid char boundary detected")); } } - // check bounds - if offsets - .last() - .map_or(true, |last| last.to_usize() > values.len()) - { - return Err(Error::oos( - "offsets must have at least one element and must not exceed values length", - )); - }; - Ok(()) } } -/// Checks that `offsets` is monotonically increasing, and all offsets are less than or equal to -/// `values_len`. 
-pub fn try_check_offsets(offsets: &[O], values_len: usize) -> Result<()> { - // this code is carefully constructed to auto-vectorize, don't change naively! - match offsets.first() { - None => Err(Error::oos("offsets must have at least one element")), - Some(first) => { - let mut previous = *first; - let mut any_invalid = false; - - // This loop will auto-vectorize because there is not any break, - // an invalid value will be returned once the whole offsets buffer is processed. - for offset in offsets { - if previous > *offset { - any_invalid = true - } - previous = *offset; - } - - if any_invalid { - Err(Error::oos("offsets must be monotonically increasing")) - } else if offsets - .last() - .map_or(true, |last| last.to_usize() > values_len) - { - Err(Error::oos( - "offsets must have at least one element and must not exceed values length", - )) - } else { - Ok(()) - } - } - } -} - pub fn check_indexes(keys: &[K], len: usize) -> Result<()> where K: std::fmt::Debug + Copy + TryInto, @@ -125,12 +109,12 @@ mod tests { fn check_utf8_validation(values in binary_strategy()) { for offset in 0..values.len() - 1 { - let offsets = vec![0, offset as i32, values.len() as i32]; + let offsets = vec![0, offset as i32, values.len() as i32].try_into().unwrap(); let mut is_valid = std::str::from_utf8(&values[..offset]).is_ok(); is_valid &= std::str::from_utf8(&values[offset..]).is_ok(); - assert_eq!(try_check_offsets_and_utf8::(&offsets, &values).is_ok(), is_valid) + assert_eq!(try_check_utf8::>(&offsets, &values).is_ok(), is_valid) } } } diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 330e56cd9df..36ce27b28bf 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -13,7 +13,7 @@ use crate::{ use either::Either; use super::{ - specification::{try_check_offsets_and_utf8, try_check_offsets_bounds}, + specification::{try_check_offsets_bounds, try_check_utf8}, Array, GenericBinaryArray, }; @@ -92,7 +92,7 @@ impl Utf8Array { values: Buffer, validity: Option, ) 
-> Result { - try_check_offsets_and_utf8(offsets.buffer(), &values)?; + try_check_utf8(&offsets, &values)?; if validity .as_ref() .map_or(false, |validity| validity.len() != offsets.len()) @@ -162,8 +162,7 @@ impl Utf8Array { #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &str { // soundness: the invariant of the function - let start = self.offsets.buffer().get_unchecked(i).to_usize(); - let end = self.offsets.buffer().get_unchecked(i + 1).to_usize(); + let (start, end) = self.offsets.start_end_unchecked(i); // soundness: the invariant of the struct let slice = self.values.get_unchecked(start..end); @@ -389,7 +388,7 @@ impl Utf8Array { values: Buffer, validity: Option, ) -> Result { - try_check_offsets_bounds(offsets.buffer(), values.len())?; + try_check_offsets_bounds(&offsets, values.len())?; if validity .as_ref() diff --git a/src/array/utf8/mutable_values.rs b/src/array/utf8/mutable_values.rs index 354633cfff1..47c6b9e3eec 100644 --- a/src/array/utf8/mutable_values.rs +++ b/src/array/utf8/mutable_values.rs @@ -2,7 +2,7 @@ use std::{iter::FromIterator, sync::Arc}; use crate::{ array::{ - specification::{try_check_offsets_and_utf8, try_check_offsets_bounds}, + specification::{try_check_offsets_bounds, try_check_utf8}, Array, ArrayValuesIter, MutableArray, TryExtend, TryExtendFromSelf, TryPush, }, bitmap::MutableBitmap, @@ -81,7 +81,7 @@ impl MutableUtf8ValuesArray { /// # Implementation /// This function is `O(N)` - checking utf8 is `O(N)` pub fn try_new(data_type: DataType, offsets: Offsets, values: Vec) -> Result { - try_check_offsets_and_utf8(offsets.as_slice(), &values)?; + try_check_utf8(&offsets, &values)?; if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { return Err(Error::oos( "MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8", @@ -108,7 +108,7 @@ impl MutableUtf8ValuesArray { /// # Implementation /// This function is `O(1)` pub unsafe fn new_unchecked(data_type: 
DataType, offsets: Offsets, values: Vec) -> Self { - try_check_offsets_bounds(offsets.as_slice(), values.len()) + try_check_offsets_bounds(&offsets, values.len()) .expect("The length of the values must be equal to the last offset value"); if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { @@ -208,8 +208,7 @@ impl MutableUtf8ValuesArray { #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &str { // soundness: the invariant of the function - let start = self.offsets.as_slice().get_unchecked(i).to_usize(); - let end = self.offsets.as_slice().get_unchecked(i + 1).to_usize(); + let (start, end) = self.offsets.start_end(i); // soundness: the invariant of the struct let slice = self.values.get_unchecked(start..end); diff --git a/src/offset.rs b/src/offset.rs index 717214e7d9f..6d0878fa2b5 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -144,6 +144,27 @@ impl Offsets { } } + /// Returns a range (start, end) corresponding to the position `index` + /// # Panic + /// This function panics iff `index >= self.len()` + #[inline] + pub fn start_end(&self, index: usize) -> (usize, usize) { + // soundness: the invariant of the function + assert!(index < self.len()); + unsafe { self.start_end_unchecked(index) } + } + + /// Returns a range (start, end) corresponding to the position `index` + /// # Safety + /// `index` must be `< self.len()` + #[inline] + pub unsafe fn start_end_unchecked(&self, index: usize) -> (usize, usize) { + // soundness: the invariant of the function + let start = self.0.get_unchecked(index).to_usize(); + let end = self.0.get_unchecked(index + 1).to_usize(); + (start, end) + } + /// Returns the length of this container #[inline] pub fn len(&self) -> usize { @@ -302,6 +323,7 @@ fn try_check_offsets(offsets: &[O]) -> Result<(), Error> { /// A wrapper type of [`Buffer`] that is guaranteed to: /// * Always contain an element +/// * Every element is `>= 0` /// * element at position `i` is >= than element at position `i-1`.
#[derive(Clone, PartialEq, Debug)] pub struct OffsetsBuffer(Buffer); @@ -368,6 +390,27 @@ impl OffsetsBuffer { } } + /// Returns a range (start, end) corresponding to the position `index` + /// # Panic + /// This function panics iff `index >= self.len()` + #[inline] + pub fn start_end(&self, index: usize) -> (usize, usize) { + // soundness: the invariant of the function + assert!(index < self.len()); + unsafe { self.start_end_unchecked(index) } + } + + /// Returns a range (start, end) corresponding to the position `index` + /// # Safety + /// `index` must be `< self.len()` + #[inline] + pub unsafe fn start_end_unchecked(&self, index: usize) -> (usize, usize) { + // soundness: the invariant of the function + let start = self.0.get_unchecked(index).to_usize(); + let end = self.0.get_unchecked(index + 1).to_usize(); + (start, end) + } + /// Returns a new [`OffsetsBuffer`] that is a slice of this buffer starting at `offset`. /// Doing so allows the same memory region to be shared between buffers. /// # Safety From 3c9c08135cbf1b5d72405b2e6587f6774485b83e Mon Sep 17 00:00:00 2001 From: "Jorge C. 
Leitao" Date: Wed, 7 Dec 2022 22:18:13 +0000 Subject: [PATCH 4/5] Optimized utf8 checking --- src/array/specification.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/array/specification.rs b/src/array/specification.rs index 3a53f9c1d54..966df9f49f2 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -59,7 +59,18 @@ pub(crate) fn try_check_utf8>( } else { simdutf8::basic::from_utf8(values)?; - for start in offsets.starts() { + // offsets can be == values.len() + // let's find first offset from the end that is different + let starts = offsets.starts(); + let last = starts + .iter() + .rev() + .enumerate() + .find_map(|(i, offset)| (offset.to_usize() != values.len()).then(|| i + 1)) + .unwrap_or(starts.len() - 1); + + let mut any_invalid = false; + for start in &starts[..=last] { let start = start.to_usize(); // Safety: `try_check_offsets_bounds` just checked for bounds @@ -68,9 +79,12 @@ pub(crate) fn try_check_utf8>( // A valid code-point iff it does not start with 0b10xxxxxx // Bit-magic taken from `std::str::is_char_boundary` if (b as i8) < -0x40 { - return Err(Error::oos("Non-valid char boundary detected")); + any_invalid = true } } + if any_invalid { + return Err(Error::oos("Non-valid char boundary detected")); + } Ok(()) } } From c150c7c3cb9b25e5db12d79076647275e8561f52 Mon Sep 17 00:00:00 2001 From: "Jorge C. 
Leitao" Date: Wed, 7 Dec 2022 22:46:40 +0000 Subject: [PATCH 5/5] Improved perf of take --- src/array/binary/mod.rs | 2 +- src/array/specification.rs | 50 ++++++++++++++++++++---------- src/array/utf8/mod.rs | 2 +- src/compute/take/generic_binary.rs | 33 +++++++------------- src/offset.rs | 6 ++++ tests/it/io/ipc/read/file.rs | 6 ++++ 6 files changed, 59 insertions(+), 40 deletions(-) diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index a11469b220d..1a5abdcc330 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -40,7 +40,7 @@ pub use mutable::*; /// assert_eq!(array.values_iter().collect::>(), vec![[1, 2].as_ref(), &[], &[3]]); /// // the underlying representation: /// assert_eq!(array.values(), &Buffer::from(vec![1, 2, 3])); -/// assert_eq!(array.offsets(), &Buffer::from(vec![0, 2, 2, 3])); +/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 3])); /// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true]))); /// ``` /// diff --git a/src/array/specification.rs b/src/array/specification.rs index 966df9f49f2..021cbd5c80c 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -4,7 +4,7 @@ use crate::offset::{Offset, Offsets, OffsetsBuffer}; /// Helper trait to support `Offset` and `OffsetBuffer` pub(crate) trait OffsetsContainer { fn last(&self) -> usize; - fn starts(&self) -> &[O]; + fn as_slice(&self) -> &[O]; } impl OffsetsContainer for OffsetsBuffer { @@ -14,9 +14,8 @@ impl OffsetsContainer for OffsetsBuffer { } #[inline] - fn starts(&self) -> &[O] { - let last = self.buffer().len() - 1; - unsafe { self.buffer().get_unchecked(0..last) } + fn as_slice(&self) -> &[O] { + self.buffer() } } @@ -27,10 +26,8 @@ impl OffsetsContainer for Offsets { } #[inline] - fn starts(&self) -> &[O] { - let offsets = self.as_slice(); - let last = offsets.len() - 1; - unsafe { offsets.get_unchecked(0..last) } + fn as_slice(&self) -> &[O] { + self.as_slice() } } @@ -52,6 +49,10 @@ pub(crate) fn 
try_check_utf8>( offsets: &C, values: &[u8], ) -> Result<()> { + if offsets.as_slice().len() == 1 { + return Ok(()); + } + try_check_offsets_bounds(offsets, values.len())?; if values.is_ascii() { @@ -60,17 +61,34 @@ pub(crate) fn try_check_utf8>( simdutf8::basic::from_utf8(values)?; // offsets can be == values.len() - // let's find first offset from the end that is different - let starts = offsets.starts(); - let last = starts + // find first offset from the end that is smaller + // Example: + // values.len() = 10 + // offsets = [0, 5, 10, 10] + let offsets = offsets.as_slice(); + let last = offsets .iter() - .rev() .enumerate() - .find_map(|(i, offset)| (offset.to_usize() != values.len()).then(|| i + 1)) - .unwrap_or(starts.len() - 1); + .skip(1) + .rev() + .find_map(|(i, offset)| (offset.to_usize() < values.len()).then(|| i)); + + let last = if let Some(last) = last { + // following the example: last = 1 (offset = 5) + last + } else { + // given `l = values.len()`, this branch is hit iff either: + // * `offsets = [0, l, l, ...]`, which was covered by `from_utf8(values)` above + // * `offsets = [0]`, which never happens because offsets.as_slice().len() == 1 is short-circuited above + return Ok(()); + }; + + // truncate to relevant offsets. Note: `=last` because last was computed skipping the first item + // following the example: starts = [0, 5] + let starts = unsafe { offsets.get_unchecked(..=last) }; let mut any_invalid = false; - for start in &starts[..=last] { + for start in starts { let start = start.to_usize(); // Safety: `try_check_offsets_bounds` just checked for bounds @@ -117,7 +135,7 @@ mod tests { proptest! 
{ // a bit expensive, feel free to run it when changing the code above - //#![proptest_config(ProptestConfig::with_cases(100000))] + // #![proptest_config(ProptestConfig::with_cases(100000))] #[test] #[cfg_attr(miri, ignore)] // miri and proptest do not work well fn check_utf8_validation(values in binary_strategy()) { diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 36ce27b28bf..f8b8b86a8b8 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -51,7 +51,7 @@ impl> AsRef<[u8]> for StrAsBytes { /// // the underlying representation /// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true]))); /// assert_eq!(array.values(), &Buffer::from(b"hithere".to_vec())); -/// assert_eq!(array.offsets(), &Buffer::from(vec![0, 2, 2, 2 + 5])); +/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 2 + 5])); /// # } /// ``` /// diff --git a/src/compute/take/generic_binary.rs b/src/compute/take/generic_binary.rs index 4fc4d01138d..a9cf9c199c2 100644 --- a/src/compute/take/generic_binary.rs +++ b/src/compute/take/generic_binary.rs @@ -17,10 +17,10 @@ pub fn take_values( let mut buffer = Vec::with_capacity(new_len); starts .iter() - .zip(offsets.buffer().windows(2)) - .for_each(|(start_, window)| { - let start = start_.to_usize(); - let end = (*start_ + (window[1] - window[0])).to_usize(); + .map(|start| start.to_usize()) + .zip(offsets.lengths()) + .for_each(|(start, length)| { + let end = start + length; buffer.extend_from_slice(&values[start..end]); }); buffer.into() @@ -32,27 +32,16 @@ pub fn take_no_validity( values: &[u8], indices: &[I], ) -> (OffsetsBuffer, Buffer, Option) { - let mut length = O::zero(); let mut buffer = Vec::::new(); - let offsets = offsets.buffer(); - let offsets = indices.iter().map(|index| { - let index = index.to_usize(); - let start = offsets[index]; - let length_h = offsets[index + 1] - start; - length += length_h; - - let _start = start.to_usize(); - let end = (start + length_h).to_usize(); - 
buffer.extend_from_slice(&values[_start..end]); - length + let lengths = indices.iter().map(|index| index.to_usize()).map(|index| { + let (start, end) = offsets.start_end(index); + // todo: remove this bound check + buffer.extend_from_slice(&values[start..end]); + end - start }); - let offsets = std::iter::once(O::zero()) - .chain(offsets) - .collect::>(); - // Safety: offsets _are_ monotonically increasing - let offsets = unsafe { Offsets::new_unchecked(offsets) }.into(); + let offsets = Offsets::try_from_lengths(lengths).expect(""); - (offsets, buffer.into(), None) + (offsets.into(), buffer.into(), None) } // take implementation when only values contain nulls diff --git a/src/offset.rs b/src/offset.rs index 6d0878fa2b5..2337f082218 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -420,6 +420,12 @@ impl OffsetsBuffer { Self(self.0.slice_unchecked(offset, length)) } + /// Returns an iterator with the lengths of the offsets + #[inline] + pub fn lengths(&self) -> impl Iterator + '_ { + self.0.windows(2).map(|w| (w[1] - w[0]).to_usize()) + } + /// Returns the inner [`Buffer`]. #[inline] pub fn into_inner(self) -> Buffer { diff --git a/tests/it/io/ipc/read/file.rs b/tests/it/io/ipc/read/file.rs index 9d21d051f2a..515a6ede92f 100644 --- a/tests/it/io/ipc/read/file.rs +++ b/tests/it/io/ipc/read/file.rs @@ -106,6 +106,12 @@ fn read_generated_100_decimal() -> Result<()> { test_file("1.0.0-bigendian", "generated_decimal") } +#[test] +fn read_generated_duplicate_fieldnames() -> Result<()> { + test_file("1.0.0-littleendian", "generated_duplicate_fieldnames")?; + test_file("1.0.0-bigendian", "generated_duplicate_fieldnames") +} + #[test] fn read_generated_100_interval() -> Result<()> { test_file("1.0.0-littleendian", "generated_interval")?;