From 73077ea3a0d23dbc476fc5cc25fb11870f492d87 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sun, 13 Oct 2024 11:37:59 +0200 Subject: [PATCH 1/9] Add SparsUnion support, Add interval datatype --- marrow/src/array.rs | 36 +++++- marrow/src/datatypes.rs | 58 +++++++++- marrow/src/impl_arrow/impl_api_base.rs | 131 ++++++++++++++++------ marrow/src/impl_arrow2/impl.rs | 118 ++++++++++++++----- marrow/src/view.rs | 11 ++ test_with_arrow/src/lib.rs | 32 +++--- test_with_arrow/src/tests/union_arrays.rs | 90 ++++++++++++++- 7 files changed, 392 insertions(+), 84 deletions(-) diff --git a/marrow/src/array.rs b/marrow/src/array.rs index d810904..e562d96 100644 --- a/marrow/src/array.rs +++ b/marrow/src/array.rs @@ -7,7 +7,7 @@ use crate::{ view::{ BitsWithOffset, BooleanView, BytesView, DecimalView, DenseUnionView, DictionaryView, FixedSizeBinaryView, FixedSizeListView, ListView, MapView, NullView, PrimitiveView, - StructView, TimeView, TimestampView, View, + SparseUnionView, StructView, TimeView, TimestampView, View, }, }; @@ -83,8 +83,10 @@ pub enum Array { Dictionary(DictionaryArray), /// An array of maps Map(MapArray), - /// An array of unions + /// An array of unions with compact memory layout DenseUnion(DenseUnionArray), + /// An array of unions + SparseUnion(SparseUnionArray), } impl Array { @@ -123,6 +125,7 @@ impl Array { Self::Map(array) => View::Map(array.as_view()), Self::Dictionary(array) => View::Dictionary(array.as_view()), Self::DenseUnion(array) => View::DenseUnion(array.as_view()), + Self::SparseUnion(array) => View::SparseUnion(array.as_view()), } } } @@ -498,7 +501,7 @@ impl DictionaryArray { } } -/// A union of different data types +/// A union of different data types with a compact representation /// /// This corresponds roughly to Rust's enums. Each element has a type, which indicates the /// underlying array to use. For fast lookups the offsets into the underlying arrays are stored as @@ -515,7 +518,8 @@ pub struct DenseUnionArray { } impl DenseUnionArray { - fn as_view(&self) -> DenseUnionView<'_> { + /// Get the view for this array + pub fn as_view(&self) -> DenseUnionView<'_> { DenseUnionView { types: &self.types, offsets: &self.offsets, @@ -527,3 +531,27 @@ impl DenseUnionArray { } } } + +/// A union of different data types with a less compact representation +/// +#[derive(Debug, Clone, PartialEq)] +pub struct SparseUnionArray { + /// The types of each element + pub types: Vec, + /// The arrays with their metadata + pub fields: Vec<(i8, FieldMeta, Array)>, +} + +impl SparseUnionArray { + /// Get the view for this array + pub fn as_view(&self) -> SparseUnionView<'_> { + SparseUnionView { + types: &self.types, + fields: self + .fields + .iter() + .map(|(type_id, meta, array)| (*type_id, meta.clone(), array.as_view())) + .collect(), + } + } +} diff --git a/marrow/src/datatypes.rs b/marrow/src/datatypes.rs index cecbcdd..1d4ca33 100644 --- a/marrow/src/datatypes.rs +++ b/marrow/src/datatypes.rs @@ -177,6 +177,8 @@ pub enum DataType { Time64(TimeUnit), /// Durations stored as `i64` with the given unit Duration(TimeUnit), + /// Calendar intervals with different layouts depending on the given unit + Interval(IntervalUnit), /// Fixed point values stored with the given precision and scale Decimal128(u8, i8), /// Structs @@ -280,7 +282,7 @@ impl std::fmt::Display for UnionMode { impl std::str::FromStr for UnionMode { type Err = MarrowError; - fn from_str(s: &str) -> std::result::Result { + fn from_str(s: &str) -> Result { match s { "Sparse" => Ok(UnionMode::Sparse), "Dense" => Ok(UnionMode::Dense), @@ -306,3 +308,57 @@ fn union_mode_as_str() { assert_variant!(Dense); assert_variant!(Sparse); } + +/// The unit of calendar intervals +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum IntervalUnit { + /// An interval as the number of months, stored as `i32` + YearMonth, + /// An interval as the number of days, stored as `i32`, and milliseconds, stored as `i32` + DayTime, + /// An interval as the number of months (stored as `i32`), days (stored as `i32`) and nanoseconds (stored as `i64`) + MonthDayNano, +} + +impl std::fmt::Display for IntervalUnit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::YearMonth => write!(f, "YearMonth"), + Self::DayTime => write!(f, "DayTime"), + Self::MonthDayNano => write!(f, "MonthDayNano"), + } + } +} + +impl std::str::FromStr for IntervalUnit { + type Err = MarrowError; + + fn from_str(s: &str) -> Result { + match s { + "YearMonth" => Ok(Self::YearMonth), + "DayTime" => Ok(Self::DayTime), + "MonthDayNano" => Ok(Self::MonthDayNano), + s => fail!(ErrorKind::ParseError, "Invalid IntervalUnit: {s}"), + } + } +} + +#[test] +fn interval_unit() { + use std::str::FromStr; + + macro_rules! assert_variant { + ($variant:ident) => { + assert_eq!((IntervalUnit::$variant).to_string(), stringify!($variant)); + assert_eq!( + IntervalUnit::from_str(stringify!($variant)).unwrap(), + IntervalUnit::$variant + ); + }; + } + + assert_variant!(YearMonth); + assert_variant!(DayTime); + assert_variant!(MonthDayNano); +} diff --git a/marrow/src/impl_arrow/impl_api_base.rs b/marrow/src/impl_arrow/impl_api_base.rs index 9aa63b6..1319150 100644 --- a/marrow/src/impl_arrow/impl_api_base.rs +++ b/marrow/src/impl_arrow/impl_api_base.rs @@ -5,12 +5,12 @@ use half::f16; use crate::{ array::Array, - datatypes::{meta_from_field, DataType, Field, FieldMeta, TimeUnit, UnionMode}, + datatypes::{meta_from_field, DataType, Field, FieldMeta, IntervalUnit, TimeUnit, UnionMode}, error::{fail, ErrorKind, MarrowError, Result}, view::{ BitsWithOffset, BooleanView, BytesView, DecimalView, DenseUnionView, DictionaryView, - FixedSizeListView, ListView, MapView, NullView, PrimitiveView, StructView, TimeView, - TimestampView, View, + FixedSizeListView, ListView, MapView, NullView, PrimitiveView, SparseUnionView, StructView, + TimeView, TimestampView, View, }, }; @@ -54,6 +54,7 @@ impl TryFrom<&arrow_schema::DataType> for DataType { tz.as_ref().map(|s| s.to_string()), )), AT::Duration(unit) => Ok(T::Duration(unit.clone().try_into()?)), + AT::Interval(unit) => Ok(T::Interval(unit.clone().try_into()?)), AT::Binary => Ok(T::Binary), AT::LargeBinary => Ok(T::LargeBinary), AT::FixedSizeBinary(n) => Ok(T::FixedSizeBinary(*n)), @@ -136,6 +137,7 @@ impl TryFrom<&DataType> for arrow_schema::DataType { tz.as_ref().map(|s| s.to_string().into()), )), T::Duration(unit) => Ok(AT::Duration((*unit).try_into()?)), + T::Interval(unit) => Ok(AT::Interval((*unit).try_into()?)), T::Binary => Ok(AT::Binary), T::LargeBinary => Ok(AT::LargeBinary), T::FixedSizeBinary(n) => Ok(AT::FixedSizeBinary(*n)), @@ -234,7 +236,7 @@ impl TryFrom for arrow_schema::UnionMode { } } -/// Converison to `arrow` arrays (*requires one of the `arrow-{version}` features*) +/// Conversion to `arrow` arrays (*requires one of the `arrow-{version}` features*) impl TryFrom for Arc { type Error = MarrowError; @@ -243,6 +245,32 @@ impl TryFrom for Arc { } } +/// Conversion from `arrow` interval units (*requires one of the `arrow2-{version}` features*) +impl TryFrom for IntervalUnit { + type Error = MarrowError; + + fn try_from(value: arrow_schema::IntervalUnit) -> Result { + match value { + arrow_schema::IntervalUnit::YearMonth => Ok(IntervalUnit::YearMonth), + arrow_schema::IntervalUnit::DayTime => Ok(IntervalUnit::DayTime), + arrow_schema::IntervalUnit::MonthDayNano => Ok(IntervalUnit::MonthDayNano), + } + } +} + +/// Conversion to `arrow` interval units (*requires one of the `arrow2-{version}` features*) +impl TryFrom for arrow_schema::IntervalUnit { + type Error = MarrowError; + + fn try_from(value: IntervalUnit) -> Result { + match value { + IntervalUnit::YearMonth => Ok(arrow_schema::IntervalUnit::YearMonth), + IntervalUnit::DayTime => Ok(arrow_schema::IntervalUnit::DayTime), + IntervalUnit::MonthDayNano => Ok(arrow_schema::IntervalUnit::MonthDayNano), + } + } +} + fn build_array_data(value: Array) -> Result { use Array as A; type ArrowF16 = @@ -466,17 +494,7 @@ fn build_array_data(value: Array) -> Result { )?) } A::DenseUnion(arr) => { - let mut fields = Vec::new(); - let mut child_data = Vec::new(); - - for (type_id, meta, array) in arr.fields { - let child = build_array_data(array)?; - let field = field_from_data_and_meta(&child, meta); - - fields.push((type_id, Arc::new(field))); - child_data.push(child); - } - + let (fields, child_data) = union_fields_into_fields_and_data(arr.fields)?; Ok(arrow_data::ArrayData::try_new( arrow_schema::DataType::Union( fields.into_iter().collect(), @@ -492,9 +510,43 @@ fn build_array_data(value: Array) -> Result { child_data, )?) } + A::SparseUnion(arr) => { + let (fields, child_data) = union_fields_into_fields_and_data(arr.fields)?; + Ok(arrow_data::ArrayData::try_new( + arrow_schema::DataType::Union( + fields.into_iter().collect(), + arrow_schema::UnionMode::Sparse, + ), + arr.types.len(), + None, + 0, + vec![arrow_buffer::ScalarBuffer::from(arr.types).into_inner()], + child_data, + )?) + } } } +fn union_fields_into_fields_and_data( + union_fields: Vec<(i8, FieldMeta, Array)>, +) -> Result<( + Vec<(i8, arrow_schema::FieldRef)>, + Vec, +)> { + let mut fields = Vec::new(); + let mut child_data = Vec::new(); + + for (type_id, meta, array) in union_fields { + let child = build_array_data(array)?; + let field = field_from_data_and_meta(&child, meta); + + fields.push((type_id, Arc::new(field))); + child_data.push(child); + } + + Ok((fields, child_data)) +} + /// Converison from `arrow` arrays (*requires one of the `arrow-{version}` features*) impl<'a> TryFrom<&'a dyn arrow_array::Array> for View<'a> { type Error = MarrowError; @@ -801,13 +853,8 @@ impl<'a> TryFrom<&'a dyn arrow_array::Array> for View<'a> { } else if let Some(array) = any.downcast_ref::() { use arrow_array::Array; - let arrow_schema::DataType::Union(union_fields, arrow_schema::UnionMode::Dense) = - array.data_type() - else { - fail!( - ErrorKind::Unsupported, - "Invalid data type: only dense unions are supported" - ); + let arrow_schema::DataType::Union(union_fields, mode) = array.data_type() else { + fail!(ErrorKind::Unsupported, "Invalid data type for UnionArray"); }; let mut fields = Vec::new(); @@ -816,18 +863,36 @@ impl<'a> TryFrom<&'a dyn arrow_array::Array> for View<'a> { let view: View = array.child(type_id).as_ref().try_into()?; fields.push((type_id, meta, view)); } - let Some(offsets) = array.offsets() else { - fail!( - ErrorKind::Unsupported, - "Dense unions must have an offset array" - ); - }; - Ok(View::DenseUnion(DenseUnionView { - types: array.type_ids(), - offsets, - fields, - })) + match mode { + arrow_schema::UnionMode::Dense => { + let Some(offsets) = array.offsets() else { + fail!( + ErrorKind::Unsupported, + "Dense unions must have an offset array" + ); + }; + + Ok(View::DenseUnion(DenseUnionView { + types: array.type_ids(), + offsets, + fields, + })) + } + arrow_schema::UnionMode::Sparse => { + if array.offsets().is_some() { + fail!( + ErrorKind::Unsupported, + "Sparse unions must not have an offset array" + ); + }; + + Ok(View::SparseUnion(SparseUnionView { + types: array.type_ids(), + fields, + })) + } + } } else { fail!( ErrorKind::Unsupported, diff --git a/marrow/src/impl_arrow2/impl.rs b/marrow/src/impl_arrow2/impl.rs index 5127a70..4e15388 100644 --- a/marrow/src/impl_arrow2/impl.rs +++ b/marrow/src/impl_arrow2/impl.rs @@ -2,12 +2,12 @@ use std::borrow::Cow; use crate::{ array::{Array, PrimitiveArray}, - datatypes::{meta_from_field, DataType, Field, FieldMeta, TimeUnit, UnionMode}, + datatypes::{meta_from_field, DataType, Field, FieldMeta, IntervalUnit, TimeUnit, UnionMode}, error::{fail, ErrorKind, MarrowError, Result}, view::{ BitsWithOffset, BooleanView, BytesView, DecimalView, DenseUnionView, DictionaryView, FixedSizeBinaryView, FixedSizeListView, ListView, MapView, NullView, PrimitiveView, - StructView, TimeView, TimestampView, View, + SparseUnionView, StructView, TimeView, TimestampView, View, }, }; @@ -45,6 +45,7 @@ impl TryFrom<&arrow2::datatypes::DataType> for DataType { AT::Time32(unit) => Ok(T::Time32((*unit).try_into()?)), AT::Time64(unit) => Ok(T::Time64((*unit).try_into()?)), AT::Duration(unit) => Ok(T::Duration((*unit).try_into()?)), + AT::Interval(unit) => Ok(T::Interval((*unit).try_into()?)), AT::Timestamp(unit, tz) => Ok(T::Timestamp((*unit).try_into()?, tz.clone())), AT::Decimal(precision, scale) => { if *precision > u8::MAX as usize || *scale > i8::MAX as usize { @@ -157,6 +158,7 @@ impl TryFrom<&DataType> for arrow2::datatypes::DataType { T::Date32 => Ok(AT::Date32), T::Date64 => Ok(AT::Date64), T::Duration(unit) => Ok(AT::Duration((*unit).try_into()?)), + T::Interval(unit) => Ok(AT::Interval((*unit).try_into()?)), T::Time32(unit) => Ok(AT::Time32((*unit).try_into()?)), T::Time64(unit) => Ok(AT::Time64((*unit).try_into()?)), T::Timestamp(unit, tz) => Ok(AT::Timestamp((*unit).try_into()?, tz.clone())), @@ -314,6 +316,32 @@ impl TryFrom for arrow2::datatypes::UnionMode { } } +/// Conversion from `arrow2` interval units modes (*requires one of the `arrow2-{version}` features*) +impl TryFrom for IntervalUnit { + type Error = MarrowError; + + fn try_from(value: arrow2::datatypes::IntervalUnit) -> Result { + match value { + arrow2::datatypes::IntervalUnit::YearMonth => Ok(IntervalUnit::YearMonth), + arrow2::datatypes::IntervalUnit::DayTime => Ok(IntervalUnit::DayTime), + arrow2::datatypes::IntervalUnit::MonthDayNano => Ok(IntervalUnit::MonthDayNano), + } + } +} + +/// Conversion to `arrow2` interval units modes (*requires one of the `arrow2-{version}` features*) +impl TryFrom for arrow2::datatypes::IntervalUnit { + type Error = MarrowError; + + fn try_from(value: IntervalUnit) -> Result { + match value { + IntervalUnit::YearMonth => Ok(arrow2::datatypes::IntervalUnit::YearMonth), + IntervalUnit::DayTime => Ok(arrow2::datatypes::IntervalUnit::DayTime), + IntervalUnit::MonthDayNano => Ok(arrow2::datatypes::IntervalUnit::MonthDayNano), + } + } +} + /// Conversion to `arrow2` arrays (*requires one of the `arrow2-{version}` features*) impl TryFrom for Box { type Error = MarrowError; @@ -445,19 +473,7 @@ impl TryFrom for Box { ))) } A::DenseUnion(arr) => { - let mut values = Vec::new(); - let mut fields = Vec::new(); - let mut type_ids = Vec::new(); - - for (type_id, meta, child) in arr.fields { - let child: Box = child.try_into()?; - let field = field_from_array_and_meta(child.as_ref(), meta); - - type_ids.push(type_id.into()); - values.push(child); - fields.push(field); - } - + let (type_ids, fields, values) = convert_union_fields(arr.fields)?; Ok(Box::new(arrow2::array::UnionArray::try_new( AT::Union(fields, Some(type_ids), arrow2::datatypes::UnionMode::Dense), arr.types.into(), @@ -465,6 +481,15 @@ impl TryFrom for Box { Some(arr.offsets.into()), )?)) } + A::SparseUnion(arr) => { + let (type_ids, fields, values) = convert_union_fields(arr.fields)?; + Ok(Box::new(arrow2::array::UnionArray::try_new( + AT::Union(fields, Some(type_ids), arrow2::datatypes::UnionMode::Sparse), + arr.types.into(), + values, + None, + )?)) + } A::FixedSizeList(arr) => { let child: Box = (*arr.elements).try_into()?; let child_field = field_from_array_and_meta(child.as_ref(), arr.meta); @@ -494,6 +519,29 @@ impl TryFrom for Box { } } +fn convert_union_fields( + union_fields: Vec<(i8, FieldMeta, Array)>, +) -> Result<( + Vec, + Vec, + Vec>, +)> { + let mut values = Vec::new(); + let mut fields = Vec::new(); + let mut type_ids = Vec::new(); + + for (type_id, meta, child) in union_fields { + let child: Box = child.try_into()?; + let field = field_from_array_and_meta(child.as_ref(), meta); + + type_ids.push(type_id.into()); + values.push(child); + fields.push(field); + } + + Ok((type_ids, fields, values)) +} + fn build_primitive_array( data_type: arrow2::datatypes::DataType, buffer: Vec, @@ -794,9 +842,7 @@ impl<'a> TryFrom<&'a dyn arrow2::array::Array> for View<'a> { array.offsets().as_slice(), )?)) } else if let Some(array) = any.downcast_ref::() { - let AT::Union(union_fields, type_ids, arrow2::datatypes::UnionMode::Dense) = - array.data_type() - else { + let AT::Union(union_fields, type_ids, mode) = array.data_type() else { fail!( ErrorKind::Unsupported, "Invalid data type: only dense unions are supported" @@ -814,12 +860,6 @@ impl<'a> TryFrom<&'a dyn arrow2::array::Array> for View<'a> { }; let types = array.types().as_slice(); - let Some(offsets) = array.offsets() else { - fail!( - ErrorKind::Unsupported, - "DenseUnion array without offsets are not supported" - ); - }; let mut fields = Vec::new(); for ((type_id, child), child_field) in @@ -832,11 +872,31 @@ impl<'a> TryFrom<&'a dyn arrow2::array::Array> for View<'a> { )); } - Ok(V::DenseUnion(DenseUnionView { - types, - offsets: offsets.as_slice(), - fields, - })) + match mode { + arrow2::datatypes::UnionMode::Dense => { + let Some(offsets) = array.offsets() else { + fail!( + ErrorKind::Unsupported, + "DenseUnion array without offsets are not supported" + ); + }; + + Ok(V::DenseUnion(DenseUnionView { + types, + offsets: offsets.as_slice(), + fields, + })) + } + arrow2::datatypes::UnionMode::Sparse => { + if array.offsets().is_some() { + fail!( + ErrorKind::Unsupported, + "SparseUnion array with offsets are not supported" + ); + }; + Ok(V::SparseUnion(SparseUnionView { types, fields })) + } + } } else if let Some(array) = any.downcast_ref::() { let AT::FixedSizeList(field, _) = array.data_type() else { fail!( diff --git a/marrow/src/view.rs b/marrow/src/view.rs index a19e371..8097359 100644 --- a/marrow/src/view.rs +++ b/marrow/src/view.rs @@ -84,6 +84,8 @@ pub enum View<'a> { Map(MapView<'a>), /// See [`Array::DenseUnion`][crate::array::Array::DenseUnion] DenseUnion(DenseUnionView<'a>), + /// See [`Array::SparseUnion`][crate::array::Array::SparseUnion] + SparseUnion(SparseUnionView<'a>), } /// A bitmap with an optional offset @@ -288,3 +290,12 @@ pub struct DenseUnionView<'a> { /// See [`DenseUnionArray::fields`][crate::array::DenseUnionArray::fields] pub fields: Vec<(i8, FieldMeta, View<'a>)>, } + +/// See [`SparseUnionArray`][crate::array::SparseUnionArray] +#[derive(Debug, Clone, PartialEq)] +pub struct SparseUnionView<'a> { + /// See [`SparseUnionArray::types`][crate::array::SparseUnionArray::types] + pub types: &'a [i8], + /// See [`SparseUnionArray::fields`][crate::array::SparseUnionArray::fields] + pub fields: Vec<(i8, FieldMeta, View<'a>)>, +} diff --git a/test_with_arrow/src/lib.rs b/test_with_arrow/src/lib.rs index a718276..17fb4f3 100644 --- a/test_with_arrow/src/lib.rs +++ b/test_with_arrow/src/lib.rs @@ -16,21 +16,21 @@ macro_rules! define_test_module { }; } -// arrow-version:insert: define_test_module!("arrow-{version}", arrow_{version}, arrow_array_{version}, arrow_schema_{version}, utils, arrays, data_types, union_arrays); -define_test_module!("arrow-53", arrow_53, arrow_array_53, arrow_schema_53, utils, arrays, data_types, struct_arrays, fixed_size_binary_arrays, union_arrays); -define_test_module!("arrow-52", arrow_52, arrow_array_52, arrow_schema_52, utils, arrays, data_types, struct_arrays, fixed_size_binary_arrays, union_arrays); -define_test_module!("arrow-51", arrow_51, arrow_array_51, arrow_schema_51, utils, arrays, data_types, struct_arrays, fixed_size_binary_arrays); -define_test_module!("arrow-50", arrow_50, arrow_array_50, arrow_schema_50, utils, arrays, data_types, struct_arrays, fixed_size_binary_arrays); -define_test_module!("arrow-49", arrow_49, arrow_array_49, arrow_schema_49, utils, arrays, data_types, struct_arrays, fixed_size_binary_arrays); -define_test_module!("arrow-48", arrow_48, arrow_array_48, arrow_schema_48, utils, arrays, data_types, struct_arrays, fixed_size_binary_arrays); -define_test_module!("arrow-47", arrow_47, arrow_array_47, arrow_schema_47, utils, arrays, data_types, struct_arrays, fixed_size_binary_arrays); -define_test_module!("arrow-46", arrow_46, arrow_array_46, arrow_schema_46, utils, arrays, data_types, struct_arrays); -define_test_module!("arrow-45", arrow_45, arrow_array_45, arrow_schema_45, utils, arrays, data_types, struct_arrays); -define_test_module!("arrow-44", arrow_44, arrow_array_44, arrow_schema_44, utils, arrays, data_types, struct_arrays); -define_test_module!("arrow-43", arrow_43, arrow_array_43, arrow_schema_43, utils, arrays, data_types, struct_arrays); -define_test_module!("arrow-42", arrow_42, arrow_array_42, arrow_schema_42, utils, arrays, data_types, struct_arrays); -define_test_module!("arrow-41", arrow_41, arrow_array_41, arrow_schema_41, utils, arrays, data_types, struct_arrays); -define_test_module!("arrow-40", arrow_40, arrow_array_40, arrow_schema_40, utils, arrays, data_types, struct_arrays); -define_test_module!("arrow-39", arrow_39, arrow_array_39, arrow_schema_39, utils, arrays, data_types, struct_arrays); +// arrow-version:insert: define_test_module!("arrow-{version}", arrow_{version}, arrow_array_{version}, arrow_schema_{version}, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, intervals, union_arrays); +define_test_module!("arrow-53", arrow_53, arrow_array_53, arrow_schema_53, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, union_arrays); +define_test_module!("arrow-52", arrow_52, arrow_array_52, arrow_schema_52, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, union_arrays); +define_test_module!("arrow-51", arrow_51, arrow_array_51, arrow_schema_51, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays); +define_test_module!("arrow-50", arrow_50, arrow_array_50, arrow_schema_50, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays); +define_test_module!("arrow-49", arrow_49, arrow_array_49, arrow_schema_49, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays); +define_test_module!("arrow-48", arrow_48, arrow_array_48, arrow_schema_48, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays); +define_test_module!("arrow-47", arrow_47, arrow_array_47, arrow_schema_47, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays); +define_test_module!("arrow-46", arrow_46, arrow_array_46, arrow_schema_46, utils, arrays, data_types,struct_arrays); +define_test_module!("arrow-45", arrow_45, arrow_array_45, arrow_schema_45, utils, arrays, data_types,struct_arrays); +define_test_module!("arrow-44", arrow_44, arrow_array_44, arrow_schema_44, utils, arrays, data_types,struct_arrays); +define_test_module!("arrow-43", arrow_43, arrow_array_43, arrow_schema_43, utils, arrays, data_types,struct_arrays); +define_test_module!("arrow-42", arrow_42, arrow_array_42, arrow_schema_42, utils, arrays, data_types,struct_arrays); +define_test_module!("arrow-41", arrow_41, arrow_array_41, arrow_schema_41, utils, arrays, data_types,struct_arrays); +define_test_module!("arrow-40", arrow_40, arrow_array_40, arrow_schema_40, utils, arrays, data_types,struct_arrays); +define_test_module!("arrow-39", arrow_39, arrow_array_39, arrow_schema_39, utils, arrays, data_types,struct_arrays); define_test_module!("arrow-38", arrow_38, arrow_array_38, arrow_schema_38, utils, arrays, data_types); define_test_module!("arrow-37", arrow_37, arrow_array_37, arrow_schema_37, utils, arrays, data_types); diff --git a/test_with_arrow/src/tests/union_arrays.rs b/test_with_arrow/src/tests/union_arrays.rs index 0c35cdc..0cc1d5e 100644 --- a/test_with_arrow/src/tests/union_arrays.rs +++ b/test_with_arrow/src/tests/union_arrays.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use marrow::{ - array::{Array, DenseUnionArray, PrimitiveArray}, + array::{Array, DenseUnionArray, PrimitiveArray, SparseUnionArray}, datatypes::FieldMeta, }; @@ -108,3 +108,91 @@ mod dense_union_array { ) } } + +mod sparse_union_array { + use super::*; + + use arrow_array::{ArrayRef, Float64Array, Int32Array, UnionArray}; + + // Adapted from the arrow docs + // + // Source: https://github.com/apache/arrow-rs/blob/065c7b8f94264eeb6a1ca23a92795fc4e0d31d51/arrow-array/src/array/union_array.rs#L87 + // License: ../../LICENSE.arrow.txt + // Notice: ../../NOTICE.arrow.txt + // + // Original notice: + // + // Licensed to the Apache Software Foundation (ASF) under one + // or more contributor license agreements. See the NOTICE file + // distributed with this work for additional information + // regarding copyright ownership. The ASF licenses this file + // to you under the Apache License, Version 2.0 (the + // "License"); you may not use this file except in compliance + // with the License. You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, + // software distributed under the License is distributed on an + // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + // KIND, either express or implied. See the License for the + // specific language governing permissions and limitations + // under the License. + fn example_array() -> PanicOnError { + use arrow_schema::{DataType, Field}; + + let int_array = Int32Array::from(vec![Some(1), None, Some(34)]); + let float_array = Float64Array::from(vec![None, Some(3.2), None]); + let type_ids = vec![0_i8, 1, 0]; + let union_fields = vec![ + (0, Arc::new(Field::new("A", DataType::Int32, false))), + (1, Arc::new(Field::new("B", DataType::Float64, false))), + ]; + + let children = vec![Arc::new(int_array) as ArrayRef, Arc::new(float_array)]; + + let array = UnionArray::try_new( + union_fields.into_iter().collect(), + type_ids.into_iter().collect(), + None, + children, + )?; + Ok(Arc::new(array) as ArrayRef) + } + + #[test] + fn example() -> PanicOnError<()> { + assert_arrays_eq( + example_array()?, + Array::SparseUnion(SparseUnionArray { + types: vec![0, 1, 0], + fields: vec![ + ( + 0, + // NOTE: the fields are explicitly set as non-nullable + FieldMeta { + name: String::from("A"), + ..Default::default() + }, + Array::Int32(PrimitiveArray { + validity: Some(vec![0b_101]), + values: vec![1, 0, 34], + }), + ), + ( + 1, + // NOTE: the fields are explicitly set as non-nullable + FieldMeta { + name: String::from("B"), + ..Default::default() + }, + Array::Float64(PrimitiveArray { + validity: Some(vec![0b_010]), + values: vec![0.0, 3.2, 0.0], + }), + ), + ], + }), + ) + } +} From 05c88177f877d09ce5a7500e3166f48ce25935ad Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sun, 13 Oct 2024 13:47:29 +0200 Subject: [PATCH 2/9] Add support for interval types --- marrow/Cargo.toml | 8 +- marrow/src/array.rs | 16 +++ marrow/src/error.rs | 17 +++ marrow/src/impl_arrow/impl_api_base.rs | 48 ++++++++ marrow/src/impl_arrow2/impl.rs | 4 + marrow/src/lib.rs | 2 + marrow/src/types.rs | 35 ++++++ marrow/src/view.rs | 7 ++ test_with_arrow/src/lib.rs | 4 +- test_with_arrow/src/tests/arrays.rs | 160 +++++++++++++++++++++++++ test_with_arrow/src/tests/intervals.rs | 45 +++++++ 11 files changed, 340 insertions(+), 6 deletions(-) create mode 100644 marrow/src/types.rs create mode 100644 test_with_arrow/src/tests/intervals.rs diff --git a/marrow/Cargo.toml b/marrow/Cargo.toml index 98f6877..9ee68d1 100644 --- a/marrow/Cargo.toml +++ b/marrow/Cargo.toml @@ -40,11 +40,13 @@ arrow-38 = ["dep:arrow-array-38", "dep:arrow-schema-38", "dep:arrow-data-38", "d arrow-37 = ["dep:arrow-array-37", "dep:arrow-schema-37", "dep:arrow-data-37", "dep:arrow-buffer-37"] # support for different arrow2 versions -arrow2-0-17 = ["dep:arrow2-0-17", "dep:bytemuck", "half/bytemuck"] -arrow2-0-16 = ["dep:arrow2-0-16", "dep:bytemuck", "half/bytemuck"] +arrow2-0-17 = ["dep:arrow2-0-17", "half/bytemuck"] +arrow2-0-16 = ["dep:arrow2-0-16", "half/bytemuck"] [dependencies] +bytemuck = { version = "1", default-features = false, features = ["derive"] } half = { version = "2", default-features = false } + serde = { version = "1.0", default-features = false, features = ["std", "derive"], optional = true } # arrow-version:insert: arrow-array-{version} = {{ package = "arrow-array", version = "{version}", optional = true, default-features = false }} @@ -125,5 +127,3 @@ arrow-schema-37 = { package = "arrow-schema", version = "37", optional = true, d arrow2-0-17 = { package = "arrow2", version = "0.17", optional = true, default-features = false } arrow2-0-16 = { package = "arrow2", version = "0.16", optional = true, default-features = false } - -bytemuck = { version = "1", optional = true, default-features = false } diff --git a/marrow/src/array.rs b/marrow/src/array.rs index e562d96..2503f51 100644 --- a/marrow/src/array.rs +++ b/marrow/src/array.rs @@ -4,6 +4,7 @@ use half::f16; use crate::{ datatypes::{FieldMeta, MapMeta, TimeUnit}, error::{fail, ErrorKind, Result}, + types::{DayTimeInterval, MonthDayNanoInterval}, view::{ BitsWithOffset, BooleanView, BytesView, DecimalView, DenseUnionView, DictionaryView, FixedSizeBinaryView, FixedSizeListView, ListView, MapView, NullView, PrimitiveView, @@ -59,6 +60,18 @@ pub enum Array { Timestamp(TimestampArray), /// An `i64` array of durations Duration(TimeArray), + /// Interval with `YearMonth` unit + /// + /// Interval arrays are not supported for `arrow2`. + YearMonthInterval(PrimitiveArray), + /// Interval with `DayTime` unit + /// + /// Interval arrays are not supported for `arrow2`. + DayTimeInterval(PrimitiveArray), + /// Interval with `MonthDayNano` unit + /// + /// Interval arrays are not supported for `arrow2`. + MonthDayNanoInterval(PrimitiveArray), /// A `[u8]` array with `i32` offsets of strings Utf8(BytesArray), /// A `[u8]` array with `i64` offsets of strings @@ -113,6 +126,9 @@ impl Array { Self::Time64(array) => View::Time64(array.as_view()), Self::Timestamp(array) => View::Timestamp(array.as_view()), Self::Duration(array) => View::Duration(array.as_view()), + Self::YearMonthInterval(array) => View::YearMonthInterval(array.as_view()), + Self::DayTimeInterval(array) => View::DayTimeInterval(array.as_view()), + Self::MonthDayNanoInterval(array) => View::MonthDayNanoInterval(array.as_view()), Self::Binary(array) => View::Binary(array.as_view()), Self::LargeBinary(array) => View::LargeBinary(array.as_view()), Self::FixedSizeBinary(array) => View::FixedSizeBinary(array.as_view()), diff --git a/marrow/src/error.rs b/marrow/src/error.rs index 2d805fa..62ec3bd 100644 --- a/marrow/src/error.rs +++ b/marrow/src/error.rs @@ -163,3 +163,20 @@ impl From for MarrowError { ) } } + +impl From for MarrowError { + fn from(err: bytemuck::PodCastError) -> Self { + let err = match err { + bytemuck::PodCastError::TargetAlignmentGreaterAndInputNotAligned => { + "TargetAlignmentGreaterAndInputNotAligned" + } + bytemuck::PodCastError::OutputSliceWouldHaveSlop => "OutputSliceWouldHaveSlop", + bytemuck::PodCastError::SizeMismatch => "SizeMismatch", + bytemuck::PodCastError::AlignmentMismatch => "AlignmentMismatch", + }; + MarrowError::new( + ErrorKind::Unsupported, + format!("bytemuck::PodCastError: {err}"), + ) + } +} diff --git a/marrow/src/impl_arrow/impl_api_base.rs b/marrow/src/impl_arrow/impl_api_base.rs index 1319150..221e7a9 100644 --- a/marrow/src/impl_arrow/impl_api_base.rs +++ b/marrow/src/impl_arrow/impl_api_base.rs @@ -353,6 +353,26 @@ fn build_array_data(value: Array) -> Result { arr.validity, arr.values, ), + A::YearMonthInterval(arr) => primitive_into_data( + arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth), + arr.validity, + arr.values, + ), + A::DayTimeInterval(arr) => primitive_into_data( + arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::DayTime), + arr.validity, + // NOTE: bytemuck::allocation::try_cast_vec enforces exact alignment. This cannot be + // guaranteed between different arrow version (arrow < 52 used i64, arrow >= 52 has its + // own type with different alignment). Therefore covert the vector elementwise and + // create a new vector. + try_cast_vec::<_, i64>(arr.values)?, + ), + A::MonthDayNanoInterval(arr) => primitive_into_data( + arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano), + arr.validity, + // See note for A::DayTimeInterval + try_cast_vec::<_, i128>(arr.values)?, + ), A::Decimal128(arr) => primitive_into_data( arrow_schema::DataType::Decimal128(arr.precision, arr.scale), arr.validity, @@ -725,6 +745,26 @@ impl<'a> TryFrom<&'a dyn arrow_array::Array> for View<'a> { validity: get_bits_with_offset(array), values: array.values(), })) + } else if let Some(array) = any.downcast_ref::() { + Ok(View::YearMonthInterval(PrimitiveView { + validity: get_bits_with_offset(array), + values: array.values(), + })) + } else if let Some(array) = any.downcast_ref::() { + Ok(View::DayTimeInterval(PrimitiveView { + validity: get_bits_with_offset(array), + // bytemuck checks the dynamically. This check always succeeds if the the target + // alignment is smaller or equal to the source alignment. This is the case here, as + // structs are aligned to their largest field (which is at most 64 bits) and arrow + // aligns to 64 bits. + values: bytemuck::try_cast_slice(array.values().inner().as_slice())?, + })) + } else if let Some(array) = any.downcast_ref::() { + Ok(View::MonthDayNanoInterval(PrimitiveView { + validity: get_bits_with_offset(array), + // See note for DayTimeInterval + values: bytemuck::try_cast_slice(array.values().inner().as_slice())?, + })) } else if let Some(array) = any.downcast_ref::() { Ok(View::Utf8(BytesView { validity: get_bits_with_offset(array), @@ -987,3 +1027,11 @@ fn get_bits_with_offset(array: &dyn arrow_array::Array) -> Option(a: Vec) -> Result> { + let mut res = Vec::new(); + for item in a { + res.push(bytemuck::try_cast(item)?); + } + Ok(res) +} diff --git a/marrow/src/impl_arrow2/impl.rs b/marrow/src/impl_arrow2/impl.rs index 4e15388..d8e701d 100644 --- a/marrow/src/impl_arrow2/impl.rs +++ b/marrow/src/impl_arrow2/impl.rs @@ -515,6 +515,10 @@ impl TryFrom for Box { validity, )?)) } + A::YearMonthInterval(_) | A::DayTimeInterval(_) | A::MonthDayNanoInterval(_) => fail!( + ErrorKind::Unsupported, + "Interval arrays are not supported for arrow2" + ), } } } diff --git a/marrow/src/lib.rs b/marrow/src/lib.rs index 9f777f8..b46ec87 100644 --- a/marrow/src/lib.rs +++ b/marrow/src/lib.rs @@ -128,6 +128,8 @@ pub mod datatypes; #[deny(missing_docs)] pub mod error; #[deny(missing_docs)] +pub mod types; +#[deny(missing_docs)] pub mod view; mod impl_arrow; diff --git a/marrow/src/types.rs b/marrow/src/types.rs new file mode 100644 index 0000000..ec64d1e --- /dev/null +++ b/marrow/src/types.rs @@ -0,0 +1,35 @@ +//! Specialized element types of arrays + +/// Represent a calendar interval as days and milliseconds +#[derive(Debug, PartialEq, Clone, Copy, bytemuck::AnyBitPattern, bytemuck::NoUninit)] +#[repr(C)] +pub struct DayTimeInterval { + /// The number of days in the interval + pub days: i32, + /// The number of milliseconds in the interval + pub milliseconds: i32, +} + +/// Represent a calendar interval as months, days and nanoseconds +#[derive(Debug, PartialEq, Clone, Copy, bytemuck::AnyBitPattern, bytemuck::NoUninit)] +#[repr(C)] +pub struct MonthDayNanoInterval { + /// The number of months in the interval + pub months: i32, + /// The number of days in the interval + pub days: i32, + /// The number of nanoseconds in the interval + pub nanoseconds: i64, +} + +#[test] +fn interval_sizes() { + assert_eq!( + std::mem::size_of::(), + std::mem::size_of::() + ); + assert_eq!( + std::mem::size_of::(), + std::mem::size_of::() + ); +} diff --git a/marrow/src/view.rs b/marrow/src/view.rs index 8097359..5a01cc0 100644 --- a/marrow/src/view.rs +++ b/marrow/src/view.rs @@ -6,6 +6,7 @@ use half::f16; use crate::{ datatypes::{FieldMeta, MapMeta, TimeUnit}, error::{fail, ErrorKind, Result}, + types::{DayTimeInterval, MonthDayNanoInterval}, }; // assert that the `Array` implements the expected traits @@ -58,6 +59,12 @@ pub enum View<'a> { Timestamp(TimestampView<'a>), /// See [`Array::Duration`][crate::array::Array::Duration] Duration(TimeView<'a, i64>), + /// See [`Array::YearMonthInterval`][crate::array::Array::YearMonthInterval] + YearMonthInterval(PrimitiveView<'a, i32>), + /// See [`Array::DayTimeInterval`][crate::array::Array::DayTimeInterval] + DayTimeInterval(PrimitiveView<'a, DayTimeInterval>), + /// See [`Array::MonthDayNanoInterval`][crate::array::Array::MonthDayNanoInterval] + MonthDayNanoInterval(PrimitiveView<'a, MonthDayNanoInterval>), /// See [`Array::Utf8`][crate::array::Array::Utf8] Utf8(BytesView<'a, i32>), /// See [`Array::LargeUtf8`][crate::array::Array::LargeUtf8] diff --git a/test_with_arrow/src/lib.rs b/test_with_arrow/src/lib.rs index 17fb4f3..09e92de 100644 --- a/test_with_arrow/src/lib.rs +++ b/test_with_arrow/src/lib.rs @@ -17,8 +17,8 @@ macro_rules! define_test_module { } // arrow-version:insert: define_test_module!("arrow-{version}", arrow_{version}, arrow_array_{version}, arrow_schema_{version}, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, intervals, union_arrays); -define_test_module!("arrow-53", arrow_53, arrow_array_53, arrow_schema_53, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, union_arrays); -define_test_module!("arrow-52", arrow_52, arrow_array_52, arrow_schema_52, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, union_arrays); +define_test_module!("arrow-53", arrow_53, arrow_array_53, arrow_schema_53, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, intervals, union_arrays); +define_test_module!("arrow-52", arrow_52, arrow_array_52, arrow_schema_52, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, intervals, union_arrays); define_test_module!("arrow-51", arrow_51, arrow_array_51, arrow_schema_51, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays); define_test_module!("arrow-50", arrow_50, arrow_array_50, arrow_schema_50, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays); define_test_module!("arrow-49", arrow_49, arrow_array_49, arrow_schema_49, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays); diff --git a/test_with_arrow/src/tests/arrays.rs b/test_with_arrow/src/tests/arrays.rs index 155fd15..ccb6479 100644 --- a/test_with_arrow/src/tests/arrays.rs +++ b/test_with_arrow/src/tests/arrays.rs @@ -752,6 +752,166 @@ mod duration_nanosecond { } } +mod interval_year_month { + use super::*; + + #[test] + fn not_nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![1, 2, 3]), + Array::YearMonthInterval(PrimitiveArray { + validity: None, + values: vec![1, 2, 3], + }), + ) + } + + #[test] + fn nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![Some(1), None, Some(3)]), + Array::YearMonthInterval(PrimitiveArray { + validity: Some(vec![0b_101]), + values: vec![1, 0, 3], + }), + ) + } +} + +mod internval_day_time { + use super::*; + + use marrow::types::DayTimeInterval; + + #[test] + fn not_nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![ + arrow_array::types::IntervalDayTimeType::make_value(1, 2), + arrow_array::types::IntervalDayTimeType::make_value(3, 4), + arrow_array::types::IntervalDayTimeType::make_value(5, 6), + ]), + Array::DayTimeInterval(PrimitiveArray { + validity: None, + values: vec![ + DayTimeInterval { + days: 1, + milliseconds: 2, + }, + DayTimeInterval { + days: 3, + milliseconds: 4, + }, + DayTimeInterval { + days: 5, + milliseconds: 6, + }, + ], + }), + ) + } + + #[test] + fn nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![ + Some(arrow_array::types::IntervalDayTimeType::make_value(1, 2)), + None, + Some(arrow_array::types::IntervalDayTimeType::make_value(5, 6)), + ]), + Array::DayTimeInterval(PrimitiveArray { + validity: Some(vec![0b_101]), + values: vec![ + DayTimeInterval { + days: 1, + milliseconds: 2, + }, + DayTimeInterval { + days: 0, + milliseconds: 0, + }, + DayTimeInterval { + days: 5, + milliseconds: 6, + }, + ], + }), + ) + } +} + +mod interval_month_day_nano { + use super::*; + + use marrow::types::MonthDayNanoInterval; + + #[test] + fn not_nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![ + arrow_array::types::IntervalMonthDayNanoType::make_value(1, 2, 3), + arrow_array::types::IntervalMonthDayNanoType::make_value(4, 5, 6), + arrow_array::types::IntervalMonthDayNanoType::make_value(7, 8, 9), + ]), + Array::MonthDayNanoInterval(PrimitiveArray { + validity: None, + values: vec![ + MonthDayNanoInterval { + months: 1, + days: 2, + nanoseconds: 3, + }, + MonthDayNanoInterval { + months: 4, + days: 5, + nanoseconds: 6, + }, + MonthDayNanoInterval { + months: 7, + days: 8, + nanoseconds: 9, + }, + ], + }), + ) + } + + #[test] + fn nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![ + Some(arrow_array::types::IntervalMonthDayNanoType::make_value( + 1, 2, 3, + )), + None, + Some(arrow_array::types::IntervalMonthDayNanoType::make_value( + 7, 8, 9, + )), + ]), + Array::MonthDayNanoInterval(PrimitiveArray { + validity: Some(vec![0b_101]), + values: vec![ + MonthDayNanoInterval { + months: 1, + days: 2, + nanoseconds: 3, + }, + MonthDayNanoInterval { + months: 0, + days: 0, + nanoseconds: 0, + }, + MonthDayNanoInterval { + months: 7, + days: 8, + nanoseconds: 9, + }, + ], + }), + ) + } +} + mod timestamp_second { use super::*; diff --git a/test_with_arrow/src/tests/intervals.rs b/test_with_arrow/src/tests/intervals.rs new file mode 100644 index 0000000..64e8e27 --- /dev/null +++ b/test_with_arrow/src/tests/intervals.rs @@ -0,0 +1,45 @@ +// check the layout of the interval types + +#[test] +fn interval_layout_day_time() { + assert_eq!( + std::mem::size_of::(), + std::mem::size_of::(), + ); + assert_eq!( + std::mem::align_of::(), + std::mem::align_of::(), + ); + assert_eq!( + std::mem::offset_of!(arrow_array::types::IntervalDayTime, days), + std::mem::offset_of!(marrow::types::DayTimeInterval, days), + ); + assert_eq!( + std::mem::offset_of!(arrow_array::types::IntervalDayTime, milliseconds), + std::mem::offset_of!(marrow::types::DayTimeInterval, milliseconds), + ); +} + +#[test] +fn interval_layout_month_day_nano() { + assert_eq!( + std::mem::size_of::(), + std::mem::size_of::(), + ); + assert_eq!( + std::mem::align_of::(), + std::mem::align_of::(), + ); + assert_eq!( + std::mem::offset_of!(arrow_array::types::IntervalMonthDayNano, months), + std::mem::offset_of!(marrow::types::MonthDayNanoInterval, months), + ); + assert_eq!( + std::mem::offset_of!(arrow_array::types::IntervalMonthDayNano, days), + std::mem::offset_of!(marrow::types::MonthDayNanoInterval, days), + ); + assert_eq!( + std::mem::offset_of!(arrow_array::types::IntervalMonthDayNano, nanoseconds), + std::mem::offset_of!(marrow::types::MonthDayNanoInterval, nanoseconds), + ); +} From 95decb12f2d2d86543bbabea5c5e499e35b8c910 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sun, 13 Oct 2024 14:00:44 +0200 Subject: [PATCH 3/9] Include system info in workflow --- .github/workflows/release.yml | 4 ++++ .github/workflows/test.yml | 4 ++++ x.py | 7 +++---- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 83365c1..ff205f8 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -20,6 +20,10 @@ { "uses": "actions/checkout@v4" }, + { + "name": "system", + "run": "uname - a" + }, { "name": "rustc", "run": "rustc --version" diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0b28fad..4e76b61 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -26,6 +26,10 @@ { "uses": "actions/checkout@v4" }, + { + "name": "system", + "run": "uname - a" + }, { "name": "rustc", "run": "rustc --version" diff --git a/x.py b/x.py index 8b7569d..9541bc6 100644 --- a/x.py +++ b/x.py @@ -50,8 +50,6 @@ "runs-on": "ubuntu-latest", "steps": [ {"uses": "actions/checkout@v4"}, - {"name": "rustc", "run": "rustc --version"}, - {"name": "cargo", "run": "cargo --version"}, *_workflow_check_steps(), ], } @@ -72,8 +70,6 @@ }, "steps": [ {"uses": "actions/checkout@v4"}, - {"name": "rustc", "run": "rustc --version"}, - {"name": "cargo", "run": "cargo --version"}, *_workflow_check_steps(), { "name": "Publish to crates.io", @@ -119,6 +115,9 @@ def _update_json_file(path, content): def _workflow_check_steps(): return [ + {"name": "system", "run": "uname - a"}, + {"name": "rustc", "run": "rustc --version"}, + {"name": "cargo", "run": "cargo --version"}, { "name": "Check format", "run": "cargo fmt --check", From 40de057818cfb47d1a51b1b0d2f0977c10110637 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sun, 13 Oct 2024 14:01:59 +0200 Subject: [PATCH 4/9] fix typo in workflow --- .github/workflows/release.yml | 2 +- .github/workflows/test.yml | 2 +- x.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ff205f8..e8bb5cb 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -22,7 +22,7 @@ }, { "name": "system", - "run": "uname - a" + "run": "uname -a" }, { "name": "rustc", diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4e76b61..8aacf97 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -28,7 +28,7 @@ }, { "name": "system", - "run": "uname - a" + "run": "uname -a" }, { "name": "rustc", diff --git a/x.py b/x.py index 9541bc6..1d2cdeb 100644 --- a/x.py +++ b/x.py @@ -115,7 +115,7 @@ def _update_json_file(path, content): def _workflow_check_steps(): return [ - {"name": "system", "run": "uname - a"}, + {"name": "system", "run": "uname -a"}, {"name": "rustc", "run": "rustc --version"}, {"name": "cargo", "run": "cargo --version"}, { From f6da75398b76a596f0ece7e90dbfa48f2f72c90d Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sun, 13 Oct 2024 14:20:15 +0200 Subject: [PATCH 5/9] Fix clippy complex_type --- marrow/src/impl_arrow/impl_api_base.rs | 1 + marrow/src/impl_arrow2/impl.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/marrow/src/impl_arrow/impl_api_base.rs b/marrow/src/impl_arrow/impl_api_base.rs index 221e7a9..b6fd240 100644 --- a/marrow/src/impl_arrow/impl_api_base.rs +++ b/marrow/src/impl_arrow/impl_api_base.rs @@ -547,6 +547,7 @@ fn build_array_data(value: Array) -> Result { } } +#[allow(clippy::type_complexity)] fn union_fields_into_fields_and_data( union_fields: Vec<(i8, FieldMeta, Array)>, ) -> Result<( diff --git a/marrow/src/impl_arrow2/impl.rs b/marrow/src/impl_arrow2/impl.rs index d8e701d..89eeeb8 100644 --- a/marrow/src/impl_arrow2/impl.rs +++ b/marrow/src/impl_arrow2/impl.rs @@ -523,6 +523,7 @@ impl TryFrom for Box { } } +#[allow(clippy::type_complexity)] fn convert_union_fields( union_fields: Vec<(i8, FieldMeta, Array)>, ) -> Result<( From 8ca66e9100c8ae737f180b91e886185cc059c237 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sun, 13 Oct 2024 14:20:38 +0200 Subject: [PATCH 6/9] Test interval arrays only for `arrow>=52` --- test_with_arrow/src/tests/arrays.rs | 134 ----------------------- test_with_arrow/src/tests/intervals.rs | 142 +++++++++++++++++++++++++ 2 files changed, 142 insertions(+), 134 deletions(-) diff --git a/test_with_arrow/src/tests/arrays.rs b/test_with_arrow/src/tests/arrays.rs index ccb6479..570ef2e 100644 --- a/test_with_arrow/src/tests/arrays.rs +++ b/test_with_arrow/src/tests/arrays.rs @@ -778,140 +778,6 @@ mod interval_year_month { } } -mod internval_day_time { - use super::*; - - use marrow::types::DayTimeInterval; - - #[test] - fn not_nullable() -> PanicOnError<()> { - assert_arrays_eq( - as_array_ref::(vec![ - arrow_array::types::IntervalDayTimeType::make_value(1, 2), - arrow_array::types::IntervalDayTimeType::make_value(3, 4), - arrow_array::types::IntervalDayTimeType::make_value(5, 6), - ]), - Array::DayTimeInterval(PrimitiveArray { - validity: None, - values: vec![ - DayTimeInterval { - days: 1, - milliseconds: 2, - }, - DayTimeInterval { - days: 3, - milliseconds: 4, - }, - DayTimeInterval { - days: 5, - milliseconds: 6, - }, - ], - }), - ) - } - - #[test] - fn nullable() -> PanicOnError<()> { - assert_arrays_eq( - as_array_ref::(vec![ - Some(arrow_array::types::IntervalDayTimeType::make_value(1, 2)), - None, - Some(arrow_array::types::IntervalDayTimeType::make_value(5, 6)), - ]), - Array::DayTimeInterval(PrimitiveArray { - validity: Some(vec![0b_101]), - values: vec![ - DayTimeInterval { - days: 1, - milliseconds: 2, - }, - DayTimeInterval { - days: 0, - milliseconds: 0, - }, - DayTimeInterval { - days: 5, - milliseconds: 6, - }, - ], - }), - ) - } -} - -mod interval_month_day_nano { - use super::*; - - use marrow::types::MonthDayNanoInterval; - - #[test] - fn not_nullable() -> PanicOnError<()> { - assert_arrays_eq( - as_array_ref::(vec![ - arrow_array::types::IntervalMonthDayNanoType::make_value(1, 2, 3), - arrow_array::types::IntervalMonthDayNanoType::make_value(4, 5, 6), - arrow_array::types::IntervalMonthDayNanoType::make_value(7, 8, 9), - ]), - Array::MonthDayNanoInterval(PrimitiveArray { - validity: None, - values: vec![ - MonthDayNanoInterval { - months: 1, - days: 2, - nanoseconds: 3, - }, - MonthDayNanoInterval { - months: 4, - days: 5, - nanoseconds: 6, - }, - MonthDayNanoInterval { - months: 7, - days: 8, - nanoseconds: 9, - }, - ], - }), - ) - } - - #[test] - fn nullable() -> PanicOnError<()> { - assert_arrays_eq( - as_array_ref::(vec![ - Some(arrow_array::types::IntervalMonthDayNanoType::make_value( - 1, 2, 3, - )), - None, - Some(arrow_array::types::IntervalMonthDayNanoType::make_value( - 7, 8, 9, - )), - ]), - Array::MonthDayNanoInterval(PrimitiveArray { - validity: Some(vec![0b_101]), - values: vec![ - MonthDayNanoInterval { - months: 1, - days: 2, - nanoseconds: 3, - }, - MonthDayNanoInterval { - months: 0, - days: 0, - nanoseconds: 0, - }, - MonthDayNanoInterval { - months: 7, - days: 8, - nanoseconds: 9, - }, - ], - }), - ) - } -} - mod timestamp_second { use super::*; diff --git a/test_with_arrow/src/tests/intervals.rs b/test_with_arrow/src/tests/intervals.rs index 64e8e27..eb6999e 100644 --- a/test_with_arrow/src/tests/intervals.rs +++ b/test_with_arrow/src/tests/intervals.rs @@ -43,3 +43,145 @@ fn interval_layout_month_day_nano() { std::mem::offset_of!(marrow::types::MonthDayNanoInterval, nanoseconds), ); } + +// NOTE: the arrow impl of `make_value` is incorrect before `arrow=52`, only test for `arrow>=52` +mod internval_day_time { + use super::super::utils::{as_array_ref, assert_arrays_eq, PanicOnError}; + use super::arrow_array; + use marrow::{ + array::{Array, PrimitiveArray}, + types::DayTimeInterval, + }; + + #[test] + fn not_nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![ + arrow_array::types::IntervalDayTimeType::make_value(1, 2), + arrow_array::types::IntervalDayTimeType::make_value(3, 4), + arrow_array::types::IntervalDayTimeType::make_value(5, 6), + ]), + Array::DayTimeInterval(PrimitiveArray { + validity: None, + values: vec![ + DayTimeInterval { + days: 1, + milliseconds: 2, + }, + DayTimeInterval { + days: 3, + milliseconds: 4, + }, + DayTimeInterval { + days: 5, + milliseconds: 6, + }, + ], + }), + ) + } + + #[test] + fn nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![ + Some(arrow_array::types::IntervalDayTimeType::make_value(1, 2)), + None, + Some(arrow_array::types::IntervalDayTimeType::make_value(5, 6)), + ]), + Array::DayTimeInterval(PrimitiveArray { + validity: Some(vec![0b_101]), + values: vec![ + DayTimeInterval { + days: 1, + milliseconds: 2, + }, + DayTimeInterval { + days: 0, + milliseconds: 0, + }, + DayTimeInterval { + days: 5, + milliseconds: 6, + }, + ], + }), + ) + } +} + +// NOTE: the arrow impl of `make_value` is incorrect before `arrow=52`, only test for `arrow>=52` +mod interval_month_day_nano { + use super::super::utils::{as_array_ref, assert_arrays_eq, PanicOnError}; + use super::arrow_array; + use marrow::{ + array::{Array, PrimitiveArray}, + types::MonthDayNanoInterval, + }; + + #[test] + fn not_nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![ + arrow_array::types::IntervalMonthDayNanoType::make_value(1, 2, 3), + arrow_array::types::IntervalMonthDayNanoType::make_value(4, 5, 6), + arrow_array::types::IntervalMonthDayNanoType::make_value(7, 8, 9), + ]), + Array::MonthDayNanoInterval(PrimitiveArray { + validity: None, + values: vec![ + MonthDayNanoInterval { + months: 1, + days: 2, + nanoseconds: 3, + }, + MonthDayNanoInterval { + months: 4, + days: 5, + nanoseconds: 6, + }, + MonthDayNanoInterval { + months: 7, + days: 8, + nanoseconds: 9, + }, + ], + }), + ) + } + + #[test] + fn nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![ + Some(arrow_array::types::IntervalMonthDayNanoType::make_value( + 1, 2, 3, + )), + None, + Some(arrow_array::types::IntervalMonthDayNanoType::make_value( + 7, 8, 9, + )), + ]), + Array::MonthDayNanoInterval(PrimitiveArray { + validity: Some(vec![0b_101]), + values: vec![ + MonthDayNanoInterval { + months: 1, + days: 2, + nanoseconds: 3, + }, + MonthDayNanoInterval { + months: 0, + days: 0, + nanoseconds: 0, + }, + MonthDayNanoInterval { + months: 7, + days: 8, + nanoseconds: 9, + }, + ], + }), + ) + } +} From c0c68a993ac310d436beaac085320a0fecd7472c Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Mon, 14 Oct 2024 19:24:52 +0200 Subject: [PATCH 7/9] Add `Array::data_type`, Add `RunEndEncoded`, Rework `Union`, Dictionary` --- Changes.md | 10 +- marrow/src/array.rs | 163 ++++++++++++++---- marrow/src/datatypes.rs | 59 ++++++- marrow/src/impl_arrow/impl_api_base.rs | 198 +++++++++++++++++----- marrow/src/impl_arrow2/impl.rs | 108 ++++++------ marrow/src/view.rs | 139 ++++++++++++--- test_with_arrow/src/tests/arrays.rs | 92 ++++++++-- test_with_arrow/src/tests/union_arrays.rs | 17 +- test_with_arrow/src/tests/utils.rs | 26 ++- 9 files changed, 625 insertions(+), 187 deletions(-) diff --git a/Changes.md b/Changes.md index 7b3968d..4aff016 100644 --- a/Changes.md +++ b/Changes.md @@ -4,12 +4,18 @@ - Rework map arrays to use explicit keys and values array to simplify interaction the underlying arrays -- Rework `StructArray` and `DenseUnionArray`: place metadata in front of arrays in - `StructArray::fields`, `DenseUnionArray::fields` +- Implement sparse unions, rename `DenseUnion` to `Union` and change offsets to be `Option>` +- Implement interval arrays and the `Interval` data type +- Implement run encoded array +- Rename `Dictionary::indices` to `Dictionary::keys` +- Rework `StructArray` and `UnionArray`: place metadata in front of arrays in `StructArray::fields`, + `UnionArray::fields` - Add `MarrowError::new` and `MarrowError::with_cause` - Add `as_view` for `Array` and the array structs - Implement `PartialEq` for `Array` and `View`, and `FieldMeta` - Implement `Default` for `Field` and `FieldMeta` +- Remove the sorted flag from the dictionary `DataType` it is not supported by `arrow` +- Add `Array::data_type()` and `View::data_type()` ## 0.1.0 diff --git a/marrow/src/array.rs b/marrow/src/array.rs index 2503f51..fe2749d 100644 --- a/marrow/src/array.rs +++ b/marrow/src/array.rs @@ -2,13 +2,16 @@ use half::f16; use crate::{ - datatypes::{FieldMeta, MapMeta, TimeUnit}, + datatypes::{ + field_from_meta, DataType, Field, FieldMeta, IntervalUnit, MapMeta, RunEndEncodedMeta, + TimeUnit, UnionMode, + }, error::{fail, ErrorKind, Result}, types::{DayTimeInterval, MonthDayNanoInterval}, view::{ - BitsWithOffset, BooleanView, BytesView, DecimalView, DenseUnionView, DictionaryView, - FixedSizeBinaryView, FixedSizeListView, ListView, MapView, NullView, PrimitiveView, - SparseUnionView, StructView, TimeView, TimestampView, View, + BitsWithOffset, BooleanView, BytesView, DecimalView, DictionaryView, FixedSizeBinaryView, + FixedSizeListView, ListView, MapView, NullView, PrimitiveView, RunEndEncodedView, + StructView, TimeView, TimestampView, UnionView, View, }, }; @@ -94,15 +97,107 @@ pub enum Array { FixedSizeList(FixedSizeListArray), /// An array of dictionaries Dictionary(DictionaryArray), + /// An array of run end encoded values + RunEndEncoded(RunEndEncodedArray), /// An array of maps Map(MapArray), - /// An array of unions with compact memory layout - DenseUnion(DenseUnionArray), /// An array of unions - SparseUnion(SparseUnionArray), + Union(UnionArray), } impl Array { + /// Get the data type of this array + pub fn data_type(&self) -> DataType { + use DataType as T; + match self { + Self::Null(_) => T::Null, + Self::Boolean(_) => T::Boolean, + Self::Int8(_) => T::Int8, + Self::Int16(_) => T::Int16, + Self::Int32(_) => T::Int32, + Self::Int64(_) => T::Int64, + Self::UInt8(_) => T::UInt8, + Self::UInt16(_) => T::UInt16, + Self::UInt32(_) => T::UInt32, + Self::UInt64(_) => T::UInt64, + Self::Float16(_) => T::Float16, + Self::Float32(_) => T::Float32, + Self::Float64(_) => T::Float64, + Self::Decimal128(arr) => T::Decimal128(arr.precision, arr.scale), + Self::Date32(_) => T::Date32, + Self::Date64(_) => T::Date64, + Self::Time32(arr) => T::Time32(arr.unit), + Self::Time64(arr) => T::Time64(arr.unit), + Self::Timestamp(arr) => T::Timestamp(arr.unit, arr.timezone.clone()), + Self::Duration(arr) => T::Duration(arr.unit), + Self::DayTimeInterval(_) => T::Interval(IntervalUnit::DayTime), + Self::YearMonthInterval(_) => T::Interval(IntervalUnit::YearMonth), + Self::MonthDayNanoInterval(_) => T::Interval(IntervalUnit::MonthDayNano), + Self::Binary(_) => T::Binary, + Self::LargeBinary(_) => T::LargeBinary, + Self::FixedSizeBinary(arr) => T::FixedSizeBinary(arr.n), + Self::Utf8(_) => T::Utf8, + Self::LargeUtf8(_) => T::LargeUtf8, + Self::Dictionary(arr) => T::Dictionary( + Box::new(arr.keys.data_type()), + Box::new(arr.values.data_type()), + ), + Self::List(arr) => T::List(Box::new(field_from_meta( + arr.elements.data_type(), + arr.meta.clone(), + ))), + Self::LargeList(arr) => T::LargeList(Box::new(field_from_meta( + arr.elements.data_type(), + arr.meta.clone(), + ))), + Self::FixedSizeList(arr) => T::FixedSizeList( + Box::new(field_from_meta(arr.elements.data_type(), arr.meta.clone())), + arr.n, + ), + Self::Struct(arr) => T::Struct( + arr.fields + .iter() + .map(|(meta, field)| field_from_meta(field.data_type(), meta.clone())) + .collect(), + ), + Self::Union(arr) => T::Union( + arr.fields + .iter() + .map(|(type_id, meta, field)| { + (*type_id, field_from_meta(field.data_type(), meta.clone())) + }) + .collect(), + match arr.offsets { + Some(_) => UnionMode::Dense, + None => UnionMode::Sparse, + }, + ), + Self::Map(arr) => T::Map( + Box::new(Field { + name: arr.meta.entries_name.clone(), + data_type: DataType::Struct(vec![ + field_from_meta(arr.keys.data_type(), arr.meta.keys.clone()), + field_from_meta(arr.values.data_type(), arr.meta.values.clone()), + ]), + ..Field::default() + }), + arr.meta.sorted, + ), + Self::RunEndEncoded(arr) => T::RunEndEncoded( + Box::new(Field { + name: arr.meta.run_ends_name.clone(), + data_type: arr.run_ends.data_type(), + nullable: false, + metadata: Default::default(), + }), + Box::new(field_from_meta( + arr.values.data_type(), + arr.meta.values.clone(), + )), + ), + } + } + /// Get the view for this array pub fn as_view(&self) -> View<'_> { match self { @@ -140,8 +235,8 @@ impl Array { Self::Struct(array) => View::Struct(array.as_view()), Self::Map(array) => View::Map(array.as_view()), Self::Dictionary(array) => View::Dictionary(array.as_view()), - Self::DenseUnion(array) => View::DenseUnion(array.as_view()), - Self::SparseUnion(array) => View::SparseUnion(array.as_view()), + Self::RunEndEncoded(array) => View::RunEndEncoded(array.as_view()), + Self::Union(array) => View::Union(array.as_view()), } } } @@ -502,7 +597,7 @@ impl DecimalArray { #[derive(Clone, Debug, PartialEq)] pub struct DictionaryArray { /// The indices into the values array for each element - pub indices: Box, + pub keys: Box, /// The possible values of elements pub values: Box, } @@ -511,34 +606,34 @@ impl DictionaryArray { /// Get the view for this array pub fn as_view(&self) -> DictionaryView<'_> { DictionaryView { - indices: Box::new(self.indices.as_view()), + keys: Box::new(self.keys.as_view()), values: Box::new(self.values.as_view()), } } } -/// A union of different data types with a compact representation +/// A union of different data types /// /// This corresponds roughly to Rust's enums. Each element has a type, which indicates the /// underlying array to use. For fast lookups the offsets into the underlying arrays are stored as /// well. For element `ì`, the value can be looked up by the pseudo code /// `fields[types[i]].1[offsets[i]]`. #[derive(Clone, Debug, PartialEq)] -pub struct DenseUnionArray { +pub struct UnionArray { /// The type of each element pub types: Vec, /// The offset into the underlying arrays - pub offsets: Vec, + pub offsets: Option>, /// The arrays with their metadata pub fields: Vec<(i8, FieldMeta, Array)>, } -impl DenseUnionArray { +impl UnionArray { /// Get the view for this array - pub fn as_view(&self) -> DenseUnionView<'_> { - DenseUnionView { + pub fn as_view(&self) -> UnionView<'_> { + UnionView { types: &self.types, - offsets: &self.offsets, + offsets: self.offsets.as_deref(), fields: self .fields .iter() @@ -548,26 +643,24 @@ impl DenseUnionArray { } } -/// A union of different data types with a less compact representation -/// -#[derive(Debug, Clone, PartialEq)] -pub struct SparseUnionArray { - /// The types of each element - pub types: Vec, - /// The arrays with their metadata - pub fields: Vec<(i8, FieldMeta, Array)>, +/// An array with runs of deduplicated values +#[derive(Clone, Debug, PartialEq)] +pub struct RunEndEncodedArray { + /// The metadata for the arrays + pub meta: RunEndEncodedMeta, + /// The run ends for each value + pub run_ends: Box, + /// The possible values of elements + pub values: Box, } -impl SparseUnionArray { +impl RunEndEncodedArray { /// Get the view for this array - pub fn as_view(&self) -> SparseUnionView<'_> { - SparseUnionView { - types: &self.types, - fields: self - .fields - .iter() - .map(|(type_id, meta, array)| (*type_id, meta.clone(), array.as_view())) - .collect(), + pub fn as_view(&self) -> RunEndEncodedView<'_> { + RunEndEncodedView { + meta: self.meta.clone(), + run_ends: Box::new(self.run_ends.as_view()), + values: Box::new(self.values.as_view()), } } } diff --git a/marrow/src/datatypes.rs b/marrow/src/datatypes.rs index 1d4ca33..bfe2957 100644 --- a/marrow/src/datatypes.rs +++ b/marrow/src/datatypes.rs @@ -9,11 +9,13 @@ const _: () = { impl AssertExpectedTraits for DataType {} }; -// assert that the `Field` and `FieldMeta` implement the expected traits +// assert that the `Field`, `FieldMeta`, etc. implement the expected traits const _: () = { trait AssertExpectedTraits: Clone + std::fmt::Debug + Default + PartialEq + Send + Sync {} impl AssertExpectedTraits for Field {} impl AssertExpectedTraits for FieldMeta {} + impl AssertExpectedTraits for MapMeta {} + impl AssertExpectedTraits for RunEndEncodedMeta {} }; /// The data type and metadata of a field @@ -61,6 +63,15 @@ pub(crate) fn meta_from_field(field: Field) -> FieldMeta { } } +pub(crate) fn field_from_meta(data_type: DataType, meta: FieldMeta) -> Field { + Field { + data_type, + name: meta.name, + nullable: meta.nullable, + metadata: meta.metadata, + } +} + /// Metadata for map arrays /// /// ```rust @@ -116,6 +127,44 @@ impl std::default::Default for MapMeta { } } +/// Metadata for run end encoded arrays +/// +/// ```rust +/// # use marrow::datatypes::{FieldMeta, RunEndEncodedMeta}; +/// assert_eq!( +/// RunEndEncodedMeta::default(), +/// RunEndEncodedMeta { +/// run_ends_name: String::from("run_ends"), +/// values: FieldMeta { +/// name: String::from("values"), +/// nullable: true, +/// ..FieldMeta::default() +/// }, +/// }, +/// ); +/// ``` +/// +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct RunEndEncodedMeta { + /// The name for the run ends (defaults to `"run_ends"`) + pub run_ends_name: String, + /// The metadata for the values array (defaults to a nullable with with name `"values"`) + pub values: FieldMeta, +} + +impl std::default::Default for RunEndEncodedMeta { + fn default() -> Self { + RunEndEncodedMeta { + run_ends_name: String::from("run_ends"), + values: FieldMeta { + name: String::from("values"), + nullable: true, + metadata: HashMap::new(), + }, + } + } +} + /// Data types of array /// #[cfg_attr( @@ -192,7 +241,13 @@ pub enum DataType { /// Maps Map(Box, bool), /// Deduplicated values - Dictionary(Box, Box, bool), + /// + /// The children are `Dictionary(indices, values)` + Dictionary(Box, Box), + /// Deduplicated values that continuously repeat + /// + /// The children are `RunEndEncoded(indices, values)` + RunEndEncoded(Box, Box), /// Union o different types Union(Vec<(i8, Field)>, UnionMode), } diff --git a/marrow/src/impl_arrow/impl_api_base.rs b/marrow/src/impl_arrow/impl_api_base.rs index b6fd240..ca2a42a 100644 --- a/marrow/src/impl_arrow/impl_api_base.rs +++ b/marrow/src/impl_arrow/impl_api_base.rs @@ -5,12 +5,15 @@ use half::f16; use crate::{ array::Array, - datatypes::{meta_from_field, DataType, Field, FieldMeta, IntervalUnit, TimeUnit, UnionMode}, + datatypes::{ + meta_from_field, DataType, Field, FieldMeta, IntervalUnit, RunEndEncodedMeta, TimeUnit, + UnionMode, + }, error::{fail, ErrorKind, MarrowError, Result}, view::{ - BitsWithOffset, BooleanView, BytesView, DecimalView, DenseUnionView, DictionaryView, - FixedSizeListView, ListView, MapView, NullView, PrimitiveView, SparseUnionView, StructView, - TimeView, TimestampView, View, + BitsWithOffset, BooleanView, BytesView, DecimalView, DictionaryView, FixedSizeListView, + ListView, MapView, NullView, PrimitiveView, RunEndEncodedView, StructView, TimeView, + TimestampView, UnionView, View, }, }; @@ -74,7 +77,6 @@ impl TryFrom<&arrow_schema::DataType> for DataType { AT::Dictionary(key, value) => Ok(T::Dictionary( T::try_from(key.as_ref())?.into(), T::try_from(value.as_ref())?.into(), - false, )), AT::Union(in_fields, mode) => { let mut fields = Vec::new(); @@ -83,6 +85,10 @@ impl TryFrom<&arrow_schema::DataType> for DataType { } Ok(T::Union(fields, (*mode).try_into()?)) } + AT::RunEndEncoded(keys, values) => Ok(T::RunEndEncoded( + Box::new(keys.as_ref().try_into()?), + Box::new(values.as_ref().try_into()?), + )), data_type => fail!( ErrorKind::Unsupported, "Unsupported arrow data type {data_type}" @@ -154,10 +160,14 @@ impl TryFrom<&DataType> for arrow_schema::DataType { } Ok(AT::Struct(fields.into())) } - T::Dictionary(key, value, _sorted) => Ok(AT::Dictionary( + T::Dictionary(key, value) => Ok(AT::Dictionary( AT::try_from(key.as_ref())?.into(), AT::try_from(value.as_ref())?.into(), )), + T::RunEndEncoded(indices, values) => Ok(AT::RunEndEncoded( + AF::try_from(indices.as_ref())?.into(), + AF::try_from(values.as_ref())?.into(), + )), T::Union(in_fields, mode) => { let mut fields = Vec::new(); for (type_id, field) in in_fields { @@ -480,19 +490,44 @@ fn build_array_data(value: Array) -> Result { )?) } A::Dictionary(arr) => { - let indices = build_array_data(*arr.indices)?; + let keys = build_array_data(*arr.keys)?; let values = build_array_data(*arr.values)?; let data_type = arrow_schema::DataType::Dictionary( - Box::new(indices.data_type().clone()), + Box::new(keys.data_type().clone()), Box::new(values.data_type().clone()), ); - Ok(indices + Ok(keys .into_builder() .data_type(data_type) .child_data(vec![values]) .build()?) } + A::RunEndEncoded(arr) => { + let len = get_ree_len_from_indices(&arr.run_ends)?; + let run_ends = build_array_data(*arr.run_ends)?; + let values = build_array_data(*arr.values)?; + let data_type = arrow_schema::DataType::RunEndEncoded( + field_from_data_and_meta( + &run_ends, + FieldMeta { + name: arr.meta.run_ends_name, + ..FieldMeta::default() + }, + ) + .into(), + field_from_data_and_meta(&values, arr.meta.values).into(), + ); + + Ok(arrow_data::ArrayData::try_new( + data_type, + len, + None, + 0, + vec![], + vec![run_ends, values], + )?) + } A::Map(arr) => { let (entries, entries_name, sorted, validity, offsets) = arr.into_logical_array()?; let entries = build_array_data(entries)?; @@ -513,40 +548,50 @@ fn build_array_data(value: Array) -> Result { vec![entries], )?) } - A::DenseUnion(arr) => { - let (fields, child_data) = union_fields_into_fields_and_data(arr.fields)?; - Ok(arrow_data::ArrayData::try_new( - arrow_schema::DataType::Union( - fields.into_iter().collect(), - arrow_schema::UnionMode::Dense, - ), - arr.types.len(), - None, - 0, - vec![ - arrow_buffer::ScalarBuffer::from(arr.types).into_inner(), - arrow_buffer::ScalarBuffer::from(arr.offsets).into_inner(), - ], - child_data, - )?) - } - A::SparseUnion(arr) => { + A::Union(arr) => { let (fields, child_data) = union_fields_into_fields_and_data(arr.fields)?; + let len = arr.types.len(); + let mut buffers = vec![arrow_buffer::ScalarBuffer::from(arr.types).into_inner()]; + let mode; + + if let Some(offsets) = arr.offsets { + buffers.push(arrow_buffer::ScalarBuffer::from(offsets).into_inner()); + mode = arrow_schema::UnionMode::Dense; + } else { + mode = arrow_schema::UnionMode::Sparse; + } + Ok(arrow_data::ArrayData::try_new( - arrow_schema::DataType::Union( - fields.into_iter().collect(), - arrow_schema::UnionMode::Sparse, - ), - arr.types.len(), + arrow_schema::DataType::Union(fields.into_iter().collect(), mode), + len, None, 0, - vec![arrow_buffer::ScalarBuffer::from(arr.types).into_inner()], + buffers, child_data, )?) } } } +fn get_ree_len_from_indices(indices: &Array) -> Result { + let cand = match indices { + Array::Int16(array) => array.values.last().copied().map(usize::try_from), + Array::Int32(array) => array.values.last().copied().map(usize::try_from), + Array::Int64(array) => array.values.last().copied().map(usize::try_from), + // TODO: include data type + _ => fail!( + ErrorKind::Unsupported, + "unsupported run ends in RunEndEncoded" + ), + }; + + match cand { + Some(Ok(len)) => Ok(len), + Some(Err(err)) => Err(err.into()), + None => Ok(0), + } +} + #[allow(clippy::type_complexity)] fn union_fields_into_fields_and_data( union_fields: Vec<(i8, FieldMeta, Array)>, @@ -891,6 +936,30 @@ impl<'a> TryFrom<&'a dyn arrow_array::Array> for View<'a> { wrap_dictionary_array::(array) } else if let Some(array) = any.downcast_ref::() { wrap_dictionary_array::(array) + } else if let Some(run_array) = + any.downcast_ref::>() + { + wrap_ree_array::( + View::Int16, + array.data_type(), + run_array, + ) + } else if let Some(run_array) = + any.downcast_ref::>() + { + wrap_ree_array::( + View::Int32, + array.data_type(), + run_array, + ) + } else if let Some(run_array) = + any.downcast_ref::>() + { + wrap_ree_array::( + View::Int64, + array.data_type(), + run_array, + ) } else if let Some(array) = any.downcast_ref::() { use arrow_array::Array; @@ -905,7 +974,7 @@ impl<'a> TryFrom<&'a dyn arrow_array::Array> for View<'a> { fields.push((type_id, meta, view)); } - match mode { + let offsets = match mode { arrow_schema::UnionMode::Dense => { let Some(offsets) = array.offsets() else { fail!( @@ -913,12 +982,7 @@ impl<'a> TryFrom<&'a dyn arrow_array::Array> for View<'a> { "Dense unions must have an offset array" ); }; - - Ok(View::DenseUnion(DenseUnionView { - types: array.type_ids(), - offsets, - fields, - })) + Some(offsets as &[i32]) } arrow_schema::UnionMode::Sparse => { if array.offsets().is_some() { @@ -927,13 +991,14 @@ impl<'a> TryFrom<&'a dyn arrow_array::Array> for View<'a> { "Sparse unions must not have an offset array" ); }; - - Ok(View::SparseUnion(SparseUnionView { - types: array.type_ids(), - fields, - })) + None } - } + }; + Ok(View::Union(UnionView { + types: array.type_ids(), + offsets, + fields, + })) } else { fail!( ErrorKind::Unsupported, @@ -1016,11 +1081,50 @@ fn wrap_dictionary_array( let keys: &dyn arrow_array::Array = array.keys(); Ok(View::Dictionary(DictionaryView { - indices: Box::new(keys.try_into()?), + keys: Box::new(keys.try_into()?), values: Box::new(array.values().as_ref().try_into()?), })) } +fn wrap_ree_array<'a, T, F>( + wrap: F, + dt: &arrow_schema::DataType, + array: &'a arrow_array::RunArray, +) -> Result> +where + T: arrow_array::types::RunEndIndexType, + F: FnOnce(PrimitiveView<'a, ::Native>) -> View<'a>, +{ + let arrow_schema::DataType::RunEndEncoded(run_ends_field, values_field) = dt else { + fail!( + ErrorKind::Unsupported, + "Invalid data type for run end encoded array" + ); + }; + + if run_ends_field.is_nullable() { + fail!( + ErrorKind::Unsupported, + "Nullable run ends are not supported" + ); + } + + let run_ends = wrap(PrimitiveView { + validity: None, + values: array.run_ends().values(), + }); + let values = View::try_from(array.values().as_ref())?; + + Ok(View::RunEndEncoded(RunEndEncodedView { + meta: RunEndEncodedMeta { + run_ends_name: run_ends_field.name().clone(), + values: meta_from_field(values_field.as_ref().try_into()?), + }, + run_ends: Box::new(run_ends), + values: Box::new(values), + })) +} + fn get_bits_with_offset(array: &dyn arrow_array::Array) -> Option> { let validity = array.nulls()?; Some(BitsWithOffset { diff --git a/marrow/src/impl_arrow2/impl.rs b/marrow/src/impl_arrow2/impl.rs index 89eeeb8..b7ed5bd 100644 --- a/marrow/src/impl_arrow2/impl.rs +++ b/marrow/src/impl_arrow2/impl.rs @@ -5,9 +5,9 @@ use crate::{ datatypes::{meta_from_field, DataType, Field, FieldMeta, IntervalUnit, TimeUnit, UnionMode}, error::{fail, ErrorKind, MarrowError, Result}, view::{ - BitsWithOffset, BooleanView, BytesView, DecimalView, DenseUnionView, DictionaryView, - FixedSizeBinaryView, FixedSizeListView, ListView, MapView, NullView, PrimitiveView, - SparseUnionView, StructView, TimeView, TimestampView, View, + BitsWithOffset, BooleanView, BytesView, DecimalView, DictionaryView, FixedSizeBinaryView, + FixedSizeListView, ListView, MapView, NullView, PrimitiveView, StructView, TimeView, + TimestampView, UnionView, View, }, }; @@ -75,7 +75,7 @@ impl TryFrom<&arrow2::datatypes::DataType> for DataType { } Ok(T::Struct(res_fields)) } - AT::Dictionary(key, value, sorted) => { + AT::Dictionary(key, value, _) => { let key = match key { I::Int8 => T::Int8, I::Int16 => T::Int16, @@ -89,7 +89,6 @@ impl TryFrom<&arrow2::datatypes::DataType> for DataType { Ok(T::Dictionary( Box::new(key), Box::new(value.as_ref().try_into()?), - *sorted, )) } AT::Union(in_fields, in_type_ids, mode) => { @@ -176,52 +175,56 @@ impl TryFrom<&DataType> for arrow2::datatypes::DataType { T::FixedSizeBinary(n) => Ok(AT::FixedSizeBinary((*n).try_into()?)), T::Utf8 => Ok(AT::Utf8), T::LargeUtf8 => Ok(AT::LargeUtf8), - T::Dictionary(key, value, sorted) => match key.as_ref() { + T::Dictionary(key, value) => match key.as_ref() { T::Int8 => Ok(AT::Dictionary( I::Int8, AT::try_from(value.as_ref())?.into(), - *sorted, + false, )), T::Int16 => Ok(AT::Dictionary( I::Int16, AT::try_from(value.as_ref())?.into(), - *sorted, + false, )), T::Int32 => Ok(AT::Dictionary( I::Int32, AT::try_from(value.as_ref())?.into(), - *sorted, + false, )), T::Int64 => Ok(AT::Dictionary( I::Int64, AT::try_from(value.as_ref())?.into(), - *sorted, + false, )), T::UInt8 => Ok(AT::Dictionary( I::UInt8, AT::try_from(value.as_ref())?.into(), - *sorted, + false, )), T::UInt16 => Ok(AT::Dictionary( I::UInt16, AT::try_from(value.as_ref())?.into(), - *sorted, + false, )), T::UInt32 => Ok(AT::Dictionary( I::UInt32, AT::try_from(value.as_ref())?.into(), - *sorted, + false, )), T::UInt64 => Ok(AT::Dictionary( I::UInt64, AT::try_from(value.as_ref())?.into(), - *sorted, + false, )), dt => fail!( ErrorKind::Unsupported, "unsupported dictionary key type {dt:?}", ), }, + T::RunEndEncoded(_, _) => fail!( + ErrorKind::Unsupported, + "RunEndEncoded is not supported by arrow2" + ), T::List(field) => Ok(AT::List(AF::try_from(field.as_ref())?.into())), T::LargeList(field) => Ok(AT::LargeList(AF::try_from(field.as_ref())?.into())), T::FixedSizeList(field, n) => Ok(AT::FixedSizeList( @@ -403,21 +406,25 @@ impl TryFrom for Box { A::LargeBinary(arr) => { build_binary_array(AT::LargeBinary, arr.offsets, arr.data, arr.validity) } - A::Dictionary(arr) => match *arr.indices { - A::Int8(indices) => build_dictionary_array(AI::Int8, indices, *arr.values), - A::Int16(indices) => build_dictionary_array(AI::Int16, indices, *arr.values), - A::Int32(indices) => build_dictionary_array(AI::Int32, indices, *arr.values), - A::Int64(indices) => build_dictionary_array(AI::Int64, indices, *arr.values), - A::UInt8(indices) => build_dictionary_array(AI::UInt8, indices, *arr.values), - A::UInt16(indices) => build_dictionary_array(AI::UInt16, indices, *arr.values), - A::UInt32(indices) => build_dictionary_array(AI::UInt32, indices, *arr.values), - A::UInt64(indices) => build_dictionary_array(AI::UInt64, indices, *arr.values), + A::Dictionary(arr) => match *arr.keys { + A::Int8(keys) => build_dictionary_array(AI::Int8, keys, *arr.values), + A::Int16(keys) => build_dictionary_array(AI::Int16, keys, *arr.values), + A::Int32(keys) => build_dictionary_array(AI::Int32, keys, *arr.values), + A::Int64(keys) => build_dictionary_array(AI::Int64, keys, *arr.values), + A::UInt8(keys) => build_dictionary_array(AI::UInt8, keys, *arr.values), + A::UInt16(keys) => build_dictionary_array(AI::UInt16, keys, *arr.values), + A::UInt32(keys) => build_dictionary_array(AI::UInt32, keys, *arr.values), + A::UInt64(keys) => build_dictionary_array(AI::UInt64, keys, *arr.values), // TODO: improve error message by including the data type _ => fail!( ErrorKind::Unsupported, - "unsupported dictionary index array during arrow2 conversion" + "Unsupported dictionary index array during arrow2 conversion" ), }, + A::RunEndEncoded(_) => fail!( + ErrorKind::Unsupported, + "RunEndEncoded is not supported by arrow2" + ), A::List(arr) => build_list_array( AT::List, arr.offsets, @@ -472,22 +479,18 @@ impl TryFrom for Box { validity, ))) } - A::DenseUnion(arr) => { - let (type_ids, fields, values) = convert_union_fields(arr.fields)?; - Ok(Box::new(arrow2::array::UnionArray::try_new( - AT::Union(fields, Some(type_ids), arrow2::datatypes::UnionMode::Dense), - arr.types.into(), - values, - Some(arr.offsets.into()), - )?)) - } - A::SparseUnion(arr) => { + A::Union(arr) => { let (type_ids, fields, values) = convert_union_fields(arr.fields)?; + let (offsets, mode) = if let Some(offsets) = arr.offsets { + (Some(offsets.into()), arrow2::datatypes::UnionMode::Dense) + } else { + (None, arrow2::datatypes::UnionMode::Sparse) + }; Ok(Box::new(arrow2::array::UnionArray::try_new( - AT::Union(fields, Some(type_ids), arrow2::datatypes::UnionMode::Sparse), + AT::Union(fields, Some(type_ids), mode), arr.types.into(), values, - None, + offsets, )?)) } A::FixedSizeList(arr) => { @@ -620,20 +623,19 @@ fn field_from_array_and_meta( } fn build_dictionary_array( - indices_type: arrow2::datatypes::IntegerType, - indices: PrimitiveArray, + keys_type: arrow2::datatypes::IntegerType, + keys: PrimitiveArray, values: Array, ) -> Result> { let values: Box = values.try_into()?; - let validity = indices + let validity = keys .validity - .map(|v| arrow2::bitmap::Bitmap::from_u8_vec(v, indices.values.len())); - let keys = - arrow2::array::PrimitiveArray::new(indices_type.into(), indices.values.into(), validity); + .map(|v| arrow2::bitmap::Bitmap::from_u8_vec(v, keys.values.len())); + let keys = arrow2::array::PrimitiveArray::new(keys_type.into(), keys.values.into(), validity); Ok(Box::new(arrow2::array::DictionaryArray::try_new( arrow2::datatypes::DataType::Dictionary( - indices_type, + keys_type, Box::new(values.data_type().clone()), false, ), @@ -877,7 +879,7 @@ impl<'a> TryFrom<&'a dyn arrow2::array::Array> for View<'a> { )); } - match mode { + let offsets = match mode { arrow2::datatypes::UnionMode::Dense => { let Some(offsets) = array.offsets() else { fail!( @@ -885,12 +887,7 @@ impl<'a> TryFrom<&'a dyn arrow2::array::Array> for View<'a> { "DenseUnion array without offsets are not supported" ); }; - - Ok(V::DenseUnion(DenseUnionView { - types, - offsets: offsets.as_slice(), - fields, - })) + Some(offsets.as_slice()) } arrow2::datatypes::UnionMode::Sparse => { if array.offsets().is_some() { @@ -899,9 +896,14 @@ impl<'a> TryFrom<&'a dyn arrow2::array::Array> for View<'a> { "SparseUnion array with offsets are not supported" ); }; - Ok(V::SparseUnion(SparseUnionView { types, fields })) + None } - } + }; + Ok(V::Union(UnionView { + types, + offsets, + fields, + })) } else if let Some(array) = any.downcast_ref::() { let AT::FixedSizeList(field, _) = array.data_type() else { fail!( @@ -963,7 +965,7 @@ fn view_dictionary_array< array: &'a arrow2::array::DictionaryArray, ) -> Result> { Ok(DictionaryView { - indices: Box::new(index_type(view_primitive_array(array.keys()))), + keys: Box::new(index_type(view_primitive_array(array.keys()))), values: Box::new(array.values().as_ref().try_into()?), }) } diff --git a/marrow/src/view.rs b/marrow/src/view.rs index 5a01cc0..6597913 100644 --- a/marrow/src/view.rs +++ b/marrow/src/view.rs @@ -4,7 +4,10 @@ use half::f16; use crate::{ - datatypes::{FieldMeta, MapMeta, TimeUnit}, + datatypes::{ + field_from_meta, DataType, Field, FieldMeta, IntervalUnit, MapMeta, RunEndEncodedMeta, + TimeUnit, UnionMode, + }, error::{fail, ErrorKind, Result}, types::{DayTimeInterval, MonthDayNanoInterval}, }; @@ -87,12 +90,106 @@ pub enum View<'a> { FixedSizeList(FixedSizeListView<'a>), /// See [`Array::Dictionary`][crate::array::Array::Dictionary] Dictionary(DictionaryView<'a>), + /// See [`Array::RunEndEncoded`][crate::array::Array::RunEndEncoded] + RunEndEncoded(RunEndEncodedView<'a>), /// See [`Array::Map`][crate::array::Array::Map] Map(MapView<'a>), - /// See [`Array::DenseUnion`][crate::array::Array::DenseUnion] - DenseUnion(DenseUnionView<'a>), - /// See [`Array::SparseUnion`][crate::array::Array::SparseUnion] - SparseUnion(SparseUnionView<'a>), + /// See [`Array::Union`][crate::array::Array::Union] + Union(UnionView<'a>), +} + +impl<'a> View<'a> { + /// Get the data type of this array + pub fn data_type(&self) -> DataType { + use DataType as T; + match self { + Self::Null(_) => T::Null, + Self::Boolean(_) => T::Boolean, + Self::Int8(_) => T::Int8, + Self::Int16(_) => T::Int16, + Self::Int32(_) => T::Int32, + Self::Int64(_) => T::Int64, + Self::UInt8(_) => T::UInt8, + Self::UInt16(_) => T::UInt16, + Self::UInt32(_) => T::UInt32, + Self::UInt64(_) => T::UInt64, + Self::Float16(_) => T::Float16, + Self::Float32(_) => T::Float32, + Self::Float64(_) => T::Float64, + Self::Decimal128(arr) => T::Decimal128(arr.precision, arr.scale), + Self::Date32(_) => T::Date32, + Self::Date64(_) => T::Date64, + Self::Time32(arr) => T::Time32(arr.unit), + Self::Time64(arr) => T::Time64(arr.unit), + Self::Timestamp(arr) => T::Timestamp(arr.unit, arr.timezone.clone()), + Self::Duration(arr) => T::Duration(arr.unit), + Self::DayTimeInterval(_) => T::Interval(IntervalUnit::DayTime), + Self::YearMonthInterval(_) => T::Interval(IntervalUnit::YearMonth), + Self::MonthDayNanoInterval(_) => T::Interval(IntervalUnit::MonthDayNano), + Self::Binary(_) => T::Binary, + Self::LargeBinary(_) => T::LargeBinary, + Self::FixedSizeBinary(arr) => T::FixedSizeBinary(arr.n), + Self::Utf8(_) => T::Utf8, + Self::LargeUtf8(_) => T::LargeUtf8, + Self::Dictionary(arr) => T::Dictionary( + Box::new(arr.keys.data_type()), + Box::new(arr.values.data_type()), + ), + Self::List(arr) => T::List(Box::new(field_from_meta( + arr.elements.data_type(), + arr.meta.clone(), + ))), + Self::LargeList(arr) => T::LargeList(Box::new(field_from_meta( + arr.elements.data_type(), + arr.meta.clone(), + ))), + Self::FixedSizeList(arr) => T::FixedSizeList( + Box::new(field_from_meta(arr.elements.data_type(), arr.meta.clone())), + arr.n, + ), + Self::Struct(arr) => T::Struct( + arr.fields + .iter() + .map(|(meta, field)| field_from_meta(field.data_type(), meta.clone())) + .collect(), + ), + Self::Union(arr) => T::Union( + arr.fields + .iter() + .map(|(type_id, meta, field)| { + (*type_id, field_from_meta(field.data_type(), meta.clone())) + }) + .collect(), + match arr.offsets { + Some(_) => UnionMode::Dense, + None => UnionMode::Sparse, + }, + ), + Self::Map(arr) => T::Map( + Box::new(Field { + name: arr.meta.entries_name.clone(), + data_type: DataType::Struct(vec![ + field_from_meta(arr.keys.data_type(), arr.meta.keys.clone()), + field_from_meta(arr.values.data_type(), arr.meta.values.clone()), + ]), + ..Field::default() + }), + arr.meta.sorted, + ), + Self::RunEndEncoded(arr) => T::RunEndEncoded( + Box::new(Field { + name: arr.meta.run_ends_name.clone(), + data_type: arr.run_ends.data_type(), + nullable: false, + metadata: Default::default(), + }), + Box::new(field_from_meta( + arr.values.data_type(), + arr.meta.values.clone(), + )), + ), + } + } } /// A bitmap with an optional offset @@ -281,28 +378,30 @@ pub struct DecimalView<'a, T> { /// See [`DictionaryArray`][crate::array::DictionaryArray] #[derive(Clone, Debug, PartialEq)] pub struct DictionaryView<'a> { - /// See [`DictionaryArray::indices`][crate::array::DictionaryArray::indices] - pub indices: Box>, + /// See [`DictionaryArray::keys`][crate::array::DictionaryArray::keys] + pub keys: Box>, /// See [`DictionaryArray::values`][crate::array::DictionaryArray::values] pub values: Box>, } -/// See [`DenseUnionArray`][crate::array::DenseUnionArray] +/// See [`UnionArray`][crate::array::UnionArray] #[derive(Clone, Debug, PartialEq)] -pub struct DenseUnionView<'a> { - /// See [`DenseUnionArray::types`][crate::array::DenseUnionArray::types] +pub struct UnionView<'a> { + /// See [`UnionArray::types`][crate::array::UnionArray::types] pub types: &'a [i8], - /// See [`DenseUnionArray::offsets`][crate::array::DenseUnionArray::offsets] - pub offsets: &'a [i32], - /// See [`DenseUnionArray::fields`][crate::array::DenseUnionArray::fields] + /// See [`UnionArray::offsets`][crate::array::UnionArray::offsets] + pub offsets: Option<&'a [i32]>, + /// See [`UnionArray::fields`][crate::array::UnionArray::fields] pub fields: Vec<(i8, FieldMeta, View<'a>)>, } -/// See [`SparseUnionArray`][crate::array::SparseUnionArray] -#[derive(Debug, Clone, PartialEq)] -pub struct SparseUnionView<'a> { - /// See [`SparseUnionArray::types`][crate::array::SparseUnionArray::types] - pub types: &'a [i8], - /// See [`SparseUnionArray::fields`][crate::array::SparseUnionArray::fields] - pub fields: Vec<(i8, FieldMeta, View<'a>)>, +/// See [`RunEndEncodedArray`][crate::array::RunEndEncodedArray] +#[derive(Clone, Debug, PartialEq)] +pub struct RunEndEncodedView<'a> { + /// See [`RunEndEncodedArray::meta`][crate::array::RunEndEncodedArray::meta] + pub meta: RunEndEncodedMeta, + /// See [`RunEndEncodedArray::run_ends`][crate::array::RunEndEncodedArray::run_ends] + pub run_ends: Box>, + /// See [`RunEndEncodedArray::values`][crate::array::RunEndEncodedArray::values] + pub values: Box>, } diff --git a/test_with_arrow/src/tests/arrays.rs b/test_with_arrow/src/tests/arrays.rs index 570ef2e..e75d700 100644 --- a/test_with_arrow/src/tests/arrays.rs +++ b/test_with_arrow/src/tests/arrays.rs @@ -1504,7 +1504,7 @@ mod dictionary { "a", "a", "b", "c", "c", ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::Int8(PrimitiveArray { + keys: Box::new(Array::Int8(PrimitiveArray { validity: None, values: vec![0, 0, 1, 2, 2], })), @@ -1528,7 +1528,7 @@ mod dictionary { Some("c"), ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::Int8(PrimitiveArray { + keys: Box::new(Array::Int8(PrimitiveArray { validity: Some(vec![0b_11001]), values: vec![0, 0, 0, 1, 1], })), @@ -1548,7 +1548,7 @@ mod dictionary { "a", "a", "b", "c", "c", ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::Int16(PrimitiveArray { + keys: Box::new(Array::Int16(PrimitiveArray { validity: None, values: vec![0, 0, 1, 2, 2], })), @@ -1572,7 +1572,7 @@ mod dictionary { Some("c"), ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::Int16(PrimitiveArray { + keys: Box::new(Array::Int16(PrimitiveArray { validity: Some(vec![0b_11001]), values: vec![0, 0, 0, 1, 1], })), @@ -1592,7 +1592,7 @@ mod dictionary { "a", "a", "b", "c", "c", ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::Int32(PrimitiveArray { + keys: Box::new(Array::Int32(PrimitiveArray { validity: None, values: vec![0, 0, 1, 2, 2], })), @@ -1616,7 +1616,7 @@ mod dictionary { Some("c"), ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::Int32(PrimitiveArray { + keys: Box::new(Array::Int32(PrimitiveArray { validity: Some(vec![0b_11001]), values: vec![0, 0, 0, 1, 1], })), @@ -1636,7 +1636,7 @@ mod dictionary { "a", "a", "b", "c", "c", ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::Int64(PrimitiveArray { + keys: Box::new(Array::Int64(PrimitiveArray { validity: None, values: vec![0, 0, 1, 2, 2], })), @@ -1660,7 +1660,7 @@ mod dictionary { Some("c"), ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::Int64(PrimitiveArray { + keys: Box::new(Array::Int64(PrimitiveArray { validity: Some(vec![0b_11001]), values: vec![0, 0, 0, 1, 1], })), @@ -1680,7 +1680,7 @@ mod dictionary { "a", "a", "b", "c", "c", ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::UInt8(PrimitiveArray { + keys: Box::new(Array::UInt8(PrimitiveArray { validity: None, values: vec![0, 0, 1, 2, 2], })), @@ -1704,7 +1704,7 @@ mod dictionary { Some("c"), ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::UInt8(PrimitiveArray { + keys: Box::new(Array::UInt8(PrimitiveArray { validity: Some(vec![0b_11001]), values: vec![0, 0, 0, 1, 1], })), @@ -1724,7 +1724,7 @@ mod dictionary { "a", "a", "b", "c", "c", ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::UInt16(PrimitiveArray { + keys: Box::new(Array::UInt16(PrimitiveArray { validity: None, values: vec![0, 0, 1, 2, 2], })), @@ -1748,7 +1748,7 @@ mod dictionary { Some("c"), ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::UInt16(PrimitiveArray { + keys: Box::new(Array::UInt16(PrimitiveArray { validity: Some(vec![0b_11001]), values: vec![0, 0, 0, 1, 1], })), @@ -1768,7 +1768,7 @@ mod dictionary { "a", "a", "b", "c", "c", ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::UInt32(PrimitiveArray { + keys: Box::new(Array::UInt32(PrimitiveArray { validity: None, values: vec![0, 0, 1, 2, 2], })), @@ -1792,7 +1792,7 @@ mod dictionary { Some("c"), ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::UInt32(PrimitiveArray { + keys: Box::new(Array::UInt32(PrimitiveArray { validity: Some(vec![0b_11001]), values: vec![0, 0, 0, 1, 1], })), @@ -1812,7 +1812,7 @@ mod dictionary { "a", "a", "b", "c", "c", ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::UInt64(PrimitiveArray { + keys: Box::new(Array::UInt64(PrimitiveArray { validity: None, values: vec![0, 0, 1, 2, 2], })), @@ -1836,7 +1836,7 @@ mod dictionary { Some("c"), ])?, Array::Dictionary(DictionaryArray { - indices: Box::new(Array::UInt64(PrimitiveArray { + keys: Box::new(Array::UInt64(PrimitiveArray { validity: Some(vec![0b_11001]), values: vec![0, 0, 0, 1, 1], })), @@ -1849,3 +1849,63 @@ mod dictionary { ) } } + +mod run_end_encoded { + use marrow::{ + array::{BytesArray, RunEndEncodedArray}, + datatypes::RunEndEncodedMeta, + }; + + use super::*; + + #[test] + fn not_nullable() -> PanicOnError<()> { + assert_arrays_eq( + Arc::new( + arrow_array::RunArray::::from_iter([ + "hello", "hello", "world", "foo", + ]), + ) as arrow_array::ArrayRef, + Array::RunEndEncoded(RunEndEncodedArray { + meta: RunEndEncodedMeta::default(), + run_ends: Box::new(Array::Int32(PrimitiveArray { + validity: None, + values: vec![2, 3, 4], + })), + values: Box::new(Array::Utf8(BytesArray { + validity: None, + offsets: vec![0, 5, 10, 13], + data: b"helloworldfoo".to_vec(), + })), + }), + ) + } + + #[test] + fn nullable() -> PanicOnError<()> { + assert_arrays_eq( + Arc::new( + arrow_array::RunArray::::from_iter([ + Some("hello"), + Some("hello"), + None, + None, + Some("world"), + Some("foo"), + ]), + ) as arrow_array::ArrayRef, + Array::RunEndEncoded(RunEndEncodedArray { + meta: RunEndEncodedMeta::default(), + run_ends: Box::new(Array::Int32(PrimitiveArray { + validity: None, + values: vec![2, 4, 5, 6], + })), + values: Box::new(Array::Utf8(BytesArray { + validity: Some(vec![0b1101]), + offsets: vec![0, 5, 5, 10, 13], + data: b"helloworldfoo".to_vec(), + })), + }), + ) + } +} diff --git a/test_with_arrow/src/tests/union_arrays.rs b/test_with_arrow/src/tests/union_arrays.rs index 0cc1d5e..5a08366 100644 --- a/test_with_arrow/src/tests/union_arrays.rs +++ b/test_with_arrow/src/tests/union_arrays.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use marrow::{ - array::{Array, DenseUnionArray, PrimitiveArray, SparseUnionArray}, + array::{Array, PrimitiveArray, UnionArray}, datatypes::FieldMeta, }; @@ -10,7 +10,7 @@ use super::utils::{assert_arrays_eq, PanicOnError}; mod dense_union_array { use super::*; - use arrow_array::{ArrayRef, Float64Array, Int32Array, UnionArray}; + use arrow_array::{ArrayRef, Float64Array, Int32Array}; // Adapted from the arrow docs // @@ -63,7 +63,7 @@ mod dense_union_array { let children = vec![Arc::new(int_array) as ArrayRef, Arc::new(float_array)]; - let array = UnionArray::try_new( + let array = arrow_array::UnionArray::try_new( union_fields.into_iter().collect(), type_ids.into(), Some(offsets.into()), @@ -77,9 +77,9 @@ mod dense_union_array { fn example() -> PanicOnError<()> { assert_arrays_eq( example_array()?, - Array::DenseUnion(DenseUnionArray { + Array::Union(UnionArray { types: vec![0, 1, 0], - offsets: vec![0, 0, 1], + offsets: Some(vec![0, 0, 1]), fields: vec![ ( 0, @@ -112,7 +112,7 @@ mod dense_union_array { mod sparse_union_array { use super::*; - use arrow_array::{ArrayRef, Float64Array, Int32Array, UnionArray}; + use arrow_array::{ArrayRef, Float64Array, Int32Array}; // Adapted from the arrow docs // @@ -151,7 +151,7 @@ mod sparse_union_array { let children = vec![Arc::new(int_array) as ArrayRef, Arc::new(float_array)]; - let array = UnionArray::try_new( + let array = arrow_array::UnionArray::try_new( union_fields.into_iter().collect(), type_ids.into_iter().collect(), None, @@ -164,8 +164,9 @@ mod sparse_union_array { fn example() -> PanicOnError<()> { assert_arrays_eq( example_array()?, - Array::SparseUnion(SparseUnionArray { + Array::Union(UnionArray { types: vec![0, 1, 0], + offsets: None, fields: vec![ ( 0, diff --git a/test_with_arrow/src/tests/utils.rs b/test_with_arrow/src/tests/utils.rs index 4cc32ae..946e442 100644 --- a/test_with_arrow/src/tests/utils.rs +++ b/test_with_arrow/src/tests/utils.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use marrow::{array::Array, view::View}; +use marrow::{array::Array, datatypes::DataType, view::View}; /// Helper to view an as the given variant macro_rules! view_as { @@ -44,21 +44,39 @@ pub fn assert_arrays_eq( marrow_array: Array, ) -> PanicOnError<()> { let array_via_marrow = arrow_array::ArrayRef::try_from(marrow_array.clone())?; + + assert_eq!( + DataType::try_from(array_via_arrow.data_type())?, + marrow_array.data_type(), + "marrow data type: arrow (left) != marrow (right)" + ); + assert_eq!( + *array_via_arrow.data_type(), + arrow_schema::DataType::try_from(&marrow_array.data_type())?, + "arrow data type: arrow (left) != marrow (right)" + ); assert_eq!( array_via_arrow.data_type(), array_via_marrow.data_type(), - "arrow (left) != marrow (right)" + "arrow data type: arrow (left) != marrow (right)" ); + assert_eq!( &array_via_arrow, &array_via_marrow, - "arrow (left) != marrow (right)" + "array: arrow (left) != marrow (right)" ); let view_via_arrow = View::try_from(&*array_via_arrow)?; let view_via_marrow = marrow_array.as_view(); + + assert_eq!( + DataType::try_from(array_via_arrow.data_type())?, + view_via_marrow.data_type(), + "view data_type: arrow (left) != marrow (right)" + ); assert_eq!( view_via_arrow, view_via_marrow, - "arrow (left) != marrow (right)" + "view: arrow (left) != marrow (right)" ); Ok(()) From 2a31c2888bf926a0537655ba1e3d83b23b2e4087 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Mon, 14 Oct 2024 19:29:00 +0200 Subject: [PATCH 8/9] Reorder changelog --- Changes.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Changes.md b/Changes.md index 4aff016..63f338b 100644 --- a/Changes.md +++ b/Changes.md @@ -2,20 +2,26 @@ ## 0.2.0 +Breaking changes: + - Rework map arrays to use explicit keys and values array to simplify interaction the underlying arrays -- Implement sparse unions, rename `DenseUnion` to `Union` and change offsets to be `Option>` -- Implement interval arrays and the `Interval` data type -- Implement run encoded array +- Rename `DenseUnion` to `Union` and change offsets to be `Option>`, implement sparse + unions - Rename `Dictionary::indices` to `Dictionary::keys` +- Remove the sorted flag from the dictionary `DataType` it is not supported by `arrow` - Rework `StructArray` and `UnionArray`: place metadata in front of arrays in `StructArray::fields`, `UnionArray::fields` + +New features + +- Add `Interval` arrays and the `Interval` data type +- Add `RunEndEncoded` arrays +- Add `Array::data_type()` and `View::data_type()` - Add `MarrowError::new` and `MarrowError::with_cause` - Add `as_view` for `Array` and the array structs - Implement `PartialEq` for `Array` and `View`, and `FieldMeta` - Implement `Default` for `Field` and `FieldMeta` -- Remove the sorted flag from the dictionary `DataType` it is not supported by `arrow` -- Add `Array::data_type()` and `View::data_type()` ## 0.1.0 From 103c425915ae4a14e862803b1c20f1a7e4c8c027 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Mon, 14 Oct 2024 19:32:39 +0200 Subject: [PATCH 9/9] Update docs --- marrow/src/view.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marrow/src/view.rs b/marrow/src/view.rs index 6597913..6a6eecd 100644 --- a/marrow/src/view.rs +++ b/marrow/src/view.rs @@ -1,6 +1,6 @@ //! Arrays with borrowed data //! -//! The views correspond 1:1 to the corresponding arrays. +//! Each view corresponds 1:1 to an array. use half::f16; use crate::{