Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement missing types #11

Merged
merged 9 commits into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
{
"uses": "actions/checkout@v4"
},
{
"name": "system",
"run": "uname -a"
},
{
"name": "rustc",
"run": "rustc --version"
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
{
"uses": "actions/checkout@v4"
},
{
"name": "system",
"run": "uname -a"
},
{
"name": "rustc",
"run": "rustc --version"
Expand Down
16 changes: 14 additions & 2 deletions Changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,22 @@

## 0.2.0

Breaking changes:

- Rework map arrays to use explicit keys and values arrays to simplify interaction with the
underlying arrays
- Rework `StructArray` and `DenseUnionArray`: place metadata in front of arrays in
`StructArray::fields`, `DenseUnionArray::fields`
- Rename `DenseUnion` to `Union` and change offsets to be `Option<Vec<i32>>`, implement sparse
unions
- Rename `Dictionary::indices` to `Dictionary::keys`
- Remove the sorted flag from the dictionary `DataType`, as it is not supported by `arrow`
- Rework `StructArray` and `UnionArray`: place metadata in front of arrays in `StructArray::fields`,
`UnionArray::fields`

New features:

- Add `Interval` arrays and the `Interval` data type
- Add `RunEndEncoded` arrays
- Add `Array::data_type()` and `View::data_type()`
- Add `MarrowError::new` and `MarrowError::with_cause`
- Add `as_view` for `Array` and the array structs
- Implement `PartialEq` for `Array` and `View`, and `FieldMeta`
Expand Down
8 changes: 4 additions & 4 deletions marrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,13 @@ arrow-38 = ["dep:arrow-array-38", "dep:arrow-schema-38", "dep:arrow-data-38", "d
arrow-37 = ["dep:arrow-array-37", "dep:arrow-schema-37", "dep:arrow-data-37", "dep:arrow-buffer-37"]

# support for different arrow2 versions
arrow2-0-17 = ["dep:arrow2-0-17", "dep:bytemuck", "half/bytemuck"]
arrow2-0-16 = ["dep:arrow2-0-16", "dep:bytemuck", "half/bytemuck"]
arrow2-0-17 = ["dep:arrow2-0-17", "half/bytemuck"]
arrow2-0-16 = ["dep:arrow2-0-16", "half/bytemuck"]

[dependencies]
bytemuck = { version = "1", default-features = false, features = ["derive"] }
half = { version = "2", default-features = false }

serde = { version = "1.0", default-features = false, features = ["std", "derive"], optional = true }

# arrow-version:insert: arrow-array-{version} = {{ package = "arrow-array", version = "{version}", optional = true, default-features = false }}
Expand Down Expand Up @@ -125,5 +127,3 @@ arrow-schema-37 = { package = "arrow-schema", version = "37", optional = true, d

arrow2-0-17 = { package = "arrow2", version = "0.17", optional = true, default-features = false }
arrow2-0-16 = { package = "arrow2", version = "0.16", optional = true, default-features = false }

bytemuck = { version = "1", optional = true, default-features = false }
165 changes: 151 additions & 14 deletions marrow/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@
use half::f16;

use crate::{
datatypes::{FieldMeta, MapMeta, TimeUnit},
datatypes::{
field_from_meta, DataType, Field, FieldMeta, IntervalUnit, MapMeta, RunEndEncodedMeta,
TimeUnit, UnionMode,
},
error::{fail, ErrorKind, Result},
types::{DayTimeInterval, MonthDayNanoInterval},
view::{
BitsWithOffset, BooleanView, BytesView, DecimalView, DenseUnionView, DictionaryView,
FixedSizeBinaryView, FixedSizeListView, ListView, MapView, NullView, PrimitiveView,
StructView, TimeView, TimestampView, View,
BitsWithOffset, BooleanView, BytesView, DecimalView, DictionaryView, FixedSizeBinaryView,
FixedSizeListView, ListView, MapView, NullView, PrimitiveView, RunEndEncodedView,
StructView, TimeView, TimestampView, UnionView, View,
},
};

Expand Down Expand Up @@ -59,6 +63,18 @@ pub enum Array {
Timestamp(TimestampArray),
/// An `i64` array of durations
Duration(TimeArray<i64>),
/// Interval with `YearMonth` unit
///
/// Interval arrays are not supported for `arrow2`.
YearMonthInterval(PrimitiveArray<i32>),
/// Interval with `DayTime` unit
///
/// Interval arrays are not supported for `arrow2`.
DayTimeInterval(PrimitiveArray<DayTimeInterval>),
/// Interval with `MonthDayNano` unit
///
/// Interval arrays are not supported for `arrow2`.
MonthDayNanoInterval(PrimitiveArray<MonthDayNanoInterval>),
/// A `[u8]` array with `i32` offsets of strings
Utf8(BytesArray<i32>),
/// A `[u8]` array with `i64` offsets of strings
Expand All @@ -81,13 +97,107 @@ pub enum Array {
FixedSizeList(FixedSizeListArray),
/// An array of dictionaries
Dictionary(DictionaryArray),
/// An array of run end encoded values
RunEndEncoded(RunEndEncodedArray),
/// An array of maps
Map(MapArray),
/// An array of unions
DenseUnion(DenseUnionArray),
Union(UnionArray),
}

impl Array {
/// Get the data type of this array
///
/// Parameterized data types take their parameters from the array itself, e.g., the unit of a
/// time array or the precision and scale of a decimal array. For nested arrays the data types
/// of the children are computed recursively and combined with the stored metadata via
/// `field_from_meta`.
pub fn data_type(&self) -> DataType {
    match self {
        // data types without parameters
        Array::Null(_) => DataType::Null,
        Array::Boolean(_) => DataType::Boolean,
        Array::Int8(_) => DataType::Int8,
        Array::Int16(_) => DataType::Int16,
        Array::Int32(_) => DataType::Int32,
        Array::Int64(_) => DataType::Int64,
        Array::UInt8(_) => DataType::UInt8,
        Array::UInt16(_) => DataType::UInt16,
        Array::UInt32(_) => DataType::UInt32,
        Array::UInt64(_) => DataType::UInt64,
        Array::Float16(_) => DataType::Float16,
        Array::Float32(_) => DataType::Float32,
        Array::Float64(_) => DataType::Float64,
        Array::Date32(_) => DataType::Date32,
        Array::Date64(_) => DataType::Date64,
        Array::Utf8(_) => DataType::Utf8,
        Array::LargeUtf8(_) => DataType::LargeUtf8,
        Array::Binary(_) => DataType::Binary,
        Array::LargeBinary(_) => DataType::LargeBinary,

        // the interval unit is encoded in the variant itself
        Array::YearMonthInterval(_) => DataType::Interval(IntervalUnit::YearMonth),
        Array::DayTimeInterval(_) => DataType::Interval(IntervalUnit::DayTime),
        Array::MonthDayNanoInterval(_) => DataType::Interval(IntervalUnit::MonthDayNano),

        // data types with parameters stored on the array
        Array::Decimal128(decimal) => DataType::Decimal128(decimal.precision, decimal.scale),
        Array::Time32(time) => DataType::Time32(time.unit),
        Array::Time64(time) => DataType::Time64(time.unit),
        Array::Duration(duration) => DataType::Duration(duration.unit),
        Array::Timestamp(timestamp) => {
            DataType::Timestamp(timestamp.unit, timestamp.timezone.clone())
        }
        Array::FixedSizeBinary(binary) => DataType::FixedSizeBinary(binary.n),

        // nested data types
        Array::Dictionary(dictionary) => {
            let keys = dictionary.keys.data_type();
            let values = dictionary.values.data_type();
            DataType::Dictionary(Box::new(keys), Box::new(values))
        }
        Array::List(list) => {
            let element = field_from_meta(list.elements.data_type(), list.meta.clone());
            DataType::List(Box::new(element))
        }
        Array::LargeList(list) => {
            let element = field_from_meta(list.elements.data_type(), list.meta.clone());
            DataType::LargeList(Box::new(element))
        }
        Array::FixedSizeList(list) => {
            let element = field_from_meta(list.elements.data_type(), list.meta.clone());
            DataType::FixedSizeList(Box::new(element), list.n)
        }
        Array::Struct(structure) => {
            let fields = structure
                .fields
                .iter()
                .map(|(meta, child)| field_from_meta(child.data_type(), meta.clone()))
                .collect();
            DataType::Struct(fields)
        }
        Array::Union(union) => {
            let fields = union
                .fields
                .iter()
                .map(|(type_id, meta, child)| {
                    (*type_id, field_from_meta(child.data_type(), meta.clone()))
                })
                .collect();
            // the presence of offsets distinguishes dense from sparse unions
            let mode = if union.offsets.is_some() {
                UnionMode::Dense
            } else {
                UnionMode::Sparse
            };
            DataType::Union(fields, mode)
        }
        Array::Map(map) => {
            // the entries field is a struct of the keys field and the values field
            let keys = field_from_meta(map.keys.data_type(), map.meta.keys.clone());
            let values = field_from_meta(map.values.data_type(), map.meta.values.clone());
            let entries = Field {
                name: map.meta.entries_name.clone(),
                data_type: DataType::Struct(vec![keys, values]),
                ..Field::default()
            };
            DataType::Map(Box::new(entries), map.meta.sorted)
        }
        Array::RunEndEncoded(encoded) => {
            // the run ends field is always non-nullable and carries no metadata
            let run_ends = Field {
                name: encoded.meta.run_ends_name.clone(),
                data_type: encoded.run_ends.data_type(),
                nullable: false,
                metadata: Default::default(),
            };
            let values =
                field_from_meta(encoded.values.data_type(), encoded.meta.values.clone());
            DataType::RunEndEncoded(Box::new(run_ends), Box::new(values))
        }
    }
}

/// Get the view for this array
pub fn as_view(&self) -> View<'_> {
match self {
Expand All @@ -111,6 +221,9 @@ impl Array {
Self::Time64(array) => View::Time64(array.as_view()),
Self::Timestamp(array) => View::Timestamp(array.as_view()),
Self::Duration(array) => View::Duration(array.as_view()),
Self::YearMonthInterval(array) => View::YearMonthInterval(array.as_view()),
Self::DayTimeInterval(array) => View::DayTimeInterval(array.as_view()),
Self::MonthDayNanoInterval(array) => View::MonthDayNanoInterval(array.as_view()),
Self::Binary(array) => View::Binary(array.as_view()),
Self::LargeBinary(array) => View::LargeBinary(array.as_view()),
Self::FixedSizeBinary(array) => View::FixedSizeBinary(array.as_view()),
Expand All @@ -122,7 +235,8 @@ impl Array {
Self::Struct(array) => View::Struct(array.as_view()),
Self::Map(array) => View::Map(array.as_view()),
Self::Dictionary(array) => View::Dictionary(array.as_view()),
Self::DenseUnion(array) => View::DenseUnion(array.as_view()),
Self::RunEndEncoded(array) => View::RunEndEncoded(array.as_view()),
Self::Union(array) => View::Union(array.as_view()),
}
}
}
Expand Down Expand Up @@ -483,7 +597,7 @@ impl<T> DecimalArray<T> {
#[derive(Clone, Debug, PartialEq)]
pub struct DictionaryArray {
/// The indices into the values array for each element
pub indices: Box<Array>,
pub keys: Box<Array>,
/// The possible values of elements
pub values: Box<Array>,
}
Expand All @@ -492,7 +606,7 @@ impl DictionaryArray {
/// Get the view for this array
pub fn as_view(&self) -> DictionaryView<'_> {
DictionaryView {
indices: Box::new(self.indices.as_view()),
keys: Box::new(self.keys.as_view()),
values: Box::new(self.values.as_view()),
}
}
Expand All @@ -505,20 +619,21 @@ impl DictionaryArray {
/// well. For element `i`, the value can be looked up by the pseudo code
/// `fields[types[i]].1[offsets[i]]`.
#[derive(Clone, Debug, PartialEq)]
pub struct DenseUnionArray {
pub struct UnionArray {
/// The type of each element
pub types: Vec<i8>,
/// The offset into the underlying arrays
pub offsets: Vec<i32>,
pub offsets: Option<Vec<i32>>,
/// The arrays with their metadata
pub fields: Vec<(i8, FieldMeta, Array)>,
}

impl DenseUnionArray {
fn as_view(&self) -> DenseUnionView<'_> {
DenseUnionView {
impl UnionArray {
/// Get the view for this array
pub fn as_view(&self) -> UnionView<'_> {
UnionView {
types: &self.types,
offsets: &self.offsets,
offsets: self.offsets.as_deref(),
fields: self
.fields
.iter()
Expand All @@ -527,3 +642,25 @@ impl DenseUnionArray {
}
}
}

/// An array with runs of deduplicated values
///
/// The array is stored as two child arrays, `run_ends` and `values`, plus the metadata required
/// to reconstruct the fields of the corresponding data type.
///
/// NOTE(review): presumably this follows the Arrow run-end encoded layout, in which run `i`
/// repeats `values[i]` up to the logical index `run_ends[i]` (exclusive) - confirm against the
/// conversion code.
#[derive(Clone, Debug, PartialEq)]
pub struct RunEndEncodedArray {
/// The metadata for the arrays, including the name of the run ends field and the metadata of
/// the values field
pub meta: RunEndEncodedMeta,
/// The run ends for each value
pub run_ends: Box<Array>,
/// The possible values of elements
pub values: Box<Array>,
}

impl RunEndEncodedArray {
    /// Get the view for this array
    ///
    /// The metadata is cloned into the view, whereas the run ends and the values are converted
    /// into views of the respective child arrays.
    pub fn as_view(&self) -> RunEndEncodedView<'_> {
        let meta = self.meta.clone();
        let run_ends = Box::new(self.run_ends.as_view());
        let values = Box::new(self.values.as_view());

        RunEndEncodedView {
            meta,
            run_ends,
            values,
        }
    }
}
Loading