Skip to content

Commit

Permalink
Merge pull request #11 from chmp/feature/implement-missing-types
Browse files Browse the repository at this point in the history
Implement missing types
  • Loading branch information
chmp authored Oct 14, 2024
2 parents 508f3ae + 103c425 commit 53269ed
Show file tree
Hide file tree
Showing 18 changed files with 1,286 additions and 183 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
{
"uses": "actions/checkout@v4"
},
{
"name": "system",
"run": "uname -a"
},
{
"name": "rustc",
"run": "rustc --version"
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
{
"uses": "actions/checkout@v4"
},
{
"name": "system",
"run": "uname -a"
},
{
"name": "rustc",
"run": "rustc --version"
Expand Down
16 changes: 14 additions & 2 deletions Changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,22 @@

## 0.2.0

Breaking changes:

- Rework map arrays to use explicit keys and values arrays to simplify interaction with the
underlying arrays
- Rework `StructArray` and `DenseUnionArray`: place metadata in front of arrays in
`StructArray::fields`, `DenseUnionArray::fields`
- Rename `DenseUnion` to `Union` and change offsets to be `Option<Vec<i32>>`, implement sparse
unions
- Rename `Dictionary::indices` to `Dictionary::keys`
- Remove the sorted flag from the dictionary `DataType`, as it is not supported by `arrow`
- Rework `StructArray` and `UnionArray`: place metadata in front of arrays in `StructArray::fields`,
`UnionArray::fields`

New features:

- Add `Interval` arrays and the `Interval` data type
- Add `RunEndEncoded` arrays
- Add `Array::data_type()` and `View::data_type()`
- Add `MarrowError::new` and `MarrowError::with_cause`
- Add `as_view` for `Array` and the array structs
- Implement `PartialEq` for `Array` and `View`, and `FieldMeta`
Expand Down
8 changes: 4 additions & 4 deletions marrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,13 @@ arrow-38 = ["dep:arrow-array-38", "dep:arrow-schema-38", "dep:arrow-data-38", "d
arrow-37 = ["dep:arrow-array-37", "dep:arrow-schema-37", "dep:arrow-data-37", "dep:arrow-buffer-37"]

# support for different arrow2 versions
arrow2-0-17 = ["dep:arrow2-0-17", "dep:bytemuck", "half/bytemuck"]
arrow2-0-16 = ["dep:arrow2-0-16", "dep:bytemuck", "half/bytemuck"]
arrow2-0-17 = ["dep:arrow2-0-17", "half/bytemuck"]
arrow2-0-16 = ["dep:arrow2-0-16", "half/bytemuck"]

[dependencies]
bytemuck = { version = "1", default-features = false, features = ["derive"] }
half = { version = "2", default-features = false }

serde = { version = "1.0", default-features = false, features = ["std", "derive"], optional = true }

# arrow-version:insert: arrow-array-{version} = {{ package = "arrow-array", version = "{version}", optional = true, default-features = false }}
Expand Down Expand Up @@ -125,5 +127,3 @@ arrow-schema-37 = { package = "arrow-schema", version = "37", optional = true, d

arrow2-0-17 = { package = "arrow2", version = "0.17", optional = true, default-features = false }
arrow2-0-16 = { package = "arrow2", version = "0.16", optional = true, default-features = false }

bytemuck = { version = "1", optional = true, default-features = false }
165 changes: 151 additions & 14 deletions marrow/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@
use half::f16;

use crate::{
datatypes::{FieldMeta, MapMeta, TimeUnit},
datatypes::{
field_from_meta, DataType, Field, FieldMeta, IntervalUnit, MapMeta, RunEndEncodedMeta,
TimeUnit, UnionMode,
},
error::{fail, ErrorKind, Result},
types::{DayTimeInterval, MonthDayNanoInterval},
view::{
BitsWithOffset, BooleanView, BytesView, DecimalView, DenseUnionView, DictionaryView,
FixedSizeBinaryView, FixedSizeListView, ListView, MapView, NullView, PrimitiveView,
StructView, TimeView, TimestampView, View,
BitsWithOffset, BooleanView, BytesView, DecimalView, DictionaryView, FixedSizeBinaryView,
FixedSizeListView, ListView, MapView, NullView, PrimitiveView, RunEndEncodedView,
StructView, TimeView, TimestampView, UnionView, View,
},
};

Expand Down Expand Up @@ -59,6 +63,18 @@ pub enum Array {
Timestamp(TimestampArray),
/// An `i64` array of durations
Duration(TimeArray<i64>),
/// Interval with `YearMonth` unit
///
/// Interval arrays are not supported for `arrow2`.
YearMonthInterval(PrimitiveArray<i32>),
/// Interval with `DayTime` unit
///
/// Interval arrays are not supported for `arrow2`.
DayTimeInterval(PrimitiveArray<DayTimeInterval>),
/// Interval with `MonthDayNano` unit
///
/// Interval arrays are not supported for `arrow2`.
MonthDayNanoInterval(PrimitiveArray<MonthDayNanoInterval>),
/// A `[u8]` array with `i32` offsets of strings
Utf8(BytesArray<i32>),
/// A `[u8]` array with `i64` offsets of strings
Expand All @@ -81,13 +97,107 @@ pub enum Array {
FixedSizeList(FixedSizeListArray),
/// An array of dictionaries
Dictionary(DictionaryArray),
/// An array of run end encoded values
RunEndEncoded(RunEndEncodedArray),
/// An array of maps
Map(MapArray),
/// An array of unions
DenseUnion(DenseUnionArray),
Union(UnionArray),
}

impl Array {
/// Determine the [`DataType`] described by this array
///
/// Variants without parameters map directly to the data type of the same
/// name. Parametrized variants (decimals, times, timestamps, durations and
/// fixed size binaries) copy the parameters stored on the array. Nested
/// variants recurse into their child arrays and combine the resulting data
/// types with the stored field metadata.
pub fn data_type(&self) -> DataType {
    match self {
        // variants without parameters
        Self::Null(_) => DataType::Null,
        Self::Boolean(_) => DataType::Boolean,
        Self::Int8(_) => DataType::Int8,
        Self::Int16(_) => DataType::Int16,
        Self::Int32(_) => DataType::Int32,
        Self::Int64(_) => DataType::Int64,
        Self::UInt8(_) => DataType::UInt8,
        Self::UInt16(_) => DataType::UInt16,
        Self::UInt32(_) => DataType::UInt32,
        Self::UInt64(_) => DataType::UInt64,
        Self::Float16(_) => DataType::Float16,
        Self::Float32(_) => DataType::Float32,
        Self::Float64(_) => DataType::Float64,
        Self::Date32(_) => DataType::Date32,
        Self::Date64(_) => DataType::Date64,
        Self::Utf8(_) => DataType::Utf8,
        Self::LargeUtf8(_) => DataType::LargeUtf8,
        Self::Binary(_) => DataType::Binary,
        Self::LargeBinary(_) => DataType::LargeBinary,

        // the interval unit is encoded in the variant itself
        Self::YearMonthInterval(_) => DataType::Interval(IntervalUnit::YearMonth),
        Self::DayTimeInterval(_) => DataType::Interval(IntervalUnit::DayTime),
        Self::MonthDayNanoInterval(_) => DataType::Interval(IntervalUnit::MonthDayNano),

        // variants that carry their parameters on the array
        Self::Decimal128(array) => DataType::Decimal128(array.precision, array.scale),
        Self::Time32(array) => DataType::Time32(array.unit),
        Self::Time64(array) => DataType::Time64(array.unit),
        Self::Duration(array) => DataType::Duration(array.unit),
        Self::Timestamp(array) => DataType::Timestamp(array.unit, array.timezone.clone()),
        Self::FixedSizeBinary(array) => DataType::FixedSizeBinary(array.n),

        // nested variants: recurse into the children
        Self::Dictionary(array) => {
            let keys = array.keys.data_type();
            let values = array.values.data_type();
            DataType::Dictionary(Box::new(keys), Box::new(values))
        }
        Self::List(array) => {
            let element = field_from_meta(array.elements.data_type(), array.meta.clone());
            DataType::List(Box::new(element))
        }
        Self::LargeList(array) => {
            let element = field_from_meta(array.elements.data_type(), array.meta.clone());
            DataType::LargeList(Box::new(element))
        }
        Self::FixedSizeList(array) => {
            let element = field_from_meta(array.elements.data_type(), array.meta.clone());
            DataType::FixedSizeList(Box::new(element), array.n)
        }
        Self::Struct(array) => {
            let fields = array
                .fields
                .iter()
                .map(|(meta, child)| field_from_meta(child.data_type(), meta.clone()))
                .collect();
            DataType::Struct(fields)
        }
        Self::Union(array) => {
            let fields = array
                .fields
                .iter()
                .map(|(type_id, meta, child)| {
                    (*type_id, field_from_meta(child.data_type(), meta.clone()))
                })
                .collect();
            // the presence of offsets distinguishes dense from sparse unions
            let mode = if array.offsets.is_some() {
                UnionMode::Dense
            } else {
                UnionMode::Sparse
            };
            DataType::Union(fields, mode)
        }
        Self::Map(array) => {
            // the entries field wraps a struct of the key and value fields
            let entries = Field {
                name: array.meta.entries_name.clone(),
                data_type: DataType::Struct(vec![
                    field_from_meta(array.keys.data_type(), array.meta.keys.clone()),
                    field_from_meta(array.values.data_type(), array.meta.values.clone()),
                ]),
                ..Field::default()
            };
            DataType::Map(Box::new(entries), array.meta.sorted)
        }
        Self::RunEndEncoded(array) => {
            // the run ends field is always reported as non-nullable and without metadata
            let run_ends = Field {
                name: array.meta.run_ends_name.clone(),
                data_type: array.run_ends.data_type(),
                nullable: false,
                metadata: Default::default(),
            };
            let values = field_from_meta(array.values.data_type(), array.meta.values.clone());
            DataType::RunEndEncoded(Box::new(run_ends), Box::new(values))
        }
    }
}

/// Get the view for this array
pub fn as_view(&self) -> View<'_> {
match self {
Expand All @@ -111,6 +221,9 @@ impl Array {
Self::Time64(array) => View::Time64(array.as_view()),
Self::Timestamp(array) => View::Timestamp(array.as_view()),
Self::Duration(array) => View::Duration(array.as_view()),
Self::YearMonthInterval(array) => View::YearMonthInterval(array.as_view()),
Self::DayTimeInterval(array) => View::DayTimeInterval(array.as_view()),
Self::MonthDayNanoInterval(array) => View::MonthDayNanoInterval(array.as_view()),
Self::Binary(array) => View::Binary(array.as_view()),
Self::LargeBinary(array) => View::LargeBinary(array.as_view()),
Self::FixedSizeBinary(array) => View::FixedSizeBinary(array.as_view()),
Expand All @@ -122,7 +235,8 @@ impl Array {
Self::Struct(array) => View::Struct(array.as_view()),
Self::Map(array) => View::Map(array.as_view()),
Self::Dictionary(array) => View::Dictionary(array.as_view()),
Self::DenseUnion(array) => View::DenseUnion(array.as_view()),
Self::RunEndEncoded(array) => View::RunEndEncoded(array.as_view()),
Self::Union(array) => View::Union(array.as_view()),
}
}
}
Expand Down Expand Up @@ -483,7 +597,7 @@ impl<T> DecimalArray<T> {
#[derive(Clone, Debug, PartialEq)]
pub struct DictionaryArray {
/// The indices into the values array for each element
pub indices: Box<Array>,
pub keys: Box<Array>,
/// The possible values of elements
pub values: Box<Array>,
}
Expand All @@ -492,7 +606,7 @@ impl DictionaryArray {
/// Get the view for this array
pub fn as_view(&self) -> DictionaryView<'_> {
DictionaryView {
indices: Box::new(self.indices.as_view()),
keys: Box::new(self.keys.as_view()),
values: Box::new(self.values.as_view()),
}
}
Expand All @@ -505,20 +619,21 @@ impl DictionaryArray {
/// well. For element `i`, the value can be looked up by the pseudo code
/// `fields[types[i]].2[offsets[i]]`.
#[derive(Clone, Debug, PartialEq)]
pub struct DenseUnionArray {
pub struct UnionArray {
/// The type of each element
pub types: Vec<i8>,
/// The offset into the underlying arrays
pub offsets: Vec<i32>,
pub offsets: Option<Vec<i32>>,
/// The arrays with their metadata
pub fields: Vec<(i8, FieldMeta, Array)>,
}

impl DenseUnionArray {
fn as_view(&self) -> DenseUnionView<'_> {
DenseUnionView {
impl UnionArray {
/// Get the view for this array
pub fn as_view(&self) -> UnionView<'_> {
UnionView {
types: &self.types,
offsets: &self.offsets,
offsets: self.offsets.as_deref(),
fields: self
.fields
.iter()
Expand All @@ -527,3 +642,25 @@ impl DenseUnionArray {
}
}
}

/// An array with runs of deduplicated values
///
/// The array is stored as two child arrays, `run_ends` and `values`, together
/// with the metadata describing them.
///
/// NOTE(review): presumably this follows the Arrow run-end encoded layout, in
/// which `values[k]` is repeated for all logical positions up to (excluding)
/// `run_ends[k]` - confirm against the Arrow columnar format specification.
#[derive(Clone, Debug, PartialEq)]
pub struct RunEndEncodedArray {
/// The metadata for the arrays
///
/// `Array::data_type` reads the name of the run ends field
/// (`run_ends_name`) and the metadata of the values field (`values`) from it.
pub meta: RunEndEncodedMeta,
/// The run ends for each value
///
/// `Array::data_type` always reports this child as a non-nullable field.
pub run_ends: Box<Array>,
/// The possible values of elements
pub values: Box<Array>,
}

impl RunEndEncodedArray {
    /// Build a [`RunEndEncodedView`] of this array
    ///
    /// The metadata is cloned into the view, while the run ends and the
    /// values are converted into views of the respective child arrays.
    pub fn as_view(&self) -> RunEndEncodedView<'_> {
        let meta = self.meta.clone();
        let run_ends = Box::new(self.run_ends.as_view());
        let values = Box::new(self.values.as_view());

        RunEndEncodedView {
            meta,
            run_ends,
            values,
        }
    }
}
Loading

0 comments on commit 53269ed

Please sign in to comment.