Prepare APIs for extension types (#357)

jorgecarleitao · Aug 31, 2021 · 145b7c8 · 145b7c8
1 parent 640ec4a
commit 145b7c8
Show file tree

Hide file tree

Showing 85 changed files with 1,117 additions and 1,187 deletions.
diff --git a/benches/filter_kernels.rs b/benches/filter_kernels.rs
@@ -40,7 +40,8 @@ fn add_benchmark(c: &mut Criterion) {
         let size = 2usize.pow(log2_size);
 
         let filter_array = create_boolean_array(size, 0.0, 0.9);
-        let filter_array = BooleanArray::from_data(filter_array.values().clone(), None);
+        let filter_array =
+            BooleanArray::from_data(DataType::Boolean, filter_array.values().clone(), None);
 
         let arr_a = create_primitive_array::<f32>(size, DataType::Float32, 0.0);
         c.bench_function(&format!("filter 2^{} f32", log2_size), |b| {

diff --git a/guide/src/high_level.md b/guide/src/high_level.md
@@ -4,59 +4,75 @@ The simplest way to think about an arrow `Array` is that it represents
 `Vec<Option<T>>` and has a logical type (see [metadata](../metadata.md))) associated with it.
 
 Probably the simplest array in this crate is `PrimitiveArray<T>`. It can be constructed
-from an iterator as follows:
+from a slice as follows:
+
+```rust
+# use arrow2::array::{Array, PrimitiveArray};
+# fn main() {
+let array = PrimitiveArray::<i32>::from([Some(1), None, Some(123)]);
+assert_eq!(array.len(), 3)
+# }
+```
+
+from a slice of values,
+
+```rust
+# use arrow2::array::{Array, PrimitiveArray};
+# fn main() {
+let array = PrimitiveArray::<f32>::from_slice([1.0, 0.0, 123.0]);
+assert_eq!(array.len(), 3)
+# }
+```
+
+or from an iterator
 
 ```rust
 # use arrow2::array::{Array, PrimitiveArray};
-# use arrow2::datatypes::DataType;
 # fn main() {
-let array = [Some(1), None, Some(123)]
-    .iter()
-    .collect::<PrimitiveArray<i32>>();
+let array: PrimitiveArray<u64> = [Some(1), None, Some(123)].iter().collect();
 assert_eq!(array.len(), 3)
 # }
 ```
 
 A `PrimitiveArray` has 3 components:
 
-1. A physical type (`i32`)
+1. A physical type (e.g. `i32`)
 2. A logical type (e.g. `DataType::Int32`)
 3. Data
 
 The main differences from a `Vec<Option<T>>` are:
 
-* Its data is laid out in memory as a `Buffer<T>` and an `Option<Bitmap>`.
-* It has an associated logical datatype.
+* Its data is laid out in memory as a `Buffer<T>` and an `Option<Bitmap>` (see [../low_level.md])
+* It has an associated logical type (`DataType`).
 
-The first difference allows interoperability with Arrow's ecosystem and efficient SIMD operations (we will re-visit this below); the second difference is that it gives semantic meaning to the array. In the example
+The first allows interoperability with Arrow's ecosystem and efficient SIMD operations
+(we will re-visit this below); the second is that it gives semantic meaning to the array.
+In the example
 
 ```rust
 # use arrow2::array::PrimitiveArray;
 # use arrow2::datatypes::DataType;
 # fn main() {
 let ints = PrimitiveArray::<i32>::from([Some(1), None]);
-let dates = PrimitiveArray::<i32>::from([Some(1), None]);
+let dates = PrimitiveArray::<i32>::from([Some(1), None]).to(DataType::Date32);
 # }
 ```
 
-`ints` and `dates` have the same in-memory representation but different logic representations (e.g. dates are usually represented as a string).
+`ints` and `dates` have the same in-memory representation but different logic
+representations (e.g. dates are usually printed to users as "yyyy-mm-dd").
 
-Some physical types (e.g. `i32`) have a "natural" logical `DataType` (e.g. `DataType::Int32`).
-These types support a more compact notation:
+All physical types (e.g. `i32`) have a "natural" logical `DataType` (e.g. `DataType::Int32`)
+which is assigned when allocating arrays from iterators, slices, etc.
 
 ```rust
 # use arrow2::array::{Array, Int32Array, PrimitiveArray};
 # use arrow2::datatypes::DataType;
 # fn main() {
-/// Int32Array = PrimitiveArray<i32>
-let array = [Some(1), None, Some(123)].iter().collect::<Int32Array>();
-assert_eq!(array.len(), 3);
-let array = Int32Array::from(&[Some(1), None, Some(123)]);
-assert_eq!(array.len(), 3);
-let array = Int32Array::from_slice(&[1, 123]);
-assert_eq!(array.len(), 2);
+let array = PrimitiveArray::<i32>::from_slice([1, 0, 123]);
+assert_eq!(array.data_type(), &DataType::Int32);
 # }
 ```
+they can be cheaply converted to via `.to(DataType)`.
 
 The following arrays are supported:
 
@@ -68,7 +84,8 @@ The following arrays are supported:
 * `FixedSizeBinaryArray` (like `BinaryArray`, but fixed size)
 * `ListArray<i32>` and `ListArray<i64>` (nested arrays)
 * `FixedSizeListArray` (nested arrays of fixed size)
-* `StructArray` (when each row has different logical types)
+* `StructArray` (every row has multiple logical types)
+* `UnionArray` (every row has a different logical type)
 * `DictionaryArray<K>` (nested array with encoded values)
 
 ## Dynamic Array
@@ -78,104 +95,71 @@ implement the trait `Array` and can be cast to `&dyn Array`, i.e. they can be tu
 a trait object. This enables arrays to have types that are dynamic in nature.
 
 ```rust
-# use std::sync::Arc;
 # use arrow2::array::{Array, PrimitiveArray};
-# use arrow2::datatypes::DataType;
 # fn main() {
-let data = vec![
-    Some(vec![Some(1i32), Some(2), Some(3)]),
-    None,
-    Some(vec![Some(4), None, Some(6)]),
-];
-
 let a = PrimitiveArray::<i32>::from(&[Some(1), None]);
 let a: &dyn Array = &a;
 # }
 ```
 
-Note how we have not specified the inner type explicitly in the signature `ListArray<i32>`.
-Instead, `ListArray` has an inner `Array` representing all its values (available via `.values()`).
-
 ### Downcast and `as_any`
 
-Given a trait object `&dyn Array`, we know its logical type via `Array::data_type()` and can use it to downcast the array to its concrete type:
+Given a trait object `array: &dyn Array`, we know its physical type via
+`PhysicalType: array.data_type().to_physical_type()`, which we use to downcast the array
+to its concrete type:
 
 ```rust
 # use arrow2::array::{Array, PrimitiveArray};
-# use arrow2::datatypes::DataType;
+# use arrow2::datatypes::PhysicalType;
 # fn main() {
-let array = [Some(1), None, Some(123)]
-    .iter()
-    .collect::<PrimitiveArray<i32>>();
+let array = PrimitiveArray::<i32>::from(&[Some(1), None]);
 let array = &array as &dyn Array;
-
-let array = array.as_any().downcast_ref::<PrimitiveArray<i32>>().unwrap();
+// ...
+let physical_type: PhysicalType = array.data_type().to_physical_type();
 # }
 ```
 
-There is a many-to-one relationship between `DataType` and an Array (i.e. a physical representation). The relationship is the following:
-
-| `DataType`            | `PhysicalType`            |
-|-----------------------|---------------------------|
-| `UInt8`               | `PrimitiveArray<u8>`      |
-| `UInt16`              | `PrimitiveArray<u16>`     |
-| `UInt32`              | `PrimitiveArray<u32>`     |
-| `UInt64`              | `PrimitiveArray<u64>`     |
-| `Int8`                | `PrimitiveArray<i8>`      |
-| `Int16`               | `PrimitiveArray<i16>`     |
-| `Int32`               | `PrimitiveArray<i32>`     |
-| `Int64`               | `PrimitiveArray<i64>`     |
-| `Float32`             | `PrimitiveArray<f32>`     |
-| `Float64`             | `PrimitiveArray<f64>`     |
-| `Decimal(_,_)`        | `PrimitiveArray<i128>`    |
-| `Date32`              | `PrimitiveArray<i32>`     |
-| `Date64`              | `PrimitiveArray<i64>`     |
-| `Time32(_)`           | `PrimitiveArray<i32>`     |
-| `Time64(_)`           | `PrimitiveArray<i64>`     |
-| `Timestamp(_,_)`      | `PrimitiveArray<i64>`     |
-| `Interval(YearMonth)` | `PrimitiveArray<i32>`     |
-| `Interval(DayTime)`   | `PrimitiveArray<days_ms>` |
-| `Duration(_)`         | `PrimitiveArray<i64>`     |
-| `Binary`              | `BinaryArray<i32>`        |
-| `LargeBinary`         | `BinaryArray<i64>`        |
-| `Utf8`                | `Utf8Array<i32>`          |
-| `LargeUtf8`           | `Utf8Array<i64>`          |
-| `List`                | `ListArray<i32>`          |
-| `LargeList`           | `ListArray<i64>`          |
-| `FixedSizeBinary(_)`  | `FixedSizeBinaryArray`    |
-| `FixedSizeList(_,_)`  | `FixedSizeListArray`      |
-| `Struct(_)`           | `StructArray`             |
-| `Union(_,_,_)`        | `UnionArray`              |
-| `Dictionary(UInt8,_)` | `DictionaryArray<u8>`     |
-| `Dictionary(UInt16,_)`| `DictionaryArray<u16>`    |
-| `Dictionary(UInt32,_)`| `DictionaryArray<u32>`    |
-| `Dictionary(UInt64,_)`| `DictionaryArray<u64>`    |
-| `Dictionary(Int8,_)`  | `DictionaryArray<i8>`     |
-| `Dictionary(Int16,_)` | `DictionaryArray<i16>`    |
-| `Dictionary(Int32,_)` | `DictionaryArray<i32>`    |
-| `Dictionary(Int64,_)` | `DictionaryArray<i64>`    |
-
-In this context, a common pattern to write operators that receive `&dyn Array` is:
+There is a one to one relationship between each variant of `PhysicalType` (an enum) and
+an each implementation of `Array` (a struct):
+
+| `PhysicalType`    | `Array`                |
+|-------------------|------------------------|
+| `Primitive(_)`    | `PrimitiveArray<_>`    |
+| `Binary`          | `BinaryArray<i32>`     |
+| `LargeBinary`     | `BinaryArray<i64>`     |
+| `Utf8`            | `Utf8Array<i32>`       |
+| `LargeUtf8`       | `Utf8Array<i64>`       |
+| `List`            | `ListArray<i32>`       |
+| `LargeList`       | `ListArray<i64>`       |
+| `FixedSizeBinary` | `FixedSizeBinaryArray` |
+| `FixedSizeList`   | `FixedSizeListArray`   |
+| `Struct`          | `StructArray`          |
+| `Union`           | `UnionArray`           |
+| `Dictionary(_)`   | `DictionaryArray<_>`   |
+
+where `_` represents each of the variants (e.g. `PrimitiveType::Int32 <-> i32`).
+
+In this context, a common idiom in using `Array` as a trait object is as follows:
 
 ```rust
-use arrow2::datatypes::DataType;
+use arrow2::datatypes::{PhysicalType, PrimitiveType};
 use arrow2::array::{Array, PrimitiveArray};
 
 fn float_operator(array: &dyn Array) -> Result<Box<dyn Array>, String> {
-    match array.data_type() {
-        DataType::Float32 => {
+    match array.data_type().to_physical_type() {
+        PhysicalType::Primitive(PrimitiveType::Float32) => {
             let array = array.as_any().downcast_ref::<PrimitiveArray<f32>>().unwrap();
             // let array = f32-specific operator
             let array = array.clone();
             Ok(Box::new(array))
         }
-        DataType::Float64 => {
+        PhysicalType::Primitive(PrimitiveType::Float64) => {
             let array = array.as_any().downcast_ref::<PrimitiveArray<f64>>().unwrap();
             // let array = f64-specific operator
             let array = array.clone();
             Ok(Box::new(array))
         }
-        _ => Err("This operator is only valid for float point.".to_string()),
+        _ => Err("This operator is only valid for float point arrays".to_string()),
     }
 }
 ```
@@ -237,7 +221,8 @@ bitwise operations, it is often more performant to operate on chunks of bits ins
 ## Vectorized operations
 
 One of the main advantages of the arrow format and its memory layout is that
-it often enables SIMD. For example, an unary operation `op` on a `PrimitiveArray` is likely auto-vectorized on the following code:
+it often enables SIMD. For example, an unary operation `op` on a `PrimitiveArray`
+likely emits SIMD instructions on the following code:
 
 ```rust
 # use arrow2::buffer::Buffer;

diff --git a/guide/src/metadata.md b/guide/src/metadata.md
@@ -12,16 +12,18 @@ semantical types defined in Arrow.
 In Arrow2, logical types are declared as variants of the `enum` `arrow2::datatypes::DataType`.
 For example, `DataType::Int32` represents a signed integer of 32 bits.
 
-Each logical type has an associated in-memory physical representation and is associated to specific
-semantics. For example, `Date32` has the same in-memory representation as `Int32`, but the value
-represents the number of days since UNIX epoch.
+Each `DataType` has an associated `enum PhysicalType` (many-to-one) representing the
+particular in-memory representation, and is associated to specific semantics.
+For example, both `DataType::Date32` and `DataType::Int32` have the same `PhysicalType`
+(`PhysicalType::Primitive(PrimitiveType::Int32)`) but `Date32` represents the number of
+days since UNIX epoch.
 
-Logical types are metadata: they annotate arrays with extra information about in-memory data.
+Logical types are metadata: they annotate physical types with extra information about data.
 
 ## `Field` (column metadata)
 
-Besides logical types, the arrow format supports other relevant metadata to the format. All this 
-information is stored in `arrow2::datatypes::Field`.
+Besides logical types, the arrow format supports other relevant metadata to the format.
+All this information is stored in `arrow2::datatypes::Field`.
 
 A `Field` is arrow's metadata associated to a column in the context of a columnar format. 
 It has a name, a logical type `DataType`, whether the column is nullable, etc.

diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs
@@ -44,6 +44,11 @@ unsafe impl<O: Offset, A: ffi::ArrowArrayRef> FromFfi<A> for BinaryArray<O> {
             validity = validity.map(|x| x.slice(offset, length))
         }
 
-        Ok(Self::from_data(offsets, values, validity))
+        Ok(Self::from_data(
+            Self::default_data_type(),
+            offsets,
+            values,
+            validity,
+        ))
     }
 }
diff --git a/src/array/binary/from.rs b/src/array/binary/from.rs
@@ -31,7 +31,7 @@ impl<O: Offset> BinaryArray<O> {
         // soundness: I is `TrustedLen`
         let (validity, offsets, values) = unsafe { trusted_len_unzip(iterator) };
 
-        Self::from_data(offsets, values, validity)
+        Self::from_data(Self::default_data_type(), offsets, values, validity)
     }
 }
 

diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs
@@ -25,14 +25,15 @@ pub struct BinaryArray<O: Offset> {
 // constructors
 impl<O: Offset> BinaryArray<O> {
     /// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero.
-    pub fn new_empty() -> Self {
-        Self::from_data(Buffer::from(&[O::zero()]), Buffer::new(), None)
+    pub fn new_empty(data_type: DataType) -> Self {
+        Self::from_data(data_type, Buffer::from(&[O::zero()]), Buffer::new(), None)
     }
 
     /// Creates an null [`BinaryArray`], i.e. whose `.null_count() == .len()`.
     #[inline]
-    pub fn new_null(length: usize) -> Self {
+    pub fn new_null(data_type: DataType, length: usize) -> Self {
         Self::from_data(
+            data_type,
             Buffer::new_zeroed(length + 1),
             Buffer::new(),
             Some(Bitmap::new_zeroed(length)),
@@ -43,26 +44,40 @@ impl<O: Offset> BinaryArray<O> {
     /// # Panics
     /// * The length of the offset buffer must be larger than 1
     /// * The length of the values must be equal to the last offset value
-    pub fn from_data(offsets: Buffer<O>, values: Buffer<u8>, validity: Option<Bitmap>) -> Self {
+    pub fn from_data(
+        data_type: DataType,
+        offsets: Buffer<O>,
+        values: Buffer<u8>,
+        validity: Option<Bitmap>,
+    ) -> Self {
         check_offsets(&offsets, values.len());
 
         if let Some(validity) = &validity {
             assert_eq!(offsets.len() - 1, validity.len());
         }
 
+        if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
+            panic!("BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary")
+        }
+
         Self {
-            data_type: if O::is_large() {
-                DataType::LargeBinary
-            } else {
-                DataType::Binary
-            },
+            data_type,
             offsets,
             values,
             validity,
             offset: 0,
         }
     }
 
+    /// Returns the default [`DataType`], `DataType::Binary` or `DataType::LargeBinary`
+    pub fn default_data_type() -> DataType {
+        if O::is_large() {
+            DataType::LargeBinary
+        } else {
+            DataType::Binary
+        }
+    }
+
     /// Creates a new [`BinaryArray`] by slicing this [`BinaryArray`].
     /// # Implementation
     /// This function is `O(1)`: all data will be shared between both arrays.