diff --git a/Changes.md b/Changes.md index 410809d3..ea403fc3 100644 --- a/Changes.md +++ b/Changes.md @@ -1,5 +1,44 @@ # Change log -# 0.5.0 +## 0.6.0 + +### Removed support for arrow in favor of arrow2 + +Drop support for [arrow][] in favor of [arrow2][]. Arrow2 is a smaller, faster to +build implementation of the Arrow format that follows semver. It is also used by +[polars][]. That said, most of the implementation is pretty generic and [arrow][] +support could be added. To convert arrow2 arrays into arrow arrays and record +batches see the [arrow2-to-arrow][] example. + +### More flexible support for Rust / Arrow features + +`serde_arrow` now supports many more Rust and Arrow features. + +- Rust: Struct, Lists, Maps, Enums, Tuples +- Arrow: Struct, List, Maps, Unions, ... + +### Removal of custom schema APIs + +`serde_arrow` no longer relies on its own schema object. Now all schema +information is retrieved from arrow fields with additional metadata. + +### More flexible APIs + +In addition to the previous API that worked on a sequence of records, +`serde_arrow` now also supports operating on a sequence of individual items +(`serialize_into_array`, `deserialize_from_array`) and operating on single +items (`ArraysBuilder`). + +### Support for dictionary encoded strings (categories) + +`serde_arrow` supports dictionary encoding for string arrays. This way string +arrays are encoded via a lookup table to avoid including repeated string values.
+ +## 0.5.0 - Bump arrow to version 16.0.0 + +[arrow]: https://github.com/apache/arrow-rs +[arrow2]: https://github.com/jorgecarleitao/arrow2 +[polars]: https://github.com/pola-rs/polars +[arrow2-to-arrow]: ./arrow2-to-arrow diff --git a/Readme.md b/Readme.md index 23c7a8ce..4023f47e 100644 --- a/Readme.md +++ b/Readme.md @@ -4,6 +4,7 @@ | [[API docs]](https://docs.rs/serde_arrow/latest/serde_arrow/) | [Changes](Changes.md) | [Example](#example) +| [Performance](#performance) | [How does it work?](serde_arrow/Implementation.md) | [Status](serde_arrow/Implementation.md#status) | [Development](#development) @@ -19,10 +20,7 @@ cumbersome to use directly. This package, `serde_arrow`, tries to bridge this gap by offering a simple way to convert Rust objects into Arrow objects and vice versa. `serde_arrow` relies on the [Serde](https://serde.rs) package to interpret Rust objects. Therefore, adding support for `serde_arrow` to custom -types is as easy as using Serde's derive macros. - -See the [implementation notes](serde_arrow/Implementation.md) for details on how -it is implemented. This package is optimized for ease of use, not performance. +types is as easy as using Serde's derive macros. [arrow2]: https://docs.rs/arrow2/latest/arrow2/ [polars]: https://github.com/pola-rs/polars @@ -77,7 +75,18 @@ import pandas as pd pd.read_parquet("example.pq") ``` -# Development +## Performance + +See the [implementation notes](serde_arrow/Implementation.md) for details on how +it is implemented. + +This package is optimized for ease of use, not performance. Depending on the +complexity of the types, a performance penalty of 4x - 7x compared to manually +building the arrays can be expected. More complex types incur a smaller +performance penalty. See the [benches](serde_arrow/benches/arrow2.rs) for +details. + +## Development All common tasks are bundled in the `x.py` script: @@ -89,7 +98,7 @@ python x.py precommit Run `python x.py --help` for details.
The script only uses standard Python modules can can be run without installing further packages. -# License +## License ```text Copyright (c) 2021 - 2023 Christopher Prohm diff --git a/serde_arrow/Implementation.md b/serde_arrow/Implementation.md index a12c5147..dc0b22ac 100644 --- a/serde_arrow/Implementation.md +++ b/serde_arrow/Implementation.md @@ -147,10 +147,7 @@ let val_field = fields.iter_mut().find(|field| field.name == "date").unwrap(); val_field.data_type = DataType::Date64; // only required if the datetime objects are serialized as strings -val_field.metadata.insert( - STRATEGY_KEY.to_string(), - Strategy::NaiveStrAsDate64.to_string(), -); +val_field.metadata = Strategy::NaiveStrAsDate64.into(); ``` Currently only datetime objects are supported. diff --git a/serde_arrow/Quickstart.md b/serde_arrow/Quickstart.md new file mode 100644 index 00000000..f3fe9482 --- /dev/null +++ b/serde_arrow/Quickstart.md @@ -0,0 +1,85 @@ +# Quickstart guide + +**Contents** + +1. [Working with date time objects](#working-with-date-time-objects) +2. [Dictionary encoding for strings](#dictionary-encoding-for-strings) +3. [Convert from arrow2 to arrow arrays](#convert-from-arrow2-to-arrow-arrays) + +## Working with date time objects + +When using `chrono`'s `DateTime` or `NaiveDateTime`, the values are by +default encoded as strings. To store them as `Date64` columns, the data type +has to be modified. + +For example + +```rust +#[derive(Debug, PartialEq, Serialize, Deserialize)] +struct Record { + val: NaiveDateTime, +} + +let records: &[Record] = &[ + Record { + val: NaiveDateTime::from_timestamp(12 * 60 * 60 * 24, 0), + }, + Record { + val: NaiveDateTime::from_timestamp(9 * 60 * 60 * 24, 0), + }, +]; + +let mut fields = serialize_into_fields(records, Default::default()).unwrap(); +``` + +The traced field `val` will be of type `Utf8`.
To store it as a `Date64` field, +modify the data type as in + +```rust +let val_field = find_field_mut(&mut fields, "val").unwrap(); +val_field.data_type = DataType::Date64; +val_field.metadata = Strategy::NaiveStrAsDate64.into(); +``` + +## Dictionary encoding for strings + +To encode strings with repeated values via a dictionary, the data type of the +corresponding field must be changed from `Utf8` or `LargeUtf8` to `Dictionary`. + +For an existing field this can be done via: + +```rust +field.data_type = DataType::Dictionary( + // the integer type used for the keys + IntegerType::UInt32, + // the data type of the values + Box::new(DataType::Utf8), + // serde_arrow does not support generating sorted dictionaries + false, +); +``` + +To dictionary encode all string fields, set the `string_dictionary_encoding` of +`TracingOptions`, when tracing the fields: + +```rust +let fields = serialize_into_fields( + &items, + TracingOptions::default().string_dictionary_encoding(true), +)?; +``` + +## Convert from arrow2 to arrow arrays + +Both `arrow` and `arrow2` use the Arrow memory format. Thanks to this fact, it +is possible to convert arrays between both packages with minimal work using +their respective FFI interfaces: + +- [arrow2::ffi::export_field_to_c](https://docs.rs/arrow2/latest/arrow2/ffi/fn.export_field_to_c.html) +- [arrow2::ffi::export_array_to_c](https://docs.rs/arrow2/latest/arrow2/ffi/fn.export_array_to_c.html) +- [arrow::ffi::ArrowArray::new](https://docs.rs/arrow/latest/arrow/ffi/struct.ArrowArray.html#method.new) + +A fully worked example can be found in the [arrow2-to-arrow][] example of the +`serde_arrow` repository.
+ +[arrow2-to-arrow]: https://github.com/chmp/serde_arrow/tree/main/arrow2-to-arrow diff --git a/serde_arrow/src/arrow2/schema.rs b/serde_arrow/src/arrow2/schema.rs index 4fb7e3f8..d52571de 100644 --- a/serde_arrow/src/arrow2/schema.rs +++ b/serde_arrow/src/arrow2/schema.rs @@ -240,7 +240,7 @@ impl FieldBuilder for MapTracer { /// # use serde::Serialize; /// # /// use serde_arrow::{ -/// arrow2::{serialize_into_fields, experimental::find_field_mut}, +/// arrow2::{serialize_into_fields, experimental}, /// schema::{Strategy, TracingOptions}, /// }; /// @@ -260,7 +260,7 @@ impl FieldBuilder for MapTracer { /// TracingOptions::default(), /// ).unwrap(); /// -/// let dt_field = find_field_mut(&mut fields, "b.dt").unwrap(); +/// let dt_field = experimental::find_field_mut(&mut fields, "b.dt").unwrap(); /// dt_field.data_type = DataType::Date64; /// dt_field.metadata = Strategy::NaiveStrAsDate64.into(); /// ``` diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index 2ce38739..5257c6c5 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -34,8 +34,9 @@ //! # fn main() { } //! ``` //! -//! See [implementation] for an explanation of how this package works and its -//! underlying data model. +//! See the [quickstart guide][docs::quickstart] for more examples of how to use +//! this package. See the [implementation notes][docs::implementation] for an +//! explanation of how this package works and its underlying data model. //! //! # Features: //! @@ -45,7 +46,7 @@ //! # Status //! //! For an overview over the supported Arrow and Rust types see status section -//! in the [implementation notes][implementation] +//! in the [implementation notes][docs::implementation] //! 
mod internal; @@ -82,10 +83,7 @@ pub mod base { /// # use serde_arrow::schema::{STRATEGY_KEY, Strategy}; /// # let mut field = Field::new("my_field", DataType::Null, false); /// field.data_type = DataType::Date64; -/// field.metadata.insert( -/// STRATEGY_KEY.to_string(), -/// Strategy::UtcStrAsDate64.to_string(), -/// ); +/// field.metadata = Strategy::UtcStrAsDate64.into(); /// # } /// # #[cfg(not(feature="arrow2"))] /// # fn main() {} @@ -100,7 +98,13 @@ pub mod schema { pub use crate::internal::schema::{Strategy, TracingOptions, STRATEGY_KEY}; } -#[doc = include_str!("../Implementation.md")] -// NOTE: hide the implementation documentation from doctests -#[cfg(not(doctest))] -pub mod implementation {} +/// Documentation only modules +pub mod docs { + #[doc = include_str!("../Implementation.md")] + #[cfg(not(doctest))] + pub mod implementation {} + + #[doc = include_str!("../Quickstart.md")] + #[cfg(not(doctest))] + pub mod quickstart {} +} diff --git a/serde_arrow/src/test/arrow2/round_trip.rs b/serde_arrow/src/test/arrow2/round_trip.rs index fd546582..372259d9 100644 --- a/serde_arrow/src/test/arrow2/round_trip.rs +++ b/serde_arrow/src/test/arrow2/round_trip.rs @@ -12,10 +12,7 @@ use serde::{Deserialize, Serialize}; use crate::{ arrow2::{deserialize_from_arrays, serialize_into_arrays, serialize_into_fields}, - internal::{ - event::Event, - schema::{Strategy, STRATEGY_KEY}, - }, + internal::{event::Event, schema::Strategy}, test::arrow2::utils::{collect_events_from_array, field}, }; @@ -40,10 +37,7 @@ fn dtype_date64_naive_str() { let val_field = fields.iter_mut().find(|field| field.name == "val").unwrap(); val_field.data_type = DataType::Date64; - val_field.metadata.insert( - STRATEGY_KEY.to_string(), - Strategy::NaiveStrAsDate64.to_string(), - ); + val_field.metadata = Strategy::NaiveStrAsDate64.into(); println!("{fields:?}"); @@ -97,10 +91,7 @@ fn dtype_date64_str() { let mut fields = serialize_into_fields(records, Default::default()).unwrap(); let val_field 
= fields.iter_mut().find(|field| field.name == "val").unwrap(); val_field.data_type = DataType::Date64; - val_field.metadata.insert( - STRATEGY_KEY.to_string(), - Strategy::UtcStrAsDate64.to_string(), - ); + val_field.metadata = Strategy::UtcStrAsDate64.into(); let arrays = serialize_into_arrays(&fields, records).unwrap(); diff --git a/serde_arrow/src/test/arrow2/schema_events.rs b/serde_arrow/src/test/arrow2/schema_events.rs index 0748ac63..8b330dc7 100644 --- a/serde_arrow/src/test/arrow2/schema_events.rs +++ b/serde_arrow/src/test/arrow2/schema_events.rs @@ -466,10 +466,7 @@ fn struct_tuple() { ]), false, ); - expected.metadata.insert( - STRATEGY_KEY.to_string(), - Strategy::TupleAsStruct.to_string(), - ); + expected.metadata = Strategy::TupleAsStruct.into(); assert_eq!(field, expected); }