Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preserve DataType metadata in make_builder #3438

Merged
merged 2 commits into from
Jan 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 52 additions & 2 deletions arrow-array/src/builder/primitive_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use crate::types::*;
use crate::{ArrayRef, ArrowPrimitiveType, PrimitiveArray};
use arrow_buffer::{Buffer, MutableBuffer};
use arrow_data::ArrayData;
use arrow_schema::DataType;
use std::any::Any;
use std::sync::Arc;

Expand Down Expand Up @@ -94,6 +95,7 @@ pub type Decimal256Builder = PrimitiveBuilder<Decimal256Type>;
pub struct PrimitiveBuilder<T: ArrowPrimitiveType> {
/// Buffer of appended values, stored as `T::Native`; drained into the
/// array's value buffer by `finish`.
values_builder: BufferBuilder<T::Native>,
/// Builder for the null bitmap; its result is passed to
/// `ArrayData`'s `null_bit_buffer` when the array is finished.
null_buffer_builder: NullBufferBuilder,
/// Data type handed to `ArrayData::builder` when producing an array.
/// The constructors set it to `T::DATA_TYPE`; `with_data_type` replaces it
/// (e.g. to carry a timestamp timezone or decimal precision/scale).
data_type: DataType,
}

impl<T: ArrowPrimitiveType> ArrayBuilder for PrimitiveBuilder<T> {
Expand Down Expand Up @@ -150,6 +152,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
Self {
values_builder: BufferBuilder::<T::Native>::new(capacity),
null_buffer_builder: NullBufferBuilder::new(capacity),
data_type: T::DATA_TYPE,
}
}

Expand All @@ -169,9 +172,29 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
Self {
values_builder,
null_buffer_builder,
data_type: T::DATA_TYPE,
}
}

/// Overrides the [`DataType`] attached to arrays produced by this builder.
///
/// A freshly constructed [`PrimitiveBuilder`] reports
/// [`ArrowPrimitiveType::DATA_TYPE`]. Call this method to supply a more
/// specific type instead, for example a timezone for
/// [`DataType::Timestamp`] or a precision and scale for
/// [`DataType::Decimal128`].
///
/// # Panics
///
/// Panics if `data_type` is rejected by [`PrimitiveArray::is_compatible`]
/// for `T`.
pub fn with_data_type(mut self, data_type: DataType) -> Self {
    let compatible = PrimitiveArray::<T>::is_compatible(&data_type);
    assert!(
        compatible,
        "incompatible data type for builder, expected {} got {}",
        T::DATA_TYPE,
        data_type
    );
    self.data_type = data_type;
    self
}

/// Returns the capacity of this builder measured in slots of type `T`
pub fn capacity(&self) -> usize {
self.values_builder.capacity()
Expand Down Expand Up @@ -250,7 +273,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
pub fn finish(&mut self) -> PrimitiveArray<T> {
let len = self.len();
let null_bit_buffer = self.null_buffer_builder.finish();
let builder = ArrayData::builder(T::DATA_TYPE)
let builder = ArrayData::builder(self.data_type.clone())
.len(len)
.add_buffer(self.values_builder.finish())
.null_bit_buffer(null_bit_buffer);
Expand All @@ -267,7 +290,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
.as_slice()
.map(Buffer::from_slice_ref);
let values_buffer = Buffer::from_slice_ref(self.values_builder.as_slice());
let builder = ArrayData::builder(T::DATA_TYPE)
let builder = ArrayData::builder(self.data_type.clone())
.len(len)
.add_buffer(values_buffer)
.null_bit_buffer(null_bit_buffer);
Expand Down Expand Up @@ -309,6 +332,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
mod tests {
use super::*;
use arrow_buffer::Buffer;
use arrow_schema::TimeUnit;

use crate::array::Array;
use crate::array::BooleanArray;
Expand Down Expand Up @@ -528,4 +552,30 @@ mod tests {
assert_eq!(5, arr.len());
assert_eq!(0, builder.len());
}

#[test]
fn test_primitive_array_builder_with_data_type() {
    // Decimal: the precision and scale supplied through `with_data_type`
    // must be reported by the finished array.
    let decimal_type = DataType::Decimal128(1, 2);
    let mut decimal_builder = Decimal128Builder::new().with_data_type(decimal_type);
    decimal_builder.append_value(1);
    let decimals = decimal_builder.finish();
    assert_eq!(decimals.precision(), 1);
    assert_eq!(decimals.scale(), 2);

    // Timestamp: the finished array must carry the full data type,
    // including the timezone.
    let timezone = Some("+00:00".to_string());
    let timestamp_type = DataType::Timestamp(TimeUnit::Nanosecond, timezone);
    let mut timestamp_builder =
        TimestampNanosecondBuilder::new().with_data_type(timestamp_type.clone());
    timestamp_builder.append_value(1);
    let timestamps = timestamp_builder.finish();
    assert_eq!(timestamps.data_type(), &timestamp_type);
}

#[test]
#[should_panic(
    expected = "incompatible data type for builder, expected Int32 got Int64"
)]
fn test_invalid_with_data_type() {
    // `Int64` is not compatible with an `Int32` builder, so the assertion in
    // `with_data_type` is expected to fire before anything is returned.
    let builder = Int32Builder::new();
    let _ = builder.with_data_type(DataType::Int64);
}
}
62 changes: 47 additions & 15 deletions arrow-array/src/builder/struct_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,10 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box<dyn ArrayBuilde
DataType::FixedSizeBinary(len) => {
Box::new(FixedSizeBinaryBuilder::with_capacity(capacity, *len))
}
DataType::Decimal128(_precision, _scale) => {
Box::new(Decimal128Builder::with_capacity(capacity))
}
DataType::Decimal128(p, s) => Box::new(
Decimal128Builder::with_capacity(capacity)
.with_data_type(DataType::Decimal128(*p, *s)),
),
Comment on lines +118 to +121
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

DataType::Utf8 => Box::new(StringBuilder::with_capacity(capacity, 1024)),
DataType::Date32 => Box::new(Date32Builder::with_capacity(capacity)),
DataType::Date64 => Box::new(Date64Builder::with_capacity(capacity)),
Expand All @@ -133,18 +134,22 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box<dyn ArrayBuilde
DataType::Time64(TimeUnit::Nanosecond) => {
Box::new(Time64NanosecondBuilder::with_capacity(capacity))
}
DataType::Timestamp(TimeUnit::Second, _) => {
Box::new(TimestampSecondBuilder::with_capacity(capacity))
}
DataType::Timestamp(TimeUnit::Millisecond, _) => {
Box::new(TimestampMillisecondBuilder::with_capacity(capacity))
}
DataType::Timestamp(TimeUnit::Microsecond, _) => {
Box::new(TimestampMicrosecondBuilder::with_capacity(capacity))
}
DataType::Timestamp(TimeUnit::Nanosecond, _) => {
Box::new(TimestampNanosecondBuilder::with_capacity(capacity))
}
DataType::Timestamp(TimeUnit::Second, tz) => Box::new(
TimestampSecondBuilder::with_capacity(capacity)
.with_data_type(DataType::Timestamp(TimeUnit::Second, tz.clone())),
),
DataType::Timestamp(TimeUnit::Millisecond, tz) => Box::new(
TimestampMillisecondBuilder::with_capacity(capacity)
.with_data_type(DataType::Timestamp(TimeUnit::Millisecond, tz.clone())),
),
DataType::Timestamp(TimeUnit::Microsecond, tz) => Box::new(
TimestampMicrosecondBuilder::with_capacity(capacity)
.with_data_type(DataType::Timestamp(TimeUnit::Microsecond, tz.clone())),
),
DataType::Timestamp(TimeUnit::Nanosecond, tz) => Box::new(
TimestampNanosecondBuilder::with_capacity(capacity)
.with_data_type(DataType::Timestamp(TimeUnit::Nanosecond, tz.clone())),
),
DataType::Interval(IntervalUnit::YearMonth) => {
Box::new(IntervalYearMonthBuilder::with_capacity(capacity))
}
Expand Down Expand Up @@ -484,6 +489,33 @@ mod tests {
assert!(builder.field_builder::<StructBuilder>(2).is_some());
}

#[test]
fn test_datatype_properties() {
    // Two fields whose data types carry extra parameters (decimal
    // precision/scale and a timestamp timezone).
    let decimal_field = Field::new("f1", DataType::Decimal128(1, 2), false);
    let timestamp_field = Field::new(
        "f2",
        DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".to_string())),
        false,
    );
    let fields = vec![decimal_field, timestamp_field];

    let mut builder = StructBuilder::from_fields(fields.clone(), 1);

    let decimal_builder = builder.field_builder::<Decimal128Builder>(0).unwrap();
    decimal_builder.append_value(1);

    let timestamp_builder = builder
        .field_builder::<TimestampMillisecondBuilder>(1)
        .unwrap();
    timestamp_builder.append_value(1);

    builder.append(true);
    let array = builder.finish();

    // The struct type and every child column must report the exact data
    // types that were requested.
    assert_eq!(array.data_type(), &DataType::Struct(fields.clone()));
    for (index, field) in fields.iter().enumerate() {
        assert_eq!(array.column(index).data_type(), field.data_type());
    }
}

#[test]
#[should_panic(
expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) is not currently supported"
Expand Down