From d39cf283e9a68b221a24b4132c27f34100439086 Mon Sep 17 00:00:00 2001 From: Yijun Zhao Date: Fri, 15 Mar 2024 01:05:04 +0800 Subject: [PATCH 01/11] feat: initial support string_view and binary_view, supports layout and basic construction + tests (#5481) * support string_view and binary_view * fix reviewer comments --- arrow-array/src/array/byte_array.rs | 6 +- arrow-array/src/array/byte_view_array.rs | 480 ++++++++++++++++++ arrow-array/src/array/mod.rs | 7 + .../src/builder/generic_bytes_view_builder.rs | 215 ++++++++ arrow-array/src/builder/mod.rs | 3 + arrow-array/src/record_batch.rs | 28 +- arrow-array/src/types.rs | 68 +++ arrow-buffer/src/native.rs | 1 + arrow-data/src/byte_view.rs | 123 +++++ arrow-data/src/data.rs | 85 ++-- arrow-data/src/equal/byte_view.rs | 74 +++ arrow-data/src/equal/mod.rs | 4 +- arrow-data/src/lib.rs | 3 + arrow-data/src/transform/mod.rs | 172 ++++--- arrow/tests/array_equal.rs | 48 +- arrow/tests/array_transform.rs | 39 ++ 16 files changed, 1244 insertions(+), 112 deletions(-) create mode 100644 arrow-array/src/array/byte_view_array.rs create mode 100644 arrow-array/src/builder/generic_bytes_view_builder.rs create mode 100644 arrow-data/src/byte_view.rs create mode 100644 arrow-data/src/equal/byte_view.rs diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index db825bbea97d..a57abc5b1e71 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -94,7 +94,7 @@ pub struct GenericByteArray { impl Clone for GenericByteArray { fn clone(&self) -> Self { Self { - data_type: self.data_type.clone(), + data_type: T::DATA_TYPE, value_offsets: self.value_offsets.clone(), value_data: self.value_data.clone(), nulls: self.nulls.clone(), @@ -323,7 +323,7 @@ impl GenericByteArray { /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { Self { - data_type: self.data_type.clone(), + data_type: T::DATA_TYPE, value_offsets: self.value_offsets.slice(offset, length), value_data: self.value_data.clone(), nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), @@ -511,7 +511,7 @@ impl From for GenericByteArray { Self { value_offsets, value_data, - data_type: data.data_type().clone(), + data_type: T::DATA_TYPE, nulls: data.nulls().cloned(), } } diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs new file mode 100644 index 000000000000..e22e9b1688bb --- /dev/null +++ b/arrow-array/src/array/byte_view_array.rs @@ -0,0 +1,480 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+use crate::array::print_long_array;
+use crate::builder::GenericByteViewBuilder;
+use crate::iterator::ArrayIter;
+use crate::types::bytes::ByteArrayNativeType;
+use crate::types::{BinaryViewType, ByteViewType, StringViewType};
+use crate::{Array, ArrayAccessor, ArrayRef};
+use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
+use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
+use arrow_schema::{ArrowError, DataType};
+use std::any::Any;
+use std::fmt::Debug;
+use std::marker::PhantomData;
+use std::sync::Arc;
+
+/// [Variable-size Binary View Layout]: An array of variable length byte view arrays.
+///
+/// Different than [`crate::GenericByteArray`] as it stores both an offset and length
+/// meaning that take / filter operations can be implemented without copying the underlying data.
+///
+/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
+pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
+    data_type: DataType,
+    views: ScalarBuffer<u128>,
+    buffers: Vec<Buffer>,
+    phantom: PhantomData<T>,
+    nulls: Option<NullBuffer>,
+}
+
+impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
+    fn clone(&self) -> Self {
+        Self {
+            data_type: T::DATA_TYPE,
+            views: self.views.clone(),
+            buffers: self.buffers.clone(),
+            nulls: self.nulls.clone(),
+            phantom: Default::default(),
+        }
+    }
+}
+
+impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
+    /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
+    ///
+    /// # Panics
+    ///
+    /// Panics if [`GenericByteViewArray::try_new`] returns an error
+    pub fn new(views: ScalarBuffer<u128>, buffers: Vec<Buffer>, nulls: Option<NullBuffer>) -> Self {
+        Self::try_new(views, buffers, nulls).unwrap()
+    }
+
+    /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
+    ///
+    /// # Errors
+    ///
+    /// * `views.len() != nulls.len()`
+    /// * [ByteViewType::validate] fails
+    pub fn try_new(
+        views: ScalarBuffer<u128>,
+        buffers: Vec<Buffer>,
+        nulls: Option<NullBuffer>,
+    ) -> Result<Self, ArrowError> {
+        T::validate(&views, &buffers)?;
+
+        if let Some(n) = nulls.as_ref() {
+            if n.len() != views.len() {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "Incorrect length of null buffer for {}ViewArray, expected {} got {}",
+                    T::PREFIX,
+                    views.len(),
+                    n.len(),
+                )));
+            }
+        }
+
+        Ok(Self {
+            data_type: T::DATA_TYPE,
+            views,
+            buffers,
+            nulls,
+            phantom: Default::default(),
+        })
+    }
+
+    /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
+    ///
+    /// # Safety
+    ///
+    /// Safe if [`Self::try_new`] would not error
+    pub unsafe fn new_unchecked(
+        views: ScalarBuffer<u128>,
+        buffers: Vec<Buffer>,
+        nulls: Option<NullBuffer>,
+    ) -> Self {
+        Self {
+            data_type: T::DATA_TYPE,
+            phantom: Default::default(),
+            views,
+            buffers,
+            nulls,
+        }
+    }
+
+    /// Create a new [`GenericByteViewArray`] of length `len` where all values are null
+    pub fn new_null(len: usize) -> Self {
+        Self {
+            data_type: T::DATA_TYPE,
+            views: vec![0; len].into(),
+            buffers: vec![],
+            nulls: Some(NullBuffer::new_null(len)),
+            phantom: Default::default(),
+        }
+    }
+
+    /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
+    pub fn from_iter_values<Ptr, I>(iter: I) -> Self
+    where
+        Ptr: AsRef<T::Native>,
+        I: IntoIterator<Item = Ptr>,
+    {
+        let iter = iter.into_iter();
+        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
+        for v in iter {
+            builder.append_value(v);
+        }
+        builder.finish()
+    }
+
+    /// Deconstruct this array into its constituent parts
+    pub fn into_parts(self) -> (ScalarBuffer<u128>, Vec<Buffer>, Option<NullBuffer>) {
+        (self.views, self.buffers, self.nulls)
+    }
+
+    /// Returns the views buffer
+    #[inline]
+    pub fn views(&self) -> &ScalarBuffer<u128> {
+        &self.views
+    }
+
+    /// Returns the buffers storing string data
+    #[inline]
+    pub fn data_buffers(&self) -> &[Buffer] {
+        &self.buffers
+    }
+
+    /// Returns the element at index `i`
+    /// # Panics
+    /// Panics if index `i` is out of bounds.
+    pub fn value(&self, i: usize) -> &T::Native {
+        assert!(
+            i < self.len(),
+            "Trying to access an element at index {} from a {}ViewArray of length {}",
+            i,
+            T::PREFIX,
+            self.len()
+        );
+
+        unsafe { self.value_unchecked(i) }
+    }
+
+    /// Returns the element at index `i`
+    /// # Safety
+    /// Caller is responsible for ensuring that the index is within the bounds of the array
+    pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
+        let v = self.views.get_unchecked(idx);
+        let len = *v as u32;
+        let b = if len <= 12 {
+            let ptr = self.views.as_ptr() as *const u8;
+            std::slice::from_raw_parts(ptr.add(idx * 16 + 4), len as usize)
+        } else {
+            let view = ByteView::from(*v);
+            let data = self.buffers.get_unchecked(view.buffer_index as usize);
+            let offset = view.offset as usize;
+            data.get_unchecked(offset..offset + len as usize)
+        };
+        T::Native::from_bytes_unchecked(b)
+    }
+
+    /// constructs a new iterator
+    pub fn iter(&self) -> ArrayIter<&Self> {
+        ArrayIter::new(self)
+    }
+
+    /// Returns a zero-copy slice of this array with the indicated offset and length.
+    pub fn slice(&self, offset: usize, length: usize) -> Self {
+        Self {
+            data_type: T::DATA_TYPE,
+            views: self.views.slice(offset, length),
+            buffers: self.buffers.clone(),
+            nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
+            phantom: Default::default(),
+        }
+    }
+}
+
+impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
+        print_long_array(self, f, |array, index, f| {
+            std::fmt::Debug::fmt(&array.value(index), f)
+        })?;
+        write!(f, "]")
+    }
+}
+
+impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn to_data(&self) -> ArrayData {
+        self.clone().into()
+    }
+
+    fn into_data(self) -> ArrayData {
+        self.into()
+    }
+
+    fn data_type(&self) -> &DataType {
+        &self.data_type
+    }
+
+    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
+        Arc::new(self.slice(offset, length))
+    }
+
+    fn len(&self) -> usize {
+        self.views.len()
+    }
+
+    fn is_empty(&self) -> bool {
+        self.views.is_empty()
+    }
+
+    fn offset(&self) -> usize {
+        0
+    }
+
+    fn nulls(&self) -> Option<&NullBuffer> {
+        self.nulls.as_ref()
+    }
+
+    fn get_buffer_memory_size(&self) -> usize {
+        let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
+        sum += self.views.inner().capacity();
+        if let Some(x) = &self.nulls {
+            sum += x.buffer().capacity()
+        }
+        sum
+    }
+
+    fn get_array_memory_size(&self) -> usize {
+        std::mem::size_of::<Self>() + self.get_buffer_memory_size()
+    }
+}
+
+impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
+    type Item = &'a T::Native;
+
+    fn value(&self, index: usize) -> Self::Item {
+        GenericByteViewArray::value(self, index)
+    }
+
+    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
+        GenericByteViewArray::value_unchecked(self, index)
+    }
+}
+
+impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
+    type Item = Option<&'a T::Native>;
+    type IntoIter = ArrayIter<Self>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        ArrayIter::new(self)
+    }
+}
+
+impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
+    fn from(value: ArrayData) -> Self {
+        let views = value.buffers()[0].clone();
+        let views = ScalarBuffer::new(views, value.offset(), value.len());
+        let buffers = value.buffers()[1..].to_vec();
+        Self {
+            data_type: T::DATA_TYPE,
+            views,
+            buffers,
+            nulls: value.nulls().cloned(),
+            phantom: Default::default(),
+        }
+    }
+}
+
+impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
+    fn from(mut array: GenericByteViewArray<T>) -> Self {
+        let len = array.len();
+        array.buffers.insert(0, array.views.into_inner());
+        let builder = ArrayDataBuilder::new(T::DATA_TYPE)
+            .len(len)
+            .buffers(array.buffers)
+            .nulls(array.nulls);
+
+        unsafe { builder.build_unchecked() }
+    }
+}
+
+impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
+where
+    Ptr: AsRef<T::Native>,
+{
+    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
+        let iter = iter.into_iter();
+        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
+        builder.extend(iter);
+        builder.finish()
+    }
+}
+
+/// A [`GenericByteViewArray`] of `[u8]`
+pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
+
+/// A [`GenericByteViewArray`] of `str`
+///
+/// ```
+/// use arrow_array::StringViewArray;
+/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
+/// assert_eq!(array.value(0), "hello");
+/// assert_eq!(array.value(3), "large payload over 12 bytes");
+/// ```
+pub type StringViewArray = GenericByteViewArray<StringViewType>;
+
+impl From<Vec<&str>> for StringViewArray {
+    fn from(v: Vec<&str>) -> Self {
+        Self::from_iter_values(v)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::builder::StringViewBuilder;
+    use crate::{Array, BinaryViewArray, StringViewArray};
+    use arrow_buffer::{Buffer, ScalarBuffer};
+    use arrow_data::ByteView;
+
+    #[test]
+    fn try_new() {
+        let array = StringViewArray::from_iter_values(vec![
+            "hello",
+            "world",
+            "lulu",
+            "large payload over 12 bytes",
+        ]);
+        assert_eq!(array.value(0), "hello");
+        assert_eq!(array.value(3), "large payload over 12 bytes");
+
+        let array = BinaryViewArray::from_iter_values(vec![
+            b"hello".as_slice(),
+            b"world".as_slice(),
+            b"lulu".as_slice(),
+            b"large payload over 12 bytes".as_slice(),
+        ]);
+        assert_eq!(array.value(0), b"hello");
+        assert_eq!(array.value(3), b"large payload over 12 bytes");
+
+        // test empty array
+        let array = {
+            let mut builder = StringViewBuilder::new();
+            builder.finish()
+        };
+        assert!(array.is_empty());
+
+        // test builder append
+        let array = {
+            let mut builder = StringViewBuilder::new();
+            builder.append_value("hello");
+            builder.append_null();
+            builder.append_option(Some("large payload over 12 bytes"));
+            builder.finish()
+        };
+        assert_eq!(array.value(0), "hello");
+        assert!(array.is_null(1));
+        assert_eq!(array.value(2), "large payload over 12 bytes");
+
+        // test builder's in_progress re-created
+        let array = {
+            // make a builder with small block size.
+ let mut builder = StringViewBuilder::new().with_block_size(14); + builder.append_value("large payload over 12 bytes"); + builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created")); + builder.finish() + }; + assert_eq!(array.value(0), "large payload over 12 bytes"); + assert_eq!(array.value(1), "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"); + assert_eq!(2, array.buffers.len()); + } + + #[test] + #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")] + fn new_with_invalid_view_data() { + let v = "large payload over 12 bytes"; + let view = ByteView { + length: 13, + prefix: u32::from_le_bytes(v.as_bytes()[0..4].try_into().unwrap()), + buffer_index: 3, + offset: 1, + }; + let views = ScalarBuffer::from(vec![view.into()]); + let buffers = vec![Buffer::from_slice_ref(v)]; + StringViewArray::new(views, buffers, None); + } + + #[test] + #[should_panic( + expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0" + )] + fn new_with_invalid_utf8_data() { + let v: Vec = vec![0xf0, 0x80, 0x80, 0x80]; + let view = ByteView { + length: v.len() as u32, + prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()), + buffer_index: 0, + offset: 0, + }; + let views = ScalarBuffer::from(vec![view.into()]); + let buffers = vec![Buffer::from_slice_ref(v)]; + StringViewArray::new(views, buffers, None); + } + + #[test] + #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")] + fn new_with_invalid_zero_padding() { + let mut data = [0; 12]; + data[0] = b'H'; + data[11] = 1; // no zero padding + + let mut view_buffer = [0; 16]; + view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes()); + view_buffer[4..].copy_from_slice(&data); + + let view = ByteView::from(u128::from_le_bytes(view_buffer)); + let views = ScalarBuffer::from(vec![view.into()]); + let buffers = vec![]; + StringViewArray::new(views, buffers, None); + } + + #[test] + #[should_panic(expected = "Mismatch between embedded prefix and data")] + fn test_mismatch_between_embedded_prefix_and_data() { + let input_str_1 = "Hello, Rustaceans!"; + let input_str_2 = "Hallo, Rustaceans!"; + let length = input_str_1.len() as u32; + assert!(input_str_1.len() > 12); + + let mut view_buffer = [0; 16]; + view_buffer[0..4].copy_from_slice(&length.to_le_bytes()); + view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]); + view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes()); + view_buffer[12..].copy_from_slice(&0u32.to_le_bytes()); + let view = ByteView::from(u128::from_le_bytes(view_buffer)); + let views = ScalarBuffer::from(vec![view.into()]); + let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())]; + + StringViewArray::new(views, buffers, None); + } +} diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 7aa3f92bfbd2..b115ff9c14cc 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -65,8 +65,13 @@ mod union_array; pub use union_array::*; mod run_array; + pub use run_array::*; +mod byte_view_array; + +pub use byte_view_array::*; + /// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html) pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the array as [`Any`] so that it can be @@ -596,8 +601,10 @@ pub fn make_array(data: ArrayData) -> ArrayRef { 
         DataType::Binary => Arc::new(BinaryArray::from(data)) as ArrayRef,
         DataType::LargeBinary => Arc::new(LargeBinaryArray::from(data)) as ArrayRef,
         DataType::FixedSizeBinary(_) => Arc::new(FixedSizeBinaryArray::from(data)) as ArrayRef,
+        DataType::BinaryView => Arc::new(BinaryViewArray::from(data)) as ArrayRef,
         DataType::Utf8 => Arc::new(StringArray::from(data)) as ArrayRef,
         DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data)) as ArrayRef,
+        DataType::Utf8View => Arc::new(StringViewArray::from(data)) as ArrayRef,
         DataType::List(_) => Arc::new(ListArray::from(data)) as ArrayRef,
         DataType::LargeList(_) => Arc::new(LargeListArray::from(data)) as ArrayRef,
         DataType::Struct(_) => Arc::new(StructArray::from(data)) as ArrayRef,
diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs
new file mode 100644
index 000000000000..29de7feb0ec1
--- /dev/null
+++ b/arrow-array/src/builder/generic_bytes_view_builder.rs
@@ -0,0 +1,215 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::builder::ArrayBuilder;
+use crate::types::{BinaryViewType, ByteViewType, StringViewType};
+use crate::{ArrayRef, GenericByteViewArray};
+use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer};
+use arrow_data::ByteView;
+use std::any::Any;
+use std::marker::PhantomData;
+use std::sync::Arc;
+
+const DEFAULT_BLOCK_SIZE: u32 = 8 * 1024;
+
+/// A builder for [`GenericByteViewArray`]
+///
+/// See [`Self::append_value`] for the allocation strategy
+pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
+    views_builder: BufferBuilder<u128>,
+    null_buffer_builder: NullBufferBuilder,
+    completed: Vec<Buffer>,
+    in_progress: Vec<u8>,
+    block_size: u32,
+    phantom: PhantomData<T>,
+}
+
+impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
+    /// Creates a new [`GenericByteViewBuilder`].
+    pub fn new() -> Self {
+        Self::with_capacity(1024)
+    }
+
+    /// Creates a new [`GenericByteViewBuilder`] with space for `capacity` string values.
+    pub fn with_capacity(capacity: usize) -> Self {
+        Self {
+            views_builder: BufferBuilder::new(capacity),
+            null_buffer_builder: NullBufferBuilder::new(capacity),
+            completed: vec![],
+            in_progress: vec![],
+            block_size: DEFAULT_BLOCK_SIZE,
+            phantom: Default::default(),
+        }
+    }
+
+    /// Override the size of buffers to allocate for holding string data
+    pub fn with_block_size(self, block_size: u32) -> Self {
+        Self { block_size, ..self }
+    }
+
+    /// Appends a value into the builder
+    ///
+    /// # Panics
+    ///
+    /// Panics if
+    /// - String buffer count exceeds `u32::MAX`
+    /// - String length exceeds `u32::MAX`
+    #[inline]
+    pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
+        let v: &[u8] = value.as_ref().as_ref();
+        let length: u32 = v.len().try_into().unwrap();
+        if length <= 12 {
+            let mut view_buffer = [0; 16];
+            view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
+            view_buffer[4..4 + v.len()].copy_from_slice(v);
+            self.views_builder.append(u128::from_le_bytes(view_buffer));
+            self.null_buffer_builder.append_non_null();
+            return;
+        }
+
+        let required_cap = self.in_progress.len() + v.len();
+        if self.in_progress.capacity() < required_cap {
+            let in_progress = Vec::with_capacity(v.len().max(self.block_size as usize));
+            let flushed = std::mem::replace(&mut self.in_progress, in_progress);
+            if !flushed.is_empty() {
+                assert!(self.completed.len() < u32::MAX as usize);
+                self.completed.push(flushed.into());
+            }
+        };
+        let offset = self.in_progress.len() as u32;
+        self.in_progress.extend_from_slice(v);
+
+        let view = ByteView {
+            length,
+            prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()),
+            buffer_index: self.completed.len() as u32,
+            offset,
+        };
+        self.views_builder.append(view.into());
+        self.null_buffer_builder.append_non_null();
+    }
+
+    /// Append an `Option` value into the builder
+    #[inline]
+    pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
+        match value {
+            None => self.append_null(),
+            Some(v) => self.append_value(v),
+        };
+    }
+
+    /// Append a null value into the builder
+    #[inline]
+    pub fn append_null(&mut self) {
+        self.null_buffer_builder.append_null();
+        self.views_builder.append(0);
+    }
+
+    /// Builds the [`GenericByteViewArray`] and reset this builder
+    pub fn finish(&mut self) -> GenericByteViewArray<T> {
+        let mut completed = std::mem::take(&mut self.completed);
+        if !self.in_progress.is_empty() {
+            completed.push(std::mem::take(&mut self.in_progress).into());
+        }
+        let len = self.views_builder.len();
+        let views = ScalarBuffer::new(self.views_builder.finish(), 0, len);
+        let nulls = self.null_buffer_builder.finish();
+        // SAFETY: valid by construction
+        unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) }
+    }
+
+    /// Builds the [`GenericByteViewArray`] without resetting the builder
+    pub fn finish_cloned(&self) -> GenericByteViewArray<T> {
+        let mut completed = self.completed.clone();
+        if !self.in_progress.is_empty() {
+            completed.push(Buffer::from_slice_ref(&self.in_progress));
+        }
+        let len = self.views_builder.len();
+        let views = Buffer::from_slice_ref(self.views_builder.as_slice());
+        let views = ScalarBuffer::new(views, 0, len);
+        let nulls = self.null_buffer_builder.finish_cloned();
+        // SAFETY: valid by construction
+        unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) }
+    }
+}
+
+impl<T: ByteViewType + ?Sized> Default for GenericByteViewBuilder<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<T: ByteViewType + ?Sized> std::fmt::Debug for GenericByteViewBuilder<T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}ViewBuilder",
T::PREFIX)?; + f.debug_struct("") + .field("views_builder", &self.views_builder) + .field("in_progress", &self.in_progress) + .field("completed", &self.completed) + .field("null_buffer_builder", &self.null_buffer_builder) + .finish() + } +} + +impl ArrayBuilder for GenericByteViewBuilder { + fn len(&self) -> usize { + self.null_buffer_builder.len() + } + + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn into_box_any(self: Box) -> Box { + self + } +} + +impl> Extend> + for GenericByteViewBuilder +{ + #[inline] + fn extend>>(&mut self, iter: I) { + for v in iter { + self.append_option(v) + } + } +} + +/// Array builder for [`StringViewArray`][crate::StringViewArray] +/// +/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with +/// [`GenericByteViewBuilder::append_null`] as normal. +pub type StringViewBuilder = GenericByteViewBuilder; + +/// Array builder for [`BinaryViewArray`][crate::BinaryViewArray] +/// +/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with +/// [`GenericByteViewBuilder::append_null`] as normal. +pub type BinaryViewBuilder = GenericByteViewBuilder; diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index d33e565a868b..e4ab7ae4ba23 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -178,7 +178,10 @@ mod generic_bytes_dictionary_builder; pub use generic_bytes_dictionary_builder::*; mod generic_byte_run_builder; pub use generic_byte_run_builder::*; +mod generic_bytes_view_builder; +pub use generic_bytes_view_builder::*; mod union_builder; + pub use union_builder::*; use crate::ArrayRef; diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 314445bba617..c56b1fd308cf 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -626,7 +626,9 @@ mod tests { use std::collections::HashMap; use super::*; - use crate::{BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray}; + use crate::{ + BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray, StringViewArray, + }; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::Fields; @@ -646,6 +648,30 @@ mod tests { check_batch(record_batch, 5) } + #[test] + fn create_string_view_record_batch() { + let schema = Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Utf8View, false), + ]); + + let a = Int32Array::from(vec![1, 2, 3, 4, 5]); + let b = StringViewArray::from(vec!["a", "b", "c", "d", "e"]); + + let record_batch = + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); + + assert_eq!(5, record_batch.num_rows()); + assert_eq!(2, record_batch.num_columns()); + assert_eq!(&DataType::Int32, record_batch.schema().field(0).data_type()); + assert_eq!( + &DataType::Utf8View, + record_batch.schema().field(1).data_type() + ); + assert_eq!(5, record_batch.column(0).len()); + assert_eq!(5, record_batch.column(1).len()); + } + #[test] fn byte_size_should_not_regress() { let schema = Schema::new(vec![ diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 83a229c1da0d..e33f7bde7cba 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -25,12 +25,14 @@ use crate::timezone::Tz; 
 use crate::{ArrowNativeTypeOp, OffsetSizeTrait};
 use arrow_buffer::{i256, Buffer, OffsetBuffer};
 use arrow_data::decimal::{validate_decimal256_precision, validate_decimal_precision};
+use arrow_data::{validate_binary_view, validate_string_view};
 use arrow_schema::{
     ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE,
     DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE,
 };
 use chrono::{Duration, NaiveDate, NaiveDateTime};
 use half::f16;
+use std::fmt::Debug;
 use std::marker::PhantomData;
 use std::ops::{Add, Sub};
@@ -1544,6 +1546,72 @@ pub type BinaryType = GenericBinaryType<i32>;
 /// An arrow binary array with i64 offsets
 pub type LargeBinaryType = GenericBinaryType<i64>;
 
+mod byte_view {
+    use crate::types::{BinaryViewType, StringViewType};
+
+    pub trait Sealed: Send + Sync {}
+    impl Sealed for StringViewType {}
+    impl Sealed for BinaryViewType {}
+}
+
+/// A trait over the variable length byte view array types
+pub trait ByteViewType: byte_view::Sealed + 'static + PartialEq + Send + Sync {
+    /// True if the elements of the array are utf8 encoded strings.
+    const IS_UTF8: bool;
+
+    /// Datatype of array elements
+    const DATA_TYPE: DataType = if Self::IS_UTF8 {
+        DataType::Utf8View
+    } else {
+        DataType::BinaryView
+    };
+
+    /// "Binary" or "String", for use in display and error messages
+    const PREFIX: &'static str;
+
+    /// Type for representing its equivalent rust type, i.e.
+    /// Utf8Array will have native type as &str,
+    /// BinaryArray will have native type as [u8]
+    type Native: bytes::ByteArrayNativeType + AsRef<Self::Native> + AsRef<[u8]> + ?Sized;
+
+    /// The owned type corresponding to `Native`
+    type Owned: Debug + Clone + Sync + Send + AsRef<Self::Native>;
+
+    /// Verifies that the provided buffers are valid for this array type
+    fn validate(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError>;
+}
+
+/// [`ByteViewType`] for string arrays
+#[derive(PartialEq)]
+pub struct StringViewType {}
+
+impl ByteViewType for StringViewType {
+    const IS_UTF8: bool = true;
+    const PREFIX: &'static str = "String";
+
+    type Native = str;
+    type Owned = String;
+
+    fn validate(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> {
+        validate_string_view(views, buffers)
+    }
+}
+
+/// [`ByteViewType`] for binary arrays
+#[derive(PartialEq)]
+pub struct BinaryViewType {}
+
+impl ByteViewType for BinaryViewType {
+    const IS_UTF8: bool = false;
+    const PREFIX: &'static str = "Binary";
+    type Native = [u8];
+    type Owned = Vec<u8>;
+
+    fn validate(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> {
+        validate_binary_view(views, buffers)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs
index 38074a8dc26c..5184d60ac1fd 100644
--- a/arrow-buffer/src/native.rs
+++ b/arrow-buffer/src/native.rs
@@ -149,6 +149,7 @@ native_integer!(u8);
 native_integer!(u16);
 native_integer!(u32);
 native_integer!(u64);
+native_integer!(u128);
 
 macro_rules! native_float {
     ($t:ty, $s:ident, $as_usize: expr, $i:ident, $usize_as: expr) => {
diff --git a/arrow-data/src/byte_view.rs b/arrow-data/src/byte_view.rs
new file mode 100644
index 000000000000..b8b1731ac60b
--- /dev/null
+++ b/arrow-data/src/byte_view.rs
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_buffer::Buffer;
+use arrow_schema::ArrowError;
+
+#[derive(Debug, Copy, Clone, Default)]
+#[repr(C)]
+pub struct ByteView {
+    /// The length of the string/bytes.
+    pub length: u32,
+    /// First 4 bytes of string/bytes data.
+    pub prefix: u32,
+    /// The buffer index.
+    pub buffer_index: u32,
+    /// The offset into the buffer.
+    pub offset: u32,
+}
+
+impl ByteView {
+    #[inline(always)]
+    pub fn as_u128(self) -> u128 {
+        (self.length as u128)
+            | ((self.prefix as u128) << 32)
+            | ((self.buffer_index as u128) << 64)
+            | ((self.offset as u128) << 96)
+    }
+}
+
+impl From<u128> for ByteView {
+    #[inline]
+    fn from(value: u128) -> Self {
+        Self {
+            length: value as u32,
+            prefix: (value >> 32) as u32,
+            buffer_index: (value >> 64) as u32,
+            offset: (value >> 96) as u32,
+        }
+    }
+}
+
+impl From<ByteView> for u128 {
+    #[inline]
+    fn from(value: ByteView) -> Self {
+        value.as_u128()
+    }
+}
+
+/// Validates the combination of `views` and `buffers` is a valid BinaryView
+pub fn validate_binary_view(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> {
+    validate_view_impl(views, buffers, |_, _| Ok(()))
+}
+
+/// Validates the combination of `views` and `buffers` is a valid StringView
+pub fn validate_string_view(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> {
+    validate_view_impl(views, buffers, |idx, b| {
+        std::str::from_utf8(b).map_err(|e| {
+            ArrowError::InvalidArgumentError(format!(
+                "Encountered non-UTF-8 data at index {idx}: {e}"
+            ))
+        })?;
+        Ok(())
+    })
+}
+
+fn validate_view_impl<F>(views: &[u128], buffers: &[Buffer], f: F) -> Result<(), ArrowError>
+where
+    F: Fn(usize, &[u8]) -> Result<(), ArrowError>,
+{
+    for (idx, v) in views.iter().enumerate() {
+        let len = *v as u32;
+        if len <= 12 {
+            if len < 12 && (v >> (32 + len * 8)) != 0 {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "View at index {idx} contained non-zero padding for string of length {len}",
+                )));
+            }
+            f(idx, &v.to_le_bytes()[4..4 + len as usize])?;
+        } else {
+            let view = ByteView::from(*v);
+            let data = buffers.get(view.buffer_index as usize).ok_or_else(|| {
+                ArrowError::InvalidArgumentError(format!(
+                    "Invalid buffer index at {idx}: got index {} but only has {} buffers",
+                    view.buffer_index,
+                    buffers.len()
+                ))
+            })?;
+
+            let start = view.offset as usize;
+            let end = start + len as usize;
+            let b = data.get(start..end).ok_or_else(|| {
+                ArrowError::InvalidArgumentError(format!(
+                    "Invalid buffer slice at {idx}: got {start}..{end} but buffer {} has length {}",
+                    view.buffer_index,
+                    data.len()
+                ))
+            })?;
+
+            if !b.starts_with(&view.prefix.to_le_bytes()) {
+                return Err(ArrowError::InvalidArgumentError(
+                    "Mismatch between embedded prefix and data".to_string(),
+                ));
+            }
+
+            f(idx, b)?;
+        }
+    }
+    Ok(())
+}
diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index 16637570f520..e227b168eee5 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -26,7 +26,7 @@ use std::mem;
 use
std::ops::Range; use std::sync::Arc; -use crate::equal; +use crate::{equal, validate_binary_view, validate_string_view}; /// A collection of [`Buffer`] #[doc(hidden)] @@ -159,29 +159,6 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff } } -/// Maps 2 [`MutableBuffer`]s into a vector of [Buffer]s whose size depends on `data_type`. -#[inline] -pub(crate) fn into_buffers( - data_type: &DataType, - buffer1: MutableBuffer, - buffer2: MutableBuffer, -) -> Vec { - match data_type { - DataType::Null | DataType::Struct(_) | DataType::FixedSizeList(_, _) => vec![], - DataType::Utf8 | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => { - vec![buffer1.into(), buffer2.into()] - } - DataType::Union(_, mode) => { - match mode { - // Based on Union's DataTypeLayout - UnionMode::Sparse => vec![buffer1.into()], - UnionMode::Dense => vec![buffer1.into(), buffer2.into()], - } - } - _ => vec![buffer1.into()], - } -} - /// A generic representation of Arrow array data which encapsulates common attributes and /// operations for Arrow array. Specific operations for different arrays types (e.g., /// primitive, list, struct) are implemented in `Array`. @@ -745,7 +722,10 @@ impl ArrayData { ))); } - if self.buffers.len() != layout.buffers.len() { + // Check data buffers length for view types and other types + if self.buffers.len() < layout.buffers.len() + || (!layout.variadic && self.buffers.len() != layout.buffers.len()) + { return Err(ArrowError::InvalidArgumentError(format!( "Expected {} buffers in array of type {:?}, got {}", layout.buffers.len(), @@ -1240,6 +1220,14 @@ impl ArrayData { DataType::LargeUtf8 => self.validate_utf8::(), DataType::Binary => self.validate_offsets_full::(self.buffers[1].len()), DataType::LargeBinary => self.validate_offsets_full::(self.buffers[1].len()), + DataType::BinaryView => { + let views = self.typed_buffer::(0, self.len)?; + validate_binary_view(views, &self.buffers[1..]) + } + DataType::Utf8View => { + let views = self.typed_buffer::(0, self.len)?; + validate_string_view(views, &self.buffers[1..]) + } DataType::List(_) | DataType::Map(_, _) => { let child = &self.child_data[0]; self.validate_offsets_full::(child.len) @@ -1511,10 +1499,12 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataType::Null => DataTypeLayout { buffers: vec![], can_contain_null_mask: false, + variadic: false, }, DataType::Boolean => DataTypeLayout { buffers: vec![BufferSpec::BitMap], can_contain_null_mask: true, + variadic: false, }, DataType::Int8 => DataTypeLayout::new_fixed_width::(), DataType::Int16 => DataTypeLayout::new_fixed_width::(), @@ -1546,15 +1536,14 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataTypeLayout { buffers: vec![spec], can_contain_null_mask: true, + variadic: false, } } DataType::Binary => DataTypeLayout::new_binary::(), DataType::LargeBinary => DataTypeLayout::new_binary::(), DataType::Utf8 => DataTypeLayout::new_binary::(), DataType::LargeUtf8 => DataTypeLayout::new_binary::(), - DataType::BinaryView | DataType::Utf8View => { - unimplemented!("BinaryView/Utf8View not implemented") - } + DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(), DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data DataType::List(_) => DataTypeLayout::new_fixed_width::(), DataType::ListView(_) | DataType::LargeListView(_) => { @@ -1586,6 +1575,7 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { } }, can_contain_null_mask: false, + variadic: false, } } 
DataType::Dictionary(key_type, _value_type) => layout(key_type), @@ -1601,6 +1591,11 @@ pub struct DataTypeLayout { /// Can contain a null bitmask pub can_contain_null_mask: bool, + + /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`] + /// If `variadic` is true, the number of buffers expected is only lower-bounded by + /// buffers.len(). Buffers that exceed the lower bound are legal. + pub variadic: bool, } impl DataTypeLayout { @@ -1612,6 +1607,7 @@ impl DataTypeLayout { alignment: mem::align_of::(), }], can_contain_null_mask: true, + variadic: false, } } @@ -1622,6 +1618,7 @@ impl DataTypeLayout { Self { buffers: vec![], can_contain_null_mask: true, + variadic: false, } } @@ -1640,6 +1637,19 @@ impl DataTypeLayout { BufferSpec::VariableWidth, ], can_contain_null_mask: true, + variadic: false, + } + } + + /// Describes a view type + pub fn new_view() -> Self { + Self { + buffers: vec![BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + }], + can_contain_null_mask: true, + variadic: true, } } } @@ -1845,7 +1855,7 @@ impl From for ArrayDataBuilder { #[cfg(test)] mod tests { use super::*; - use arrow_schema::{Field, UnionFields}; + use arrow_schema::Field; // See arrow/tests/array_data_validation.rs for test of array validation @@ -2093,23 +2103,6 @@ mod tests { assert!(!contains_nulls(Some(&buffer), 0, 0)); } - #[test] - fn test_into_buffers() { - let data_types = vec![ - DataType::Union(UnionFields::empty(), UnionMode::Dense), - DataType::Union(UnionFields::empty(), UnionMode::Sparse), - ]; - - for data_type in data_types { - let buffers = new_buffers(&data_type, 0); - let [buffer1, buffer2] = buffers; - let buffers = into_buffers(&data_type, buffer1, buffer2); - - let layout = layout(&data_type); - assert_eq!(buffers.len(), layout.buffers.len()); - } - } - #[test] fn test_alignment() { let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]); diff --git a/arrow-data/src/equal/byte_view.rs b/arrow-data/src/equal/byte_view.rs new file mode 100644 index 000000000000..def395125366 --- /dev/null +++ b/arrow-data/src/equal/byte_view.rs @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+use crate::{ArrayData, ByteView};
+
+pub(super) fn byte_view_equal(
+    lhs: &ArrayData,
+    rhs: &ArrayData,
+    lhs_start: usize,
+    rhs_start: usize,
+    len: usize,
+) -> bool {
+    let lhs_views = &lhs.buffer::<u128>(0)[lhs_start..lhs_start + len];
+    let lhs_buffers = &lhs.buffers()[1..];
+    let rhs_views = &rhs.buffer::<u128>(0)[rhs_start..rhs_start + len];
+    let rhs_buffers = &rhs.buffers()[1..];
+
+    for (idx, (l, r)) in lhs_views.iter().zip(rhs_views).enumerate() {
+        // Only checking one null mask here because by the time the control flow reaches
+        // this point, the equality of the two masks would have already been verified.
+        if lhs.is_null(idx) {
+            continue;
+        }
+
+        let l_len_prefix = *l as u64;
+        let r_len_prefix = *r as u64;
+        // short-circuit, check length and prefix
+        if l_len_prefix != r_len_prefix {
+            return false;
+        }
+
+        let len = l_len_prefix as u32;
+        // for inline storage, only need check view
+        if len <= 12 {
+            if l != r {
+                return false;
+            }
+            continue;
+        }
+
+        // check buffers
+        let l_view = ByteView::from(*l);
+        let r_view = ByteView::from(*r);
+
+        let l_buffer = &lhs_buffers[l_view.buffer_index as usize];
+        let r_buffer = &rhs_buffers[r_view.buffer_index as usize];
+
+        // prefixes are already known to be equal; skip checking them
+        let len = len as usize - 4;
+        let l_offset = l_view.offset as usize + 4;
+        let r_offset = r_view.offset as usize + 4;
+        if l_buffer[l_offset..l_offset + len] != r_buffer[r_offset..r_offset + len] {
+            return false;
+        }
+    }
+    true
+}
+
+#[cfg(test)]
+mod tests {}
diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs
index 0987fd4c5637..dba6a0186a56 100644
--- a/arrow-data/src/equal/mod.rs
+++ b/arrow-data/src/equal/mod.rs
@@ -25,6 +25,7 @@ use arrow_schema::{DataType, IntervalUnit};
 use half::f16;
 
 mod boolean;
+mod byte_view;
 mod dictionary;
 mod fixed_binary;
 mod fixed_list;
@@ -41,6 +42,7 @@ mod variable_size;
 // For this reason, they are not exposed and are instead used
 // to build the generic functions below (`equal_range` and `equal`).
 use boolean::boolean_equal;
+use byte_view::byte_view_equal;
 use dictionary::dictionary_equal;
 use fixed_binary::fixed_binary_equal;
 use fixed_list::fixed_list_equal;
@@ -97,7 +99,7 @@ fn equal_values(
         }
         DataType::FixedSizeBinary(_) => fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len),
         DataType::BinaryView | DataType::Utf8View => {
-            unimplemented!("BinaryView/Utf8View not yet implemented")
+            byte_view_equal(lhs, rhs, lhs_start, rhs_start, len)
         }
         DataType::List(_) => list_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len),
         DataType::ListView(_) | DataType::LargeListView(_) => {
diff --git a/arrow-data/src/lib.rs b/arrow-data/src/lib.rs
index cfa0dba66c35..59a049fe96cf 100644
--- a/arrow-data/src/lib.rs
+++ b/arrow-data/src/lib.rs
@@ -30,3 +30,6 @@ pub mod decimal;
 
 #[cfg(feature = "ffi")]
 pub mod ffi;
+
+mod byte_view;
+pub use byte_view::*;
diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs
index b14f6e771033..b0d9475afcd6 100644
--- a/arrow-data/src/transform/mod.rs
+++ b/arrow-data/src/transform/mod.rs
@@ -15,13 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
-use super::{
-    data::{into_buffers, new_buffers},
-    ArrayData, ArrayDataBuilder,
-};
+use super::{data::new_buffers, ArrayData, ArrayDataBuilder, ByteView};
 use crate::bit_mask::set_bits;
 use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
-use arrow_buffer::{bit_util, i256, ArrowNativeType, MutableBuffer};
+use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer};
 use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode};
 use half::f16;
 use num::Integer;
@@ -68,36 +65,6 @@ impl<'a> _MutableArrayData<'a> {
             .as_mut()
             .expect("MutableArrayData not nullable")
     }
-
-    fn freeze(self, dictionary: Option<ArrayData>) -> ArrayDataBuilder {
-        let buffers = into_buffers(&self.data_type, self.buffer1, self.buffer2);
-
-        let child_data = match self.data_type {
-            DataType::Dictionary(_, _) => vec![dictionary.unwrap()],
-            _ => {
-                let mut child_data = Vec::with_capacity(self.child_data.len());
-                for child in self.child_data {
-                    child_data.push(child.freeze());
-                }
-                child_data
-            }
-        };
-
-        let nulls = self
-            .null_buffer
-            .map(|nulls| {
-                let bools = BooleanBuffer::new(nulls.into(), 0, self.len);
-                unsafe { NullBuffer::new_unchecked(bools, self.null_count) }
-            })
-            .filter(|n| n.null_count() > 0);
-
-        ArrayDataBuilder::new(self.data_type)
-            .offset(0)
-            .len(self.len)
-            .nulls(nulls)
-            .buffers(buffers)
-            .child_data(child_data)
-    }
 }
 
 fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits {
@@ -138,26 +105,32 @@ pub struct MutableArrayData<'a> {
     #[allow(dead_code)]
     arrays: Vec<&'a ArrayData>,
-    // The attributes in [_MutableArrayData] cannot be in [MutableArrayData] due to
-    // mutability invariants (interior mutability):
-    // [MutableArrayData] contains a function that can only mutate [_MutableArrayData], not
-    // [MutableArrayData] itself
+    /// The attributes in [_MutableArrayData] cannot be in [MutableArrayData] due to
+    /// mutability invariants (interior mutability):
+    /// [MutableArrayData] contains a function that can only mutate [_MutableArrayData], not
+    /// [MutableArrayData] itself
    data: _MutableArrayData<'a>,
-    // the child data of the `Array` in Dictionary arrays.
-    // This is not stored in `MutableArrayData` because these values constant and only needed
-    // at the end, when freezing [_MutableArrayData].
+    /// the child data of the `Array` in Dictionary arrays.
+    /// This is not stored in `MutableArrayData` because these values are constant and only needed
+    /// at the end, when freezing [_MutableArrayData].
     dictionary: Option<ArrayData>,
-    // function used to extend values from arrays. This function's lifetime is bound to the array
-    // because it reads values from it.
+    /// Variadic data buffers referenced by views
+    /// This is not stored in `MutableArrayData` because these values are constant and only needed
+    /// at the end, when freezing [_MutableArrayData]
+    variadic_data_buffers: Vec<Buffer>,
+
+    /// function used to extend values from arrays. This function's lifetime is bound to the array
+    /// because it reads values from it.
     extend_values: Vec<Extend<'a>>,
+
-    // function used to extend nulls from arrays. This function's lifetime is bound to the array
-    // because it reads nulls from it.
+    /// function used to extend nulls from arrays. This function's lifetime is bound to the array
+    /// because it reads nulls from it.
     extend_null_bits: Vec<ExtendNullBits<'a>>,
-    // function used to extend nulls.
-    // this is independent of the arrays and therefore has no lifetime.
+    /// function used to extend nulls.
+ /// this is independent of the arrays and therefore has no lifetime. extend_nulls: ExtendNulls, } @@ -197,6 +170,26 @@ fn build_extend_dictionary(array: &ArrayData, offset: usize, max: usize) -> Opti } } +/// Builds an extend that adds `buffer_offset` to any buffer indices encountered +fn build_extend_view(array: &ArrayData, buffer_offset: u32) -> Extend { + let views = array.buffer::(0); + Box::new( + move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { + mutable + .buffer1 + .extend(views[start..start + len].iter().map(|v| { + let len = *v as u32; + if len <= 12 { + return *v; // Stored inline + } + let mut view = ByteView::from(*v); + view.buffer_index += buffer_offset; + view.into() + })) + }, + ) +} + fn build_extend(array: &ArrayData) -> Extend { match array.data_type() { DataType::Null => null::build_extend(array), @@ -224,9 +217,7 @@ fn build_extend(array: &ArrayData) -> Extend { DataType::Decimal256(_, _) => primitive::build_extend::(array), DataType::Utf8 | DataType::Binary => variable_size::build_extend::(array), DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::(array), - DataType::BinaryView | DataType::Utf8View => { - unimplemented!("BinaryView/Utf8View not implemented") - } + DataType::BinaryView | DataType::Utf8View => unreachable!("should use build_extend_view"), DataType::Map(_, _) | DataType::List(_) => list::build_extend::(array), DataType::ListView(_) | DataType::LargeListView(_) => { unimplemented!("ListView/LargeListView not implemented") @@ -272,9 +263,7 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { DataType::Decimal256(_, _) => primitive::extend_nulls::, DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::, DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::, - DataType::BinaryView | DataType::Utf8View => { - unimplemented!("BinaryView/Utf8View not implemented") - } + DataType::BinaryView | DataType::Utf8View => primitive::extend_nulls::, DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::, DataType::ListView(_) | DataType::LargeListView(_) => { unimplemented!("ListView/LargeListView not implemented") @@ -429,11 +418,10 @@ impl<'a> MutableArrayData<'a> { | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary + | DataType::BinaryView + | DataType::Utf8View | DataType::Interval(_) | DataType::FixedSizeBinary(_) => vec![], - DataType::BinaryView | DataType::Utf8View => { - unimplemented!("BinaryView/Utf8View not implemented") - } DataType::ListView(_) | DataType::LargeListView(_) => { unimplemented!("ListView/LargeListView not implemented") } @@ -566,6 +554,15 @@ impl<'a> MutableArrayData<'a> { _ => (None, false), }; + let variadic_data_buffers = match &data_type { + DataType::BinaryView | DataType::Utf8View => arrays + .iter() + .flat_map(|x| x.buffers().iter().skip(1)) + .map(Buffer::clone) + .collect(), + _ => vec![], + }; + let extend_nulls = build_extend_nulls(data_type); let extend_null_bits = arrays @@ -598,6 +595,20 @@ impl<'a> MutableArrayData<'a> { extend_values.expect("MutableArrayData::new is infallible") } + DataType::BinaryView | DataType::Utf8View => { + let mut next_offset = 0u32; + arrays + .iter() + .map(|arr| { + let num_data_buffers = (arr.buffers().len() - 1) as u32; + let offset = next_offset; + next_offset = next_offset + .checked_add(num_data_buffers) + .expect("view buffer index overflow"); + build_extend_view(arr, offset) + }) + .collect() + } _ => arrays.iter().map(|array| build_extend(array)).collect(), }; @@ -614,6 
+625,7 @@ impl<'a> MutableArrayData<'a> { arrays, data, dictionary, + variadic_data_buffers, extend_values, extend_null_bits, extend_nulls, @@ -673,13 +685,55 @@ impl<'a> MutableArrayData<'a> { /// Creates a [ArrayData] from the pushed regions up to this point, consuming `self`. pub fn freeze(self) -> ArrayData { - unsafe { self.data.freeze(self.dictionary).build_unchecked() } + unsafe { self.into_builder().build_unchecked() } } /// Creates a [ArrayDataBuilder] from the pushed regions up to this point, consuming `self`. /// This is useful for extending the default behavior of MutableArrayData. pub fn into_builder(self) -> ArrayDataBuilder { - self.data.freeze(self.dictionary) + let data = self.data; + + let buffers = match data.data_type { + DataType::Null | DataType::Struct(_) | DataType::FixedSizeList(_, _) => { + vec![] + } + DataType::BinaryView | DataType::Utf8View => { + let mut b = self.variadic_data_buffers; + b.insert(0, data.buffer1.into()); + b + } + DataType::Utf8 | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => { + vec![data.buffer1.into(), data.buffer2.into()] + } + DataType::Union(_, mode) => { + match mode { + // Based on Union's DataTypeLayout + UnionMode::Sparse => vec![data.buffer1.into()], + UnionMode::Dense => vec![data.buffer1.into(), data.buffer2.into()], + } + } + _ => vec![data.buffer1.into()], + }; + + let child_data = match data.data_type { + DataType::Dictionary(_, _) => vec![self.dictionary.unwrap()], + _ => data.child_data.into_iter().map(|x| x.freeze()).collect(), + }; + + let nulls = data + .null_buffer + .map(|nulls| { + let bools = BooleanBuffer::new(nulls.into(), 0, data.len); + unsafe { NullBuffer::new_unchecked(bools, data.null_count) } + }) + .filter(|n| n.null_count() > 0); + + ArrayDataBuilder::new(data.data_type) + .offset(0) + .len(data.len) + .nulls(nulls) + .buffers(buffers) + .child_data(child_data) } } diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index 9bd276428880..15011c547284 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -22,8 +22,8 @@ use arrow::array::{ StringArray, StringDictionaryBuilder, StructArray, UnionBuilder, }; use arrow::datatypes::{Int16Type, Int32Type}; -use arrow_array::builder::{StringBuilder, StructBuilder}; -use arrow_array::{DictionaryArray, FixedSizeListArray}; +use arrow_array::builder::{StringBuilder, StringViewBuilder, StructBuilder}; +use arrow_array::{DictionaryArray, FixedSizeListArray, StringViewArray}; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{DataType, Field, Fields}; @@ -307,6 +307,50 @@ fn test_fixed_size_binary_array() { test_equal(&a, &b, true); } +#[test] +fn test_string_view_equal() { + let a1 = StringViewArray::from(vec!["foo", "very long string over 12 bytes", "bar"]); + let a2 = StringViewArray::from(vec![ + "a very long string over 12 bytes", + "foo", + "very long string over 12 bytes", + "bar", + ]); + test_equal(&a1, &a2.slice(1, 3), true); + + let a1 = StringViewArray::from(vec!["foo", "very long string over 12 bytes", "bar"]); + let a2 = StringViewArray::from(vec!["foo", "very long string over 12 bytes", "bar"]); + test_equal(&a1, &a2, true); + + let a1_s = a1.slice(1, 1); + let a2_s = a2.slice(1, 1); + test_equal(&a1_s, &a2_s, true); + + let a1_s = a1.slice(2, 1); + let a2_s = a2.slice(0, 1); + test_equal(&a1_s, &a2_s, false); + + // test will null value. 
+ let a1 = StringViewArray::from(vec!["foo", "very long string over 12 bytes", "bar"]); + let a2 = { + let mut builder = StringViewBuilder::new(); + builder.append_value("foo"); + builder.append_null(); + builder.append_option(Some("very long string over 12 bytes")); + builder.append_value("bar"); + builder.finish() + }; + test_equal(&a1, &a2, false); + + let a1_s = a1.slice(1, 2); + let a2_s = a2.slice(1, 3); + test_equal(&a1_s, &a2_s, false); + + let a1_s = a1.slice(1, 2); + let a2_s = a2.slice(2, 2); + test_equal(&a1_s, &a2_s, true); +} + #[test] fn test_string_offset() { let a = StringArray::from(vec![Some("a"), None, Some("b")]); diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 5a267c876d6a..83d3003a0586 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -22,6 +22,7 @@ use arrow::array::{ UnionArray, }; use arrow::datatypes::Int16Type; +use arrow_array::StringViewArray; use arrow_buffer::Buffer; use arrow_data::transform::MutableArrayData; use arrow_data::ArrayData; @@ -1027,6 +1028,44 @@ fn test_extend_nulls_panic() { mutable.extend_nulls(2); } +#[test] +fn test_string_view() { + let a1 = + StringViewArray::from(vec!["foo", "very long string over 12 bytes", "bar"]).into_data(); + let a2 = StringViewArray::from_iter(vec![ + Some("bar"), + None, + Some("long string also over 12 bytes"), + ]) + .into_data(); + + a1.validate_full().unwrap(); + a2.validate_full().unwrap(); + + let mut mutable = MutableArrayData::new(vec![&a1, &a2], false, 4); + mutable.extend(1, 0, 1); + mutable.extend(0, 1, 2); + mutable.extend(0, 0, 1); + mutable.extend(1, 2, 3); + + let array = StringViewArray::from(mutable.freeze()); + assert_eq!(array.data_buffers().len(), 2); + // Should have reused data buffers + assert_eq!(array.data_buffers()[0].as_ptr(), a1.buffers()[1].as_ptr()); + assert_eq!(array.data_buffers()[1].as_ptr(), a2.buffers()[1].as_ptr()); + + let v = array.iter().collect::>(); + assert_eq!( + v, + vec![ + Some("bar"), + Some("very long string over 12 bytes"), + Some("foo"), + Some("long string also over 12 bytes") + ] + ) +} + #[test] #[should_panic(expected = "Arrays with inconsistent types passed to MutableArrayData")] fn test_mixed_types() { From c3899cea0cda046c2c635d6a2f75baee8ee1ea99 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 15 Mar 2024 06:17:09 +1300 Subject: [PATCH 02/11] Fix integer parsing of empty strings (#5504) (#5505) --- arrow-cast/src/parse.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 7f23526142cc..afa00f176293 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -439,6 +439,9 @@ macro_rules! 
parser_primitive { ($t:ty) => { impl Parser for $t { fn parse(string: &str) -> Option { + if !string.as_bytes().last().is_some_and(|x| x.is_ascii_digit()) { + return None; + } match atoi::FromRadix10SignedChecked::from_radix_10_signed_checked( string.as_bytes(), ) { @@ -2303,4 +2306,22 @@ mod tests { assert_eq!(i, result.unwrap()); } } + + #[test] + fn test_parse_empty() { + assert_eq!(Int32Type::parse(""), None); + assert_eq!(Int64Type::parse(""), None); + assert_eq!(UInt32Type::parse(""), None); + assert_eq!(UInt64Type::parse(""), None); + assert_eq!(Float32Type::parse(""), None); + assert_eq!(Float64Type::parse(""), None); + assert_eq!(Int32Type::parse("+"), None); + assert_eq!(Int64Type::parse("+"), None); + assert_eq!(UInt32Type::parse("+"), None); + assert_eq!(UInt64Type::parse("+"), None); + assert_eq!(Float32Type::parse("+"), None); + assert_eq!(Float64Type::parse("+"), None); + assert_eq!(TimestampNanosecondType::parse(""), None); + assert_eq!(Date32Type::parse(""), None); + } } From 4d0316da100b20363fc68360b7fdb8bfd88f1ee7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 15 Mar 2024 15:44:09 +1300 Subject: [PATCH 03/11] Deprecate array_to_json_array (#5515) --- arrow-json/src/writer.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index d8045c330481..9f63b811d74e 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -161,6 +161,7 @@ fn struct_array_to_jsonmap_array( } /// Converts an arrow [`Array`] into a `Vec` of Serde JSON [`serde_json::Value`]'s +#[deprecated(note = "Use Writer")] pub fn array_to_json_array(array: &dyn Array) -> Result, ArrowError> { // For backwards compatibility, default to skip nulls array_to_json_array_internal(array, false) @@ -1837,6 +1838,7 @@ mod tests { } #[test] + #[allow(deprecated)] fn test_array_to_json_array_for_fixed_size_list_array() { let expected_json = vec![ json!([0, 1, 2]), @@ -1859,6 +1861,7 @@ mod tests { } #[test] + #[allow(deprecated)] fn test_array_to_json_array_for_map_array() { let expected_json = serde_json::from_value::>(json!([ [ From 5dd5418070bd6284e1ca8a5aed17f7323965b525 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 14 Mar 2024 22:45:31 -0400 Subject: [PATCH 04/11] Minor: Add doc comments to `GenericByteViewArray` (#5512) * Minor: Add doc comments to `GenericByteViewArray` * Improve docs --- arrow-array/src/array/byte_view_array.rs | 70 +++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index e22e9b1688bb..a3b8a5dcb803 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -34,7 +34,66 @@ use std::sync::Arc; /// Different than [`crate::GenericByteArray`] as it stores both an offset and length /// meaning that take / filter operations can be implemented without copying the underlying data. /// +/// See [`StringViewArray`] for storing utf8 encoded string data and +/// [`BinaryViewArray`] for storing bytes. +/// /// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout +/// +/// A `GenericByteViewArray` stores variable length byte strings. An array of +/// `N` elements is stored as `N` fixed length "views" and a variable number +/// of variable length "buffers". 
+///
+/// Each view is a `u128` value whose layout is different depending on the
+/// length of the string stored at that location:
+///
+/// ```text
+///                        ┌──────┬────────────────────────┐
+///                        │length│      string value      │
+///   Strings (len <= 12)  │      │    (padded with 0)     │
+///                        └──────┴────────────────────────┘
+///                         0    31                      127
+///
+///                        ┌───────┬───────┬───────┬───────┐
+///                        │length │prefix │  buf  │offset │
+///   Strings (len > 12)   │       │       │ index │       │
+///                        └───────┴───────┴───────┴───────┘
+///                         0    31      63      95      127
+/// ```
+///
+/// * Strings with length <= 12 are stored directly in the view.
+///
+/// * Strings with length > 12: The first four bytes are stored inline in the
+///   view and the entire string is stored in one of the buffers.
+///
+/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
+/// than they must point into a valid buffer. However, they can be out of order,
+/// non-contiguous and overlapping.
+///
+/// For example, in the following diagram, the strings "FishWasInTownTodayYay" and
+/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
+/// separate buffer while the string "LavaMonster" is stored inlined in the
+/// view. In this case, the same bytes for "Fish" are used to store both strings.
+///
+/// ```text
+///                                                                ┌───┐
+///                          ┌──────┬──────┬──────┬──────┐ offset  │...│
+///  "FishWasInTownTodayYay" │  21  │ Fish │  0   │ 115  │─ ─ 103  │Mr.│
+///                          └──────┴──────┴──────┴──────┘    │ ┌ ─▶│Cru│
+///                          ┌──────┬──────┬──────┬──────┐    │ │   │mpl│
+///       "CrumpleFacedFish" │  16  │ Crum │  0   │ 103  │─ ─│─ ┘  │eFa│
+///                          └──────┴──────┴──────┴──────┘    │     │ced│
+///                          ┌──────┬────────────────────┐    └ ─ ─▶│Fis│
+///            "LavaMonster" │  11  │   LavaMonster\0    │          │hWa│
+///                          └──────┴────────────────────┘ offset   │sIn│
+///                                                          115    │Tow│
+///                                                                 │nTo│
+///                                                                 │day│
+///                               u128 "views"                      │Yay│
+///                                                      buffer 0   │...│
+///                                                                 └───┘
+/// ```
+/// [`GenericByteArray`]: crate::array::GenericByteArray
+
 pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
     data_type: DataType,
     views: ScalarBuffer<u128>,
     buffers: Vec<Buffer>,
     phantom: PhantomData<T>,
     nulls: Option<NullBuffer>,
 }
@@ -332,10 +391,19 @@ where
 }
 
 /// A [`GenericByteViewArray`] of `[u8]`
+///
+/// # Example
+/// ```
+/// use arrow_array::BinaryViewArray;
+/// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
+/// assert_eq!(array.value(0), b"hello");
+/// assert_eq!(array.value(3), b"large payload over 12 bytes");
+/// ```
 pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
 
-/// A [`GenericByteViewArray`] of `str`
+/// A [`GenericByteViewArray`] that stores utf8 data
 ///
+/// # Example
 /// ```
 /// use arrow_array::StringViewArray;
 /// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
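To make the view layout described above concrete, here is a small standalone
sketch (an editorial illustration, not part of the patch series; it assumes
only the little-endian `u128` layout documented in the doc comment above):

```rust
/// Prints the parts of a single byte-view `u128`, following the layout
/// documented on `GenericByteViewArray`.
fn decode_view(view: u128) {
    let bytes: [u8; 16] = view.to_le_bytes();
    // the low 32 bits always hold the length
    let len = u32::from_le_bytes(bytes[0..4].try_into().unwrap()) as usize;
    if len <= 12 {
        // short strings: the value is stored inline after the length
        println!("inline: {:?}", &bytes[4..4 + len]);
    } else {
        // long strings: a 4-byte prefix, then a buffer index, then an offset
        let prefix = &bytes[4..8];
        let buffer_index = u32::from_le_bytes(bytes[8..12].try_into().unwrap());
        let offset = u32::from_le_bytes(bytes[12..16].try_into().unwrap());
        println!("prefix {prefix:?} -> buffer {buffer_index} @ offset {offset}");
    }
}

fn main() {
    // a 5-byte string "hello" stored inline: length 5, then the bytes
    let mut bytes = [0u8; 16];
    bytes[0..4].copy_from_slice(&5u32.to_le_bytes());
    bytes[4..9].copy_from_slice(b"hello");
    decode_view(u128::from_le_bytes(bytes));
}
```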
From 773cf18e2aeb8f299585496d4cf8a1a1f8262edb Mon Sep 17 00:00:00 2001
From: Istvan Fodor <586159+istvan-fodor@users.noreply.github.com>
Date: Thu, 14 Mar 2024 21:45:49 -0500
Subject: [PATCH 05/11] feat: clarifying comments in struct_builder.rs #5494
 (#5499)

* feat: clarifying comments in struct_builder.rs

Added clarifying comments to StructBuilder about creating collection columns

* fixed commented line, improved comments

* Removed redundant line in comment

* fixed slightly misleading comment

* moved example code to comment

* better comment

* fixed comment type
---
 arrow-array/src/builder/struct_builder.rs | 77 +++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs
index eeb37cd8e66d..1e2e402f745f 100644
--- a/arrow-array/src/builder/struct_builder.rs
+++ b/arrow-array/src/builder/struct_builder.rs
@@ -25,6 +25,81 @@ use std::sync::Arc;
 ///
 /// Note that callers should make sure that methods of all the child field builders are
 /// properly called to maintain the consistency of the data structure.
+///
+/// Handling arrays with complex layouts, such as `List<Struct<List<Struct>>>`, in Rust can be challenging due to its strong typing system.
+/// To construct a collection builder ([`ListBuilder`], [`LargeListBuilder`], or [`MapBuilder`]) using [`make_builder`], multiple calls are required. This complexity arises from the recursive approach utilized by [`StructBuilder::from_fields`].
+///
+/// Initially, [`StructBuilder::from_fields`] invokes [`make_builder`], which returns a `Box<dyn ArrayBuilder>`. To obtain the specific collection builder, one must first use [`StructBuilder::field_builder`] to get a `Collection<[Box<dyn ArrayBuilder>]>`. Subsequently, the `values()` result from this operation can be downcast to the desired builder type.
+///
+/// For example, when working with [`ListBuilder`], you would first call [`StructBuilder::field_builder::<ListBuilder<Box<dyn ArrayBuilder>>>`] and then downcast the [`Box<dyn ArrayBuilder>`] to the specific [`StructBuilder`] you need.
+///
+/// For a practical example see the code below:
+///
+/// ```rust
+/// use arrow_array::builder::{ArrayBuilder, ListBuilder, StringBuilder, StructBuilder};
+/// use arrow_schema::{DataType, Field, Fields};
+/// use std::sync::Arc;
+///
+/// // This is an example column that has a List<Struct<List<Struct>>> layout
+/// let mut example_col = ListBuilder::new(StructBuilder::from_fields(
+///     vec![Field::new(
+///         "value_list",
+///         DataType::List(Arc::new(Field::new(
+///             "item",
+///             DataType::Struct(Fields::from(vec![
+///                 Field::new("key", DataType::Utf8, true),
+///                 Field::new("value", DataType::Utf8, true),
+///             ])), // In this example we are trying to get to this builder and insert key/value pairs
+///             true,
+///         ))),
+///         true,
+///     )],
+///     0,
+/// ));
+///
+/// // We can obtain the StructBuilder without issues, because example_col was created with StructBuilder
+/// let col_struct_builder: &mut StructBuilder = example_col.values();
+///
+/// // We can't obtain the ListBuilder with the expected generic types, because under the hood
+/// // the StructBuilder was returned as a Box<dyn ArrayBuilder> and passed as such to the ListBuilder constructor
+///
+/// // This panics at runtime, even though we know that the builder is a ListBuilder.
+/// // let sb = col_struct_builder
+/// //     .field_builder::<ListBuilder<StringBuilder>>(0)
+/// //     .as_mut()
+/// //     .unwrap();
+///
+/// // To keep in line with Rust's strong typing, we fetch a ListBuilder<Box<dyn ArrayBuilder>> from the column StructBuilder first...
+/// let mut list_builder_option =
+///     col_struct_builder.field_builder::<ListBuilder<Box<dyn ArrayBuilder>>>(0);
+///
+/// let list_builder = list_builder_option.as_mut().unwrap();
+///
+/// // ... and then downcast the key/value pair values to a StructBuilder
+/// let struct_builder = list_builder
+///     .values()
+///     .as_any_mut()
+///     .downcast_mut::<StructBuilder>()
+///     .unwrap();
+///
+/// // We can now append values to the StructBuilder
+/// let key_builder = struct_builder.field_builder::<StringBuilder>(0).unwrap();
+/// key_builder.append_value("my key");
+///
+/// let value_builder = struct_builder.field_builder::<StringBuilder>(1).unwrap();
+/// value_builder.append_value("my value");
+///
+/// struct_builder.append(true);
+/// list_builder.append(true);
+/// col_struct_builder.append(true);
+/// example_col.append(true);
+///
+/// let array = example_col.finish();
+///
+/// println!("My array: {:?}", array);
+/// ```
+///
 pub struct StructBuilder {
     fields: Fields,
     field_builders: Vec<Box<dyn ArrayBuilder>>,
@@ -88,6 +163,8 @@ impl ArrayBuilder for StructBuilder {
 /// Returns a builder with capacity `capacity` that corresponds to the datatype `DataType`
 /// This function is useful to construct arrays from arbitrary vectors with known/expected
 /// schema.
+///
+/// See comments on [`StructBuilder`] on how to retrieve collection builders built by [`make_builder`].
 pub fn make_builder(datatype: &DataType, capacity: usize) -> Box<dyn ArrayBuilder> {
     use crate::builder::*;
     match datatype {

From 14bd53dc1240003f171c8655863eae188cd0880f Mon Sep 17 00:00:00 2001
From: Dan Harris <1327726+thinkharderdev@users.noreply.github.com>
Date: Thu, 14 Mar 2024 22:47:48 -0400
Subject: [PATCH 06/11] Support dictionary encoding in structures for
 `FlightDataEncoder`, add documentation for `arrow_flight::encode::Dictionary`
 (#5488)

* Add more detailed documentation for arrow_flight::encode::DictionaryHandling

* fix doc link

* Fix handling of nested dictionary arrays with DictionaryHandling::Hydrate

* clippy

* Handle large list and sparse unions

* use top-level fields

* PR comments
---
 arrow-flight/src/encode.rs | 478 +++++++++++++++++++++++++++++++++----
 1 file changed, 435 insertions(+), 43 deletions(-)

diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs
index bb0436816209..efd688129485 100644
--- a/arrow-flight/src/encode.rs
+++ b/arrow-flight/src/encode.rs
@@ -18,9 +18,11 @@ use std::{collections::VecDeque, fmt::Debug, pin::Pin, sync::Arc, task::Poll};
 
 use crate::{error::Result, FlightData, FlightDescriptor, SchemaAsIpc};
-use arrow_array::{ArrayRef, RecordBatch, RecordBatchOptions};
+
+use arrow_array::{Array, ArrayRef, RecordBatch, RecordBatchOptions, UnionArray};
 use arrow_ipc::writer::{DictionaryTracker, IpcDataGenerator, IpcWriteOptions};
-use arrow_schema::{DataType, Field, Fields, Schema, SchemaRef};
+
+use arrow_schema::{DataType, Field, FieldRef, Fields, Schema, SchemaRef, UnionMode};
 use bytes::Bytes;
 use futures::{ready, stream::BoxStream, Stream, StreamExt};
@@ -323,9 +325,10 @@ impl FlightDataEncoder {
             None => self.encode_schema(batch.schema_ref()),
         };
 
-        // encode the batch
-        let send_dictionaries = self.dictionary_handling == DictionaryHandling::Resend;
-        let batch = prepare_batch_for_flight(&batch, schema, send_dictionaries)?;
+        let batch = match self.dictionary_handling {
+            DictionaryHandling::Resend => batch,
+            DictionaryHandling::Hydrate => hydrate_dictionaries(&batch, schema)?,
+        };
 
         for batch in split_batch_for_grpc_response(batch, self.max_flight_data_size) {
             let (flight_dictionaries, flight_batch) = self.encoder.encode_batch(&batch)?;
@@ -388,6 +391,31 @@ impl Stream for FlightDataEncoder {
 /// Defines how a [`FlightDataEncoder`] encodes [`DictionaryArray`]s
 ///
 /// [`DictionaryArray`]: arrow_array::DictionaryArray
+///
+/// In the arrow flight protocol dictionary values and keys are sent as two separate messages.
+/// When a sender is encoding a [`RecordBatch`] containing [`DictionaryArray`] columns, it will
+/// first send a dictionary batch (a batch with header `MessageHeader::DictionaryBatch`) containing
+/// the dictionary values. The receiver is responsible for reading this batch and maintaining state that associates
+/// those dictionary values with the corresponding array using the `dict_id` as a key.
+///
+/// After sending the dictionary batch the sender will send the array data in a batch with header `MessageHeader::RecordBatch`.
+/// For any dictionary array batches in this message, the encoded flight message will only contain the dictionary keys. The receiver
+/// is then responsible for rebuilding the `DictionaryArray` on the client side using the dictionary values from the DictionaryBatch message
+/// and the keys from the RecordBatch message.
+///
+/// For example, if we have a batch with a `TypedDictionaryArray<'_, UInt32Type, Utf8Type>` (a dictionary array where the keys are `u32` and the
+/// values are `String`), then the DictionaryBatch will contain a `StringArray` and the RecordBatch will contain a `UInt32Array`.
+///
+/// Note that since `dict_id` defined in the `Schema` is used as a key to associate dictionary values to their arrays it is required that each
+/// `DictionaryArray` in a `RecordBatch` have a unique `dict_id`.
+///
+/// The current implementation does not support "delta" dictionaries so a new dictionary batch will be sent each time the encoder sees a
+/// dictionary which is not pointer-equal to the previously observed dictionary for a given `dict_id`.
+///
+/// For clients which may not support `DictionaryEncoding`, the `DictionaryHandling::Hydrate` method will bypass the process defined above
+/// and "hydrate" any `DictionaryArray` in the batch to their underlying value type (e.g. `TypedDictionaryArray<'_, UInt32Type, Utf8Type>` will
+/// be sent as a `StringArray`). With this method all data will be sent in `MessageHeader::RecordBatch` messages and the batch schema
+/// will be adjusted so that all dictionary encoded fields are changed to fields of the dictionary value type.
 #[derive(Debug, PartialEq)]
 pub enum DictionaryHandling {
     /// Expands to the underlying type (default). This likely sends more data
@@ -395,13 +423,6 @@ pub enum DictionaryHandling {
     /// and is more compatible with other arrow flight client implementations
     /// that may not support `DictionaryEncoding`
     ///
-    /// An IPC response, streaming or otherwise, defines its schema up front
-    /// which defines the mapping from dictionary IDs. It then sends these
-    /// dictionaries over the wire.
-    ///
-    /// This requires identifying the different dictionaries in use, assigning
-    /// them IDs, and sending new dictionaries, delta or otherwise, when needed
-    ///
     /// See also:
     /// * [`Self::Resend`]
     Hydrate,
     /// Send dictionary FlightData with every RecordBatch that contains a
     /// [`DictionaryArray`]. See [`Self::Hydrate`] for more tradeoffs. No
     /// attempt is made to skip sending the same (logical) dictionary values
     /// twice.
     ///
     /// [`DictionaryArray`]: arrow_array::DictionaryArray
+    ///
+    /// This requires identifying the different dictionaries in use and assigning
+    /// them unique IDs
     Resend,
 }
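A minimal sketch of opting into this behavior when constructing an encoder (an
illustration, not part of the patch; it assumes the pre-existing
`FlightDataEncoderBuilder::with_dictionary_handling` method and a boxed stream
of input batches):

```rust
use arrow_array::RecordBatch;
use arrow_flight::encode::{DictionaryHandling, FlightDataEncoder, FlightDataEncoderBuilder};
use arrow_flight::error::FlightError;
use futures::stream::BoxStream;

// Keep dictionaries dictionary-encoded on the wire; the receiving client must
// support DictionaryEncoding to rebuild the DictionaryArrays from keys + values
fn dictionary_preserving_encoder(
    batches: BoxStream<'static, Result<RecordBatch, FlightError>>,
) -> FlightDataEncoder {
    FlightDataEncoderBuilder::new()
        .with_dictionary_handling(DictionaryHandling::Resend)
        .build(batches)
}
```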
+
+fn prepare_field_for_flight(field: &FieldRef, send_dictionaries: bool) -> Field {
+    match field.data_type() {
+        DataType::List(inner) => Field::new_list(
+            field.name(),
+            prepare_field_for_flight(inner, send_dictionaries),
+            field.is_nullable(),
+        )
+        .with_metadata(field.metadata().clone()),
+        DataType::LargeList(inner) => Field::new_list(
+            field.name(),
+            prepare_field_for_flight(inner, send_dictionaries),
+            field.is_nullable(),
+        )
+        .with_metadata(field.metadata().clone()),
+        DataType::Struct(fields) => {
+            let new_fields: Vec<Field> = fields
+                .iter()
+                .map(|f| prepare_field_for_flight(f, send_dictionaries))
+                .collect();
+            Field::new_struct(field.name(), new_fields, field.is_nullable())
+                .with_metadata(field.metadata().clone())
+        }
+        DataType::Union(fields, mode) => {
+            let (type_ids, new_fields): (Vec<i8>, Vec<Field>) = fields
+                .iter()
+                .map(|(type_id, f)| (type_id, prepare_field_for_flight(f, send_dictionaries)))
+                .unzip();
+
+            Field::new_union(field.name(), type_ids, new_fields, *mode)
+        }
+        DataType::Dictionary(_, value_type) if !send_dictionaries => Field::new(
+            field.name(),
+            value_type.as_ref().clone(),
+            field.is_nullable(),
+        )
+        .with_metadata(field.metadata().clone()),
+        _ => field.as_ref().clone(),
+    }
+}
+
 /// Prepare an arrow Schema for transport over the Arrow Flight protocol
 ///
 /// Convert dictionary types to underlying types
@@ -430,6 +494,7 @@ fn prepare_schema_for_flight(schema: &Schema, send_dictionaries: bool) -> Schema
                 field.is_nullable(),
             )
             .with_metadata(field.metadata().clone()),
+            tpe if tpe.is_nested() => prepare_field_for_flight(field, send_dictionaries),
             _ => field.as_ref().clone(),
         })
         .collect();
@@ -509,22 +574,14 @@ impl FlightIpcEncoder {
     }
 }
 
-/// Prepares a RecordBatch for transport over the Arrow Flight protocol
-///
-/// This means:
-///
-/// 1. Hydrates any dictionaries to its underlying type. See
+/// Hydrates any dictionary arrays in `batch` to their underlying type. See
 /// hydrate_dictionary for more information.
-///
-fn prepare_batch_for_flight(
-    batch: &RecordBatch,
-    schema: SchemaRef,
-    send_dictionaries: bool,
-) -> Result<RecordBatch> {
-    let columns = batch
-        .columns()
+fn hydrate_dictionaries(batch: &RecordBatch, schema: SchemaRef) -> Result<RecordBatch> {
+    let columns = schema
+        .fields()
         .iter()
-        .map(|c| hydrate_dictionary(c, send_dictionaries))
+        .zip(batch.columns())
+        .map(|(field, c)| hydrate_dictionary(c, field.data_type()))
         .collect::<Result<Vec<_>>>()?;
 
     let options = RecordBatchOptions::new().with_row_count(Some(batch.num_rows()));
@@ -534,22 +591,43 @@
     )?)
 }
 
-/// Hydrates a dictionary to its underlying type if send_dictionaries is false. If send_dictionaries
-/// is true, dictionaries are sent with every batch which is not as optimal as described in [DictionaryHandling::Hydrate] above,
-/// but does enable sending DictionaryArray's via Flight.
-fn hydrate_dictionary(array: &ArrayRef, send_dictionaries: bool) -> Result<ArrayRef> {
-    let arr = match array.data_type() {
-        DataType::Dictionary(_, value) if !send_dictionaries => arrow_cast::cast(array, value)?,
-        _ => Arc::clone(array),
+/// Hydrates a dictionary to its underlying type.
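+///
+/// For example (an illustrative sketch, not in the original patch):
+///
+/// ```ignore
+/// // A DictionaryArray<UInt16Type> of ["a", "a", "b"] hydrates to the
+/// // equivalent StringArray via a cast to the dictionary's value type
+/// let dict: DictionaryArray<UInt16Type> = vec!["a", "a", "b"].into_iter().collect();
+/// let hydrated = arrow_cast::cast(&dict, &DataType::Utf8).unwrap();
+/// ```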
+fn hydrate_dictionary(array: &ArrayRef, data_type: &DataType) -> Result<ArrayRef> {
+    let arr = match (array.data_type(), data_type) {
+        (DataType::Union(_, UnionMode::Sparse), DataType::Union(fields, UnionMode::Sparse)) => {
+            let union_arr = array.as_any().downcast_ref::<UnionArray>().unwrap();
+
+            let (type_ids, fields): (Vec<i8>, Vec<&FieldRef>) = fields.iter().unzip();
+
+            Arc::new(UnionArray::try_new(
+                &type_ids,
+                union_arr.type_ids().inner().clone(),
+                None,
+                fields
+                    .iter()
+                    .enumerate()
+                    .map(|(col, field)| {
+                        Ok((
+                            field.as_ref().clone(),
+                            arrow_cast::cast(union_arr.child(col as i8), field.data_type())?,
+                        ))
+                    })
+                    .collect::<Result<Vec<_>>>()?,
+            )?)
+        }
+        (_, data_type) => arrow_cast::cast(array, data_type)?,
     };
     Ok(arr)
 }
 
 #[cfg(test)]
 mod tests {
+    use arrow_array::builder::StringDictionaryBuilder;
     use arrow_array::*;
     use arrow_array::{cast::downcast_array, types::*};
+    use arrow_buffer::Buffer;
     use arrow_cast::pretty::pretty_format_batches;
+    use arrow_schema::UnionMode;
     use std::collections::HashMap;
 
     use crate::decode::{DecodedPayload, FlightDataDecoder};

     let (_, baseline_flight_batch) = make_flight_data(&batch, &options);
 
     let big_batch = batch.slice(0, batch.num_rows() - 1);
-    let optimized_big_batch = prepare_batch_for_flight(&big_batch, Arc::clone(schema), false)
-        .expect("failed to optimize");
+    let optimized_big_batch =
+        hydrate_dictionaries(&big_batch, Arc::clone(schema)).expect("failed to optimize");
     let (_, optimized_big_flight_batch) = make_flight_data(&optimized_big_batch, &options);
 
     assert_eq!(
 
     let small_batch = batch.slice(0, 1);
     let optimized_small_batch =
-        prepare_batch_for_flight(&small_batch, Arc::clone(schema), false)
-            .expect("failed to optimize");
+        hydrate_dictionaries(&small_batch, Arc::clone(schema)).expect("failed to optimize");
     let (_, optimized_small_flight_batch) = make_flight_data(&optimized_small_batch, &options);
 
     assert!(
 
 #[tokio::test]
 async fn test_dictionary_hydration() {
-    let arr: DictionaryArray<UInt16Type> = vec!["a", "a", "b"].into_iter().collect();
+    let arr1: DictionaryArray<UInt16Type> = vec!["a", "a", "b"].into_iter().collect();
+    let arr2: DictionaryArray<UInt16Type> = vec!["c", "c", "d"].into_iter().collect();
+
     let schema = Arc::new(Schema::new(vec![Field::new_dictionary(
         "dict",
         DataType::UInt16,
         DataType::Utf8,
         false,
     )]));
-    let batch = RecordBatch::try_new(schema, vec![Arc::new(arr)]).unwrap();
-    let encoder =
-        FlightDataEncoderBuilder::default().build(futures::stream::once(async { Ok(batch) }));
+    let batch1 = RecordBatch::try_new(schema.clone(), vec![Arc::new(arr1)]).unwrap();
+    let batch2 = RecordBatch::try_new(schema, vec![Arc::new(arr2)]).unwrap();
+
+    let stream = futures::stream::iter(vec![Ok(batch1), Ok(batch2)]);
+
+    let encoder = FlightDataEncoderBuilder::default().build(stream);
 
     let mut decoder = FlightDataDecoder::new(encoder);
     let expected_schema = Schema::new(vec![Field::new("dict", DataType::Utf8, false)]);
     let expected_schema = Arc::new(expected_schema);
+
+    let mut expected_arrays = vec![
+        StringArray::from(vec!["a", "a", "b"]),
+        StringArray::from(vec!["c", "c", "d"]),
+    ]
+    .into_iter();
+
     while let Some(decoded) = decoder.next().await {
         let decoded = decoded.unwrap();
         match decoded.payload {
             DecodedPayload::None => {}
             DecodedPayload::Schema(s) => assert_eq!(s, expected_schema),
             DecodedPayload::RecordBatch(b) => {
                 assert_eq!(b.schema(), expected_schema);
-                let expected_array = StringArray::from(vec!["a", "a", "b"]);
+                let expected_array = expected_arrays.next().unwrap();
let actual_array = b.column_by_name("dict").unwrap(); let actual_array = downcast_array::(actual_array); @@ -622,6 +709,311 @@ mod tests { } } + #[tokio::test] + async fn test_dictionary_list_hydration() { + let mut builder = builder::ListBuilder::new(StringDictionaryBuilder::::new()); + + builder.append_value(vec![Some("a"), None, Some("b")]); + + let arr1 = builder.finish(); + + builder.append_value(vec![Some("c"), None, Some("d")]); + + let arr2 = builder.finish(); + + let schema = Arc::new(Schema::new(vec![Field::new_list( + "dict_list", + Field::new_dictionary("item", DataType::UInt16, DataType::Utf8, true), + true, + )])); + + let batch1 = RecordBatch::try_new(schema.clone(), vec![Arc::new(arr1)]).unwrap(); + let batch2 = RecordBatch::try_new(schema.clone(), vec![Arc::new(arr2)]).unwrap(); + + let stream = futures::stream::iter(vec![Ok(batch1), Ok(batch2)]); + + let encoder = FlightDataEncoderBuilder::default().build(stream); + + let mut decoder = FlightDataDecoder::new(encoder); + let expected_schema = Schema::new(vec![Field::new_list( + "dict_list", + Field::new("item", DataType::Utf8, true), + true, + )]); + + let expected_schema = Arc::new(expected_schema); + + let mut expected_arrays = vec![ + StringArray::from_iter(vec![Some("a"), None, Some("b")]), + StringArray::from_iter(vec![Some("c"), None, Some("d")]), + ] + .into_iter(); + + while let Some(decoded) = decoder.next().await { + let decoded = decoded.unwrap(); + match decoded.payload { + DecodedPayload::None => {} + DecodedPayload::Schema(s) => assert_eq!(s, expected_schema), + DecodedPayload::RecordBatch(b) => { + assert_eq!(b.schema(), expected_schema); + let expected_array = expected_arrays.next().unwrap(); + let list_array = + downcast_array::(b.column_by_name("dict_list").unwrap()); + let elem_array = downcast_array::(list_array.value(0).as_ref()); + + assert_eq!(elem_array, expected_array); + } + } + } + } + + #[tokio::test] + async fn test_dictionary_struct_hydration() { + let struct_fields = vec![Field::new_list( + "dict_list", + Field::new_dictionary("item", DataType::UInt16, DataType::Utf8, true), + true, + )]; + + let mut builder = builder::ListBuilder::new(StringDictionaryBuilder::::new()); + + builder.append_value(vec![Some("a"), None, Some("b")]); + + let arr1 = Arc::new(builder.finish()); + let arr1 = StructArray::new(struct_fields.clone().into(), vec![arr1], None); + + builder.append_value(vec![Some("c"), None, Some("d")]); + + let arr2 = Arc::new(builder.finish()); + let arr2 = StructArray::new(struct_fields.clone().into(), vec![arr2], None); + + let schema = Arc::new(Schema::new(vec![Field::new_struct( + "struct", + struct_fields.clone(), + true, + )])); + + let batch1 = RecordBatch::try_new(schema.clone(), vec![Arc::new(arr1)]).unwrap(); + let batch2 = RecordBatch::try_new(schema.clone(), vec![Arc::new(arr2)]).unwrap(); + + let stream = futures::stream::iter(vec![Ok(batch1), Ok(batch2)]); + + let encoder = FlightDataEncoderBuilder::default().build(stream); + + let mut decoder = FlightDataDecoder::new(encoder); + let expected_schema = Schema::new(vec![Field::new_struct( + "struct", + vec![Field::new_list( + "dict_list", + Field::new("item", DataType::Utf8, true), + true, + )], + true, + )]); + + let expected_schema = Arc::new(expected_schema); + + let mut expected_arrays = vec![ + StringArray::from_iter(vec![Some("a"), None, Some("b")]), + StringArray::from_iter(vec![Some("c"), None, Some("d")]), + ] + .into_iter(); + + while let Some(decoded) = decoder.next().await { + let decoded = decoded.unwrap(); + 
match decoded.payload { + DecodedPayload::None => {} + DecodedPayload::Schema(s) => assert_eq!(s, expected_schema), + DecodedPayload::RecordBatch(b) => { + assert_eq!(b.schema(), expected_schema); + let expected_array = expected_arrays.next().unwrap(); + let struct_array = + downcast_array::(b.column_by_name("struct").unwrap()); + let list_array = downcast_array::(struct_array.column(0)); + + let elem_array = downcast_array::(list_array.value(0).as_ref()); + + assert_eq!(elem_array, expected_array); + } + } + } + } + + #[tokio::test] + async fn test_dictionary_union_hydration() { + let struct_fields = vec![Field::new_list( + "dict_list", + Field::new_dictionary("item", DataType::UInt16, DataType::Utf8, true), + true, + )]; + + let type_ids = vec![0, 1, 2]; + let union_fields = vec![ + Field::new_list( + "dict_list", + Field::new_dictionary("item", DataType::UInt16, DataType::Utf8, true), + true, + ), + Field::new_struct("struct", struct_fields.clone(), true), + Field::new("string", DataType::Utf8, true), + ]; + + let struct_fields = vec![Field::new_list( + "dict_list", + Field::new_dictionary("item", DataType::UInt16, DataType::Utf8, true), + true, + )]; + + let mut builder = builder::ListBuilder::new(StringDictionaryBuilder::::new()); + + builder.append_value(vec![Some("a"), None, Some("b")]); + + let arr1 = builder.finish(); + + let type_id_buffer = Buffer::from_slice_ref([0_i8]); + let arr1 = UnionArray::try_new( + &type_ids, + type_id_buffer, + None, + vec![ + (union_fields[0].clone(), Arc::new(arr1)), + ( + union_fields[1].clone(), + new_null_array(union_fields[1].data_type(), 1), + ), + ( + union_fields[2].clone(), + new_null_array(union_fields[2].data_type(), 1), + ), + ], + ) + .unwrap(); + + builder.append_value(vec![Some("c"), None, Some("d")]); + + let arr2 = Arc::new(builder.finish()); + let arr2 = StructArray::new(struct_fields.clone().into(), vec![arr2], None); + + let type_id_buffer = Buffer::from_slice_ref([1_i8]); + let arr2 = UnionArray::try_new( + &type_ids, + type_id_buffer, + None, + vec![ + ( + union_fields[0].clone(), + new_null_array(union_fields[0].data_type(), 1), + ), + (union_fields[1].clone(), Arc::new(arr2)), + ( + union_fields[2].clone(), + new_null_array(union_fields[2].data_type(), 1), + ), + ], + ) + .unwrap(); + + let type_id_buffer = Buffer::from_slice_ref([2_i8]); + let arr3 = UnionArray::try_new( + &type_ids, + type_id_buffer, + None, + vec![ + ( + union_fields[0].clone(), + new_null_array(union_fields[0].data_type(), 1), + ), + ( + union_fields[1].clone(), + new_null_array(union_fields[1].data_type(), 1), + ), + ( + union_fields[2].clone(), + Arc::new(StringArray::from(vec!["e"])), + ), + ], + ) + .unwrap(); + + let schema = Arc::new(Schema::new(vec![Field::new_union( + "union", + type_ids.clone(), + union_fields.clone(), + UnionMode::Sparse, + )])); + + let batch1 = RecordBatch::try_new(schema.clone(), vec![Arc::new(arr1)]).unwrap(); + let batch2 = RecordBatch::try_new(schema.clone(), vec![Arc::new(arr2)]).unwrap(); + let batch3 = RecordBatch::try_new(schema.clone(), vec![Arc::new(arr3)]).unwrap(); + + let stream = futures::stream::iter(vec![Ok(batch1), Ok(batch2), Ok(batch3)]); + + let encoder = FlightDataEncoderBuilder::default().build(stream); + + let mut decoder = FlightDataDecoder::new(encoder); + + let hydrated_struct_fields = vec![Field::new_list( + "dict_list", + Field::new("item", DataType::Utf8, true), + true, + )]; + + let hydrated_union_fields = vec![ + Field::new_list("dict_list", Field::new("item", DataType::Utf8, true), true), + 
Field::new_struct("struct", hydrated_struct_fields.clone(), true), + Field::new("string", DataType::Utf8, true), + ]; + + let expected_schema = Schema::new(vec![Field::new_union( + "union", + type_ids.clone(), + hydrated_union_fields, + UnionMode::Sparse, + )]); + + let expected_schema = Arc::new(expected_schema); + + let mut expected_arrays = vec![ + StringArray::from_iter(vec![Some("a"), None, Some("b")]), + StringArray::from_iter(vec![Some("c"), None, Some("d")]), + StringArray::from(vec!["e"]), + ] + .into_iter(); + + let mut batch = 0; + while let Some(decoded) = decoder.next().await { + let decoded = decoded.unwrap(); + match decoded.payload { + DecodedPayload::None => {} + DecodedPayload::Schema(s) => assert_eq!(s, expected_schema), + DecodedPayload::RecordBatch(b) => { + assert_eq!(b.schema(), expected_schema); + let expected_array = expected_arrays.next().unwrap(); + let union_arr = + downcast_array::(b.column_by_name("union").unwrap()); + + let elem_array = match batch { + 0 => { + let list_array = downcast_array::(union_arr.child(0)); + downcast_array::(list_array.value(0).as_ref()) + } + 1 => { + let struct_array = downcast_array::(union_arr.child(1)); + let list_array = downcast_array::(struct_array.column(0)); + + downcast_array::(list_array.value(0).as_ref()) + } + _ => downcast_array::(union_arr.child(2)), + }; + + batch += 1; + + assert_eq!(elem_array, expected_array); + } + } + } + } + #[tokio::test] async fn test_send_dictionaries() { let schema = Arc::new(Schema::new(vec![Field::new_dictionary( @@ -683,7 +1075,7 @@ mod tests { ) .expect("cannot create record batch"); - prepare_batch_for_flight(&batch, batch.schema(), false).expect("failed to optimize"); + hydrate_dictionaries(&batch, batch.schema()).expect("failed to optimize"); } pub fn make_flight_data( From 78aff9c401135e9e38b862a0fb9fba5947512da7 Mon Sep 17 00:00:00 2001 From: Yijun Zhao Date: Fri, 15 Mar 2024 13:34:50 +0800 Subject: [PATCH 07/11] update arrow-format (#5502) --- arrow-array/src/array/byte_view_array.rs | 1 - .../src/builder/generic_bytes_view_builder.rs | 1 + arrow-ipc/src/convert.rs | 2 +- arrow-ipc/src/gen/Message.rs | 51 ++ arrow-ipc/src/gen/Schema.rs | 475 +++++++++++++++++- format/Message.fbs | 18 +- format/Schema.fbs | 46 +- parquet/src/arrow/schema/mod.rs | 191 +++---- 8 files changed, 641 insertions(+), 144 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index a3b8a5dcb803..9f3a6809d9d0 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -93,7 +93,6 @@ use std::sync::Arc; /// └───┘ /// ``` /// [`GenericByteArray`]: crate::array::GenericByteArray - pub struct GenericByteViewArray { data_type: DataType, views: ScalarBuffer, diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 29de7feb0ec1..9accb932ae20 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -20,6 +20,7 @@ use crate::types::{BinaryViewType, ByteViewType, StringViewType}; use crate::{ArrayRef, GenericByteViewArray}; use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer}; use arrow_data::ByteView; + use std::any::Any; use std::marker::PhantomData; use std::sync::Arc; diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index a821008d89ab..b2e580241adc 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -543,7 +543,7 @@ 
pub(crate) fn get_fb_field_type<'a>( .as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), }, - BinaryView | Utf8View => unimplemented!("BinaryView/Utf8View not implemented"), + BinaryView | Utf8View => unimplemented!("unimplemented"), Utf8 => FBFieldType { type_type: crate::Type::Utf8, type_: crate::Utf8Builder::new(fbb).finish().as_union_value(), diff --git a/arrow-ipc/src/gen/Message.rs b/arrow-ipc/src/gen/Message.rs index a546b54d9170..1f49f1d9428b 100644 --- a/arrow-ipc/src/gen/Message.rs +++ b/arrow-ipc/src/gen/Message.rs @@ -25,6 +25,8 @@ use flatbuffers::EndianScalar; use std::{cmp::Ordering, mem}; // automatically generated by the FlatBuffers compiler, do not modify +// @generated + #[deprecated( since = "2.0.0", note = "Use associated constants instead. This will no longer be generated in 2021." @@ -636,6 +638,7 @@ impl<'a> RecordBatch<'a> { pub const VT_NODES: flatbuffers::VOffsetT = 6; pub const VT_BUFFERS: flatbuffers::VOffsetT = 8; pub const VT_COMPRESSION: flatbuffers::VOffsetT = 10; + pub const VT_VARIADICBUFFERCOUNTS: flatbuffers::VOffsetT = 12; #[inline] pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { @@ -648,6 +651,9 @@ impl<'a> RecordBatch<'a> { ) -> flatbuffers::WIPOffset> { let mut builder = RecordBatchBuilder::new(_fbb); builder.add_length(args.length); + if let Some(x) = args.variadicBufferCounts { + builder.add_variadicBufferCounts(x); + } if let Some(x) = args.compression { builder.add_compression(x); } @@ -720,6 +726,33 @@ impl<'a> RecordBatch<'a> { ) } } + /// Some types such as Utf8View are represented using a variable number of buffers. + /// For each such Field in the pre-ordered flattened logical schema, there will be + /// an entry in variadicBufferCounts to indicate the number of number of variadic + /// buffers which belong to that Field in the current RecordBatch. + /// + /// For example, the schema + /// col1: Struct + /// col2: Utf8View + /// contains two Fields with variadic buffers so variadicBufferCounts will have + /// two entries, the first counting the variadic buffers of `col1.beta` and the + /// second counting `col2`'s. + /// + /// This field may be omitted if and only if the schema contains no Fields with + /// a variable number of buffers, such as BinaryView and Utf8View. + #[inline] + pub fn variadicBufferCounts(&self) -> Option> { + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + RecordBatch::VT_VARIADICBUFFERCOUNTS, + None, + ) + } + } } impl flatbuffers::Verifiable for RecordBatch<'_> { @@ -746,6 +779,11 @@ impl flatbuffers::Verifiable for RecordBatch<'_> { Self::VT_COMPRESSION, false, )? + .visit_field::>>( + "variadicBufferCounts", + Self::VT_VARIADICBUFFERCOUNTS, + false, + )? 
.finish(); Ok(()) } @@ -755,6 +793,7 @@ pub struct RecordBatchArgs<'a> { pub nodes: Option>>, pub buffers: Option>>, pub compression: Option>>, + pub variadicBufferCounts: Option>>, } impl<'a> Default for RecordBatchArgs<'a> { #[inline] @@ -764,6 +803,7 @@ impl<'a> Default for RecordBatchArgs<'a> { nodes: None, buffers: None, compression: None, + variadicBufferCounts: None, } } } @@ -800,6 +840,16 @@ impl<'a: 'b, 'b> RecordBatchBuilder<'a, 'b> { ); } #[inline] + pub fn add_variadicBufferCounts( + &mut self, + variadicBufferCounts: flatbuffers::WIPOffset>, + ) { + self.fbb_.push_slot_always::>( + RecordBatch::VT_VARIADICBUFFERCOUNTS, + variadicBufferCounts, + ); + } + #[inline] pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> RecordBatchBuilder<'a, 'b> { let start = _fbb.start_table(); RecordBatchBuilder { @@ -821,6 +871,7 @@ impl core::fmt::Debug for RecordBatch<'_> { ds.field("nodes", &self.nodes()); ds.field("buffers", &self.buffers()); ds.field("compression", &self.compression()); + ds.field("variadicBufferCounts", &self.variadicBufferCounts()); ds.finish() } } diff --git a/arrow-ipc/src/gen/Schema.rs b/arrow-ipc/src/gen/Schema.rs index 0dc5dccd39e7..ed9dbaa249f0 100644 --- a/arrow-ipc/src/gen/Schema.rs +++ b/arrow-ipc/src/gen/Schema.rs @@ -22,6 +22,8 @@ use flatbuffers::EndianScalar; use std::{cmp::Ordering, mem}; // automatically generated by the FlatBuffers compiler, do not modify +// @generated + #[deprecated( since = "2.0.0", note = "Use associated constants instead. This will no longer be generated in 2021." @@ -58,7 +60,7 @@ impl MetadataVersion { pub const V3: Self = Self(2); /// >= 0.8.0 (December 2017). Non-backwards compatible with V3. pub const V4: Self = Self(3); - /// >= 1.0.0 (July 2020. Backwards compatible with V4 (V5 readers can read V4 + /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4 /// metadata and IPC messages). Implementations are recommended to provide a /// V4 compatibility mode with V5 format changes disabled. /// @@ -734,13 +736,13 @@ pub const ENUM_MIN_TYPE: u8 = 0; since = "2.0.0", note = "Use associated constants instead. This will no longer be generated in 2021." )] -pub const ENUM_MAX_TYPE: u8 = 22; +pub const ENUM_MAX_TYPE: u8 = 26; #[deprecated( since = "2.0.0", note = "Use associated constants instead. This will no longer be generated in 2021." )] #[allow(non_camel_case_types)] -pub const ENUM_VALUES_TYPE: [Type; 23] = [ +pub const ENUM_VALUES_TYPE: [Type; 27] = [ Type::NONE, Type::Null, Type::Int, @@ -764,6 +766,10 @@ pub const ENUM_VALUES_TYPE: [Type; 23] = [ Type::LargeUtf8, Type::LargeList, Type::RunEndEncoded, + Type::BinaryView, + Type::Utf8View, + Type::ListView, + Type::LargeListView, ]; /// ---------------------------------------------------------------------- @@ -797,9 +803,13 @@ impl Type { pub const LargeUtf8: Self = Self(20); pub const LargeList: Self = Self(21); pub const RunEndEncoded: Self = Self(22); + pub const BinaryView: Self = Self(23); + pub const Utf8View: Self = Self(24); + pub const ListView: Self = Self(25); + pub const LargeListView: Self = Self(26); pub const ENUM_MIN: u8 = 0; - pub const ENUM_MAX: u8 = 22; + pub const ENUM_MAX: u8 = 26; pub const ENUM_VALUES: &'static [Self] = &[ Self::NONE, Self::Null, @@ -824,6 +834,10 @@ impl Type { Self::LargeUtf8, Self::LargeList, Self::RunEndEncoded, + Self::BinaryView, + Self::Utf8View, + Self::ListView, + Self::LargeListView, ]; /// Returns the variant's name or "" if unknown. 
pub fn variant_name(self) -> Option<&'static str> { @@ -851,6 +865,10 @@ impl Type { Self::LargeUtf8 => Some("LargeUtf8"), Self::LargeList => Some("LargeList"), Self::RunEndEncoded => Some("RunEndEncoded"), + Self::BinaryView => Some("BinaryView"), + Self::Utf8View => Some("Utf8View"), + Self::ListView => Some("ListView"), + Self::LargeListView => Some("LargeListView"), _ => None, } } @@ -1545,6 +1563,165 @@ impl core::fmt::Debug for LargeList<'_> { ds.finish() } } +pub enum ListViewOffset {} +#[derive(Copy, Clone, PartialEq)] + +/// Represents the same logical types that List can, but contains offsets and +/// sizes allowing for writes in any order and sharing of child values among +/// list values. +pub struct ListView<'a> { + pub _tab: flatbuffers::Table<'a>, +} + +impl<'a> flatbuffers::Follow<'a> for ListView<'a> { + type Inner = ListView<'a>; + #[inline] + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + Self { + _tab: flatbuffers::Table::new(buf, loc), + } + } +} + +impl<'a> ListView<'a> { + #[inline] + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + ListView { _tab: table } + } + #[allow(unused_mut)] + pub fn create<'bldr: 'args, 'args: 'mut_bldr, 'mut_bldr>( + _fbb: &'mut_bldr mut flatbuffers::FlatBufferBuilder<'bldr>, + _args: &'args ListViewArgs, + ) -> flatbuffers::WIPOffset> { + let mut builder = ListViewBuilder::new(_fbb); + builder.finish() + } +} + +impl flatbuffers::Verifiable for ListView<'_> { + #[inline] + fn run_verifier( + v: &mut flatbuffers::Verifier, + pos: usize, + ) -> Result<(), flatbuffers::InvalidFlatbuffer> { + use flatbuffers::Verifiable; + v.visit_table(pos)?.finish(); + Ok(()) + } +} +pub struct ListViewArgs {} +impl<'a> Default for ListViewArgs { + #[inline] + fn default() -> Self { + ListViewArgs {} + } +} + +pub struct ListViewBuilder<'a: 'b, 'b> { + fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, + start_: flatbuffers::WIPOffset, +} +impl<'a: 'b, 'b> ListViewBuilder<'a, 'b> { + #[inline] + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> ListViewBuilder<'a, 'b> { + let start = _fbb.start_table(); + ListViewBuilder { + fbb_: _fbb, + start_: start, + } + } + #[inline] + pub fn finish(self) -> flatbuffers::WIPOffset> { + let o = self.fbb_.end_table(self.start_); + flatbuffers::WIPOffset::new(o.value()) + } +} + +impl core::fmt::Debug for ListView<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let mut ds = f.debug_struct("ListView"); + ds.finish() + } +} +pub enum LargeListViewOffset {} +#[derive(Copy, Clone, PartialEq)] + +/// Same as ListView, but with 64-bit offsets and sizes, allowing to represent +/// extremely large data values. 
+pub struct LargeListView<'a> { + pub _tab: flatbuffers::Table<'a>, +} + +impl<'a> flatbuffers::Follow<'a> for LargeListView<'a> { + type Inner = LargeListView<'a>; + #[inline] + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + Self { + _tab: flatbuffers::Table::new(buf, loc), + } + } +} + +impl<'a> LargeListView<'a> { + #[inline] + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + LargeListView { _tab: table } + } + #[allow(unused_mut)] + pub fn create<'bldr: 'args, 'args: 'mut_bldr, 'mut_bldr>( + _fbb: &'mut_bldr mut flatbuffers::FlatBufferBuilder<'bldr>, + _args: &'args LargeListViewArgs, + ) -> flatbuffers::WIPOffset> { + let mut builder = LargeListViewBuilder::new(_fbb); + builder.finish() + } +} + +impl flatbuffers::Verifiable for LargeListView<'_> { + #[inline] + fn run_verifier( + v: &mut flatbuffers::Verifier, + pos: usize, + ) -> Result<(), flatbuffers::InvalidFlatbuffer> { + use flatbuffers::Verifiable; + v.visit_table(pos)?.finish(); + Ok(()) + } +} +pub struct LargeListViewArgs {} +impl<'a> Default for LargeListViewArgs { + #[inline] + fn default() -> Self { + LargeListViewArgs {} + } +} + +pub struct LargeListViewBuilder<'a: 'b, 'b> { + fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, + start_: flatbuffers::WIPOffset, +} +impl<'a: 'b, 'b> LargeListViewBuilder<'a, 'b> { + #[inline] + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> LargeListViewBuilder<'a, 'b> { + let start = _fbb.start_table(); + LargeListViewBuilder { + fbb_: _fbb, + start_: start, + } + } + #[inline] + pub fn finish(self) -> flatbuffers::WIPOffset> { + let o = self.fbb_.end_table(self.start_); + flatbuffers::WIPOffset::new(o.value()) + } +} + +impl core::fmt::Debug for LargeListView<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let mut ds = f.debug_struct("LargeListView"); + ds.finish() + } +} pub enum FixedSizeListOffset {} #[derive(Copy, Clone, PartialEq)] @@ -2453,6 +2630,174 @@ impl core::fmt::Debug for LargeBinary<'_> { ds.finish() } } +pub enum Utf8ViewOffset {} +#[derive(Copy, Clone, PartialEq)] + +/// Logically the same as Utf8, but the internal representation uses a view +/// struct that contains the string length and either the string's entire data +/// inline (for small strings) or an inlined prefix, an index of another buffer, +/// and an offset pointing to a slice in that buffer (for non-small strings). +/// +/// Since it uses a variable number of data buffers, each Field with this type +/// must have a corresponding entry in `variadicBufferCounts`. 
+pub struct Utf8View<'a> { + pub _tab: flatbuffers::Table<'a>, +} + +impl<'a> flatbuffers::Follow<'a> for Utf8View<'a> { + type Inner = Utf8View<'a>; + #[inline] + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + Self { + _tab: flatbuffers::Table::new(buf, loc), + } + } +} + +impl<'a> Utf8View<'a> { + #[inline] + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + Utf8View { _tab: table } + } + #[allow(unused_mut)] + pub fn create<'bldr: 'args, 'args: 'mut_bldr, 'mut_bldr>( + _fbb: &'mut_bldr mut flatbuffers::FlatBufferBuilder<'bldr>, + _args: &'args Utf8ViewArgs, + ) -> flatbuffers::WIPOffset> { + let mut builder = Utf8ViewBuilder::new(_fbb); + builder.finish() + } +} + +impl flatbuffers::Verifiable for Utf8View<'_> { + #[inline] + fn run_verifier( + v: &mut flatbuffers::Verifier, + pos: usize, + ) -> Result<(), flatbuffers::InvalidFlatbuffer> { + use flatbuffers::Verifiable; + v.visit_table(pos)?.finish(); + Ok(()) + } +} +pub struct Utf8ViewArgs {} +impl<'a> Default for Utf8ViewArgs { + #[inline] + fn default() -> Self { + Utf8ViewArgs {} + } +} + +pub struct Utf8ViewBuilder<'a: 'b, 'b> { + fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, + start_: flatbuffers::WIPOffset, +} +impl<'a: 'b, 'b> Utf8ViewBuilder<'a, 'b> { + #[inline] + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> Utf8ViewBuilder<'a, 'b> { + let start = _fbb.start_table(); + Utf8ViewBuilder { + fbb_: _fbb, + start_: start, + } + } + #[inline] + pub fn finish(self) -> flatbuffers::WIPOffset> { + let o = self.fbb_.end_table(self.start_); + flatbuffers::WIPOffset::new(o.value()) + } +} + +impl core::fmt::Debug for Utf8View<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let mut ds = f.debug_struct("Utf8View"); + ds.finish() + } +} +pub enum BinaryViewOffset {} +#[derive(Copy, Clone, PartialEq)] + +/// Logically the same as Binary, but the internal representation uses a view +/// struct that contains the string length and either the string's entire data +/// inline (for small strings) or an inlined prefix, an index of another buffer, +/// and an offset pointing to a slice in that buffer (for non-small strings). +/// +/// Since it uses a variable number of data buffers, each Field with this type +/// must have a corresponding entry in `variadicBufferCounts`. 
+pub struct BinaryView<'a> { + pub _tab: flatbuffers::Table<'a>, +} + +impl<'a> flatbuffers::Follow<'a> for BinaryView<'a> { + type Inner = BinaryView<'a>; + #[inline] + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + Self { + _tab: flatbuffers::Table::new(buf, loc), + } + } +} + +impl<'a> BinaryView<'a> { + #[inline] + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + BinaryView { _tab: table } + } + #[allow(unused_mut)] + pub fn create<'bldr: 'args, 'args: 'mut_bldr, 'mut_bldr>( + _fbb: &'mut_bldr mut flatbuffers::FlatBufferBuilder<'bldr>, + _args: &'args BinaryViewArgs, + ) -> flatbuffers::WIPOffset> { + let mut builder = BinaryViewBuilder::new(_fbb); + builder.finish() + } +} + +impl flatbuffers::Verifiable for BinaryView<'_> { + #[inline] + fn run_verifier( + v: &mut flatbuffers::Verifier, + pos: usize, + ) -> Result<(), flatbuffers::InvalidFlatbuffer> { + use flatbuffers::Verifiable; + v.visit_table(pos)?.finish(); + Ok(()) + } +} +pub struct BinaryViewArgs {} +impl<'a> Default for BinaryViewArgs { + #[inline] + fn default() -> Self { + BinaryViewArgs {} + } +} + +pub struct BinaryViewBuilder<'a: 'b, 'b> { + fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, + start_: flatbuffers::WIPOffset, +} +impl<'a: 'b, 'b> BinaryViewBuilder<'a, 'b> { + #[inline] + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> BinaryViewBuilder<'a, 'b> { + let start = _fbb.start_table(); + BinaryViewBuilder { + fbb_: _fbb, + start_: start, + } + } + #[inline] + pub fn finish(self) -> flatbuffers::WIPOffset> { + let o = self.fbb_.end_table(self.start_); + flatbuffers::WIPOffset::new(o.value()) + } +} + +impl core::fmt::Debug for BinaryView<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let mut ds = f.debug_struct("BinaryView"); + ds.finish() + } +} pub enum FixedSizeBinaryOffset {} #[derive(Copy, Clone, PartialEq)] @@ -3213,7 +3558,7 @@ pub enum TimestampOffset {} /// no indication of how to map this information to a physical point in time. /// Naive date-times must be handled with care because of this missing /// information, and also because daylight saving time (DST) may make -/// some values ambiguous or non-existent. A naive date-time may be +/// some values ambiguous or nonexistent. A naive date-time may be /// stored as a struct with Date and Time fields. However, it may also be /// encoded into a Timestamp column with an empty timezone. 
The timestamp /// values should be computed "as if" the timezone of the date-time values @@ -4365,6 +4710,66 @@ impl<'a> Field<'a> { None } } + + #[inline] + #[allow(non_snake_case)] + pub fn type_as_binary_view(&self) -> Option> { + if self.type_type() == Type::BinaryView { + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { BinaryView::init_from_table(t) } + }) + } else { + None + } + } + + #[inline] + #[allow(non_snake_case)] + pub fn type_as_utf_8_view(&self) -> Option> { + if self.type_type() == Type::Utf8View { + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Utf8View::init_from_table(t) } + }) + } else { + None + } + } + + #[inline] + #[allow(non_snake_case)] + pub fn type_as_list_view(&self) -> Option> { + if self.type_type() == Type::ListView { + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { ListView::init_from_table(t) } + }) + } else { + None + } + } + + #[inline] + #[allow(non_snake_case)] + pub fn type_as_large_list_view(&self) -> Option> { + if self.type_type() == Type::LargeListView { + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { LargeListView::init_from_table(t) } + }) + } else { + None + } + } } impl flatbuffers::Verifiable for Field<'_> { @@ -4484,6 +4889,26 @@ impl flatbuffers::Verifiable for Field<'_> { "Type::RunEndEncoded", pos, ), + Type::BinaryView => v + .verify_union_variant::>( + "Type::BinaryView", + pos, + ), + Type::Utf8View => v + .verify_union_variant::>( + "Type::Utf8View", + pos, + ), + Type::ListView => v + .verify_union_variant::>( + "Type::ListView", + pos, + ), + Type::LargeListView => v + .verify_union_variant::>( + "Type::LargeListView", + pos, + ), _ => Ok(()), }, )? @@ -4827,6 +5252,46 @@ impl core::fmt::Debug for Field<'_> { ) } } + Type::BinaryView => { + if let Some(x) = self.type_as_binary_view() { + ds.field("type_", &x) + } else { + ds.field( + "type_", + &"InvalidFlatbuffer: Union discriminant does not match value.", + ) + } + } + Type::Utf8View => { + if let Some(x) = self.type_as_utf_8_view() { + ds.field("type_", &x) + } else { + ds.field( + "type_", + &"InvalidFlatbuffer: Union discriminant does not match value.", + ) + } + } + Type::ListView => { + if let Some(x) = self.type_as_list_view() { + ds.field("type_", &x) + } else { + ds.field( + "type_", + &"InvalidFlatbuffer: Union discriminant does not match value.", + ) + } + } + Type::LargeListView => { + if let Some(x) = self.type_as_large_list_view() { + ds.field("type_", &x) + } else { + ds.field( + "type_", + &"InvalidFlatbuffer: Union discriminant does not match value.", + ) + } + } _ => { let x: Option<()> = None; ds.field("type_", &x) diff --git a/format/Message.fbs b/format/Message.fbs index 170ea8fbced8..c8c9b4b82cbf 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -99,6 +99,22 @@ table RecordBatch { /// Optional compression of the message body compression: BodyCompression; + + /// Some types such as Utf8View are represented using a variable number of buffers. + /// For each such Field in the pre-ordered flattened logical schema, there will be + /// an entry in variadicBufferCounts to indicate the number of number of variadic + /// buffers which belong to that Field in the current RecordBatch. 
+ /// + /// For example, the schema + /// col1: Struct + /// col2: Utf8View + /// contains two Fields with variadic buffers so variadicBufferCounts will have + /// two entries, the first counting the variadic buffers of `col1.beta` and the + /// second counting `col2`'s. + /// + /// This field may be omitted if and only if the schema contains no Fields with + /// a variable number of buffers, such as BinaryView and Utf8View. + variadicBufferCounts: [long]; } /// For sending dictionary encoding information. Any Field can be @@ -138,4 +154,4 @@ table Message { custom_metadata: [ KeyValue ]; } -root_type Message; +root_type Message; \ No newline at end of file diff --git a/format/Schema.fbs b/format/Schema.fbs index 6337f72ec9de..ab726903d19f 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -20,8 +20,10 @@ /// Format Version History. /// Version 1.0 - Forward and backwards compatibility guaranteed. /// Version 1.1 - Add Decimal256. -/// Version 1.2 - Add Interval MONTH_DAY_NANO +/// Version 1.2 - Add Interval MONTH_DAY_NANO. /// Version 1.3 - Add Run-End Encoded. +/// Version 1.4 - Add BinaryView, Utf8View, variadicBufferCounts, ListView, and +/// LargeListView. namespace org.apache.arrow.flatbuf; @@ -38,7 +40,7 @@ enum MetadataVersion:short { /// >= 0.8.0 (December 2017). Non-backwards compatible with V3. V4, - /// >= 1.0.0 (July 2020. Backwards compatible with V4 (V5 readers can read V4 + /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4 /// metadata and IPC messages). Implementations are recommended to provide a /// V4 compatibility mode with V5 format changes disabled. /// @@ -96,6 +98,17 @@ table List { table LargeList { } +/// Represents the same logical types that List can, but contains offsets and +/// sizes allowing for writes in any order and sharing of child values among +/// list values. +table ListView { +} + +/// Same as ListView, but with 64-bit offsets and sizes, allowing to represent +/// extremely large data values. +table LargeListView { +} + table FixedSizeList { /// Number of list items per value listSize: int; @@ -171,6 +184,27 @@ table LargeUtf8 { table LargeBinary { } +/// Logically the same as Utf8, but the internal representation uses a view +/// struct that contains the string length and either the string's entire data +/// inline (for small strings) or an inlined prefix, an index of another buffer, +/// and an offset pointing to a slice in that buffer (for non-small strings). +/// +/// Since it uses a variable number of data buffers, each Field with this type +/// must have a corresponding entry in `variadicBufferCounts`. +table Utf8View { +} + +/// Logically the same as Binary, but the internal representation uses a view +/// struct that contains the string length and either the string's entire data +/// inline (for small strings) or an inlined prefix, an index of another buffer, +/// and an offset pointing to a slice in that buffer (for non-small strings). +/// +/// Since it uses a variable number of data buffers, each Field with this type +/// must have a corresponding entry in `variadicBufferCounts`. +table BinaryView { +} + + table FixedSizeBinary { /// Number of bytes per value byteWidth: int; @@ -338,7 +372,7 @@ table Time { /// no indication of how to map this information to a physical point in time. /// Naive date-times must be handled with care because of this missing /// information, and also because daylight saving time (DST) may make -/// some values ambiguous or non-existent. 
A naive date-time may be +/// some values ambiguous or nonexistent. A naive date-time may be /// stored as a struct with Date and Time fields. However, it may also be /// encoded into a Timestamp column with an empty timezone. The timestamp /// values should be computed "as if" the timezone of the date-time values @@ -427,6 +461,10 @@ union Type { LargeUtf8, LargeList, RunEndEncoded, + BinaryView, + Utf8View, + ListView, + LargeListView, } /// ---------------------------------------------------------------------- @@ -529,4 +567,4 @@ table Schema { features : [ Feature ]; } -root_type Schema; +root_type Schema; \ No newline at end of file diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 4a78db05ed2d..300a21c4f133 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -32,8 +32,7 @@ use arrow_ipc::writer; use arrow_schema::{DataType, Field, Fields, Schema, TimeUnit}; use crate::basic::{ - ConvertedType, LogicalType, Repetition, TimeUnit as ParquetTimeUnit, - Type as PhysicalType, + ConvertedType, LogicalType, Repetition, TimeUnit as ParquetTimeUnit, Type as PhysicalType, }; use crate::errors::{ParquetError, Result}; use crate::file::{metadata::KeyValue, properties::WriterProperties}; @@ -55,11 +54,7 @@ pub fn parquet_to_arrow_schema( parquet_schema: &SchemaDescriptor, key_value_metadata: Option<&Vec>, ) -> Result { - parquet_to_arrow_schema_by_columns( - parquet_schema, - ProjectionMask::all(), - key_value_metadata, - ) + parquet_to_arrow_schema_by_columns(parquet_schema, ProjectionMask::all(), key_value_metadata) } /// Convert parquet schema to arrow schema including optional metadata, @@ -199,10 +194,7 @@ fn encode_arrow_schema(schema: &Schema) -> String { /// Mutates writer metadata by storing the encoded Arrow schema. /// If there is an existing Arrow schema metadata, it is replaced. -pub(crate) fn add_encoded_arrow_schema_to_metadata( - schema: &Schema, - props: &mut WriterProperties, -) { +pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut WriterProperties) { let encoded = encode_arrow_schema(schema); let schema_kv = KeyValue { @@ -270,16 +262,15 @@ fn parse_key_value_metadata( /// Convert parquet column schema to arrow field. 
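+///
+/// A hypothetical usage sketch (illustrative only; it reuses the
+/// `parse_message_type` / `SchemaDescriptor` pattern from the tests below):
+///
+/// ```ignore
+/// let message = "message schema { REQUIRED INT32 id; }";
+/// let parquet_schema = SchemaDescriptor::new(Arc::new(parse_message_type(message)?));
+/// let field = parquet_to_arrow_field(parquet_schema.column(0).as_ref())?;
+/// assert_eq!(field.name(), "id");
+/// ```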
pub fn parquet_to_arrow_field(parquet_column: &ColumnDescriptor) -> Result { let field = complex::convert_type(&parquet_column.self_type_ptr())?; - let mut ret = Field::new( - parquet_column.name(), - field.arrow_type, - field.nullable, - ); + let mut ret = Field::new(parquet_column.name(), field.arrow_type, field.nullable); let basic_info = parquet_column.self_type().get_basic_info(); if basic_info.has_id() { let mut meta = HashMap::with_capacity(1); - meta.insert(PARQUET_FIELD_ID_META_KEY.to_string(), basic_info.id().to_string()); + meta.insert( + PARQUET_FIELD_ID_META_KEY.to_string(), + basic_info.id().to_string(), + ); ret.set_metadata(meta); } @@ -401,15 +392,9 @@ fn arrow_to_parquet_type(field: &Field) -> Result { is_adjusted_to_u_t_c: matches!(tz, Some(z) if !z.as_ref().is_empty()), unit: match time_unit { TimeUnit::Second => unreachable!(), - TimeUnit::Millisecond => { - ParquetTimeUnit::MILLIS(Default::default()) - } - TimeUnit::Microsecond => { - ParquetTimeUnit::MICROS(Default::default()) - } - TimeUnit::Nanosecond => { - ParquetTimeUnit::NANOS(Default::default()) - } + TimeUnit::Millisecond => ParquetTimeUnit::MILLIS(Default::default()), + TimeUnit::Microsecond => ParquetTimeUnit::MICROS(Default::default()), + TimeUnit::Nanosecond => ParquetTimeUnit::NANOS(Default::default()), }, })) .with_repetition(repetition) @@ -457,9 +442,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(repetition) .with_id(id) .build(), - DataType::Duration(_) => { - Err(arrow_err!("Converting Duration to parquet not supported",)) - } + DataType::Duration(_) => Err(arrow_err!("Converting Duration to parquet not supported",)), DataType::Interval(_) => { Type::primitive_type_builder(name, PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_converted_type(ConvertedType::INTERVAL) @@ -481,9 +464,10 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_length(*length) .build() } - DataType::BinaryView | DataType::Utf8View => unimplemented!("BinaryView/Utf8View not implemented"), - DataType::Decimal128(precision, scale) - | DataType::Decimal256(precision, scale) => { + DataType::BinaryView | DataType::Utf8View => { + unimplemented!("BinaryView/Utf8View not implemented") + } + DataType::Decimal128(precision, scale) | DataType::Decimal256(precision, scale) => { // Decimal precision determines the Parquet physical type to use. 
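            // (Illustrative summary: small precisions are packed into INT32 or
            // INT64, and larger precisions into a FIXED_LEN_BYTE_ARRAY whose
            // length is derived from the precision, per the spec linked below.)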
// Following the: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal let (physical_type, length) = if *precision > 1 && *precision <= 9 { @@ -528,12 +512,12 @@ fn arrow_to_parquet_type(field: &Field) -> Result<Type> { .with_id(id) .build() } - DataType::ListView(_) | DataType::LargeListView(_) => unimplemented!("ListView/LargeListView not implemented"), + DataType::ListView(_) | DataType::LargeListView(_) => { + unimplemented!("ListView/LargeListView not implemented") + } DataType::Struct(fields) => { if fields.is_empty() { - return Err( - arrow_err!("Parquet does not support writing empty structs",), - ); + return Err(arrow_err!("Parquet does not support writing empty structs",)); } // recursively convert children to types/nodes let fields = fields @@ -623,8 +607,7 @@ mod tests { let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); - let converted_arrow_schema = - parquet_to_arrow_schema(&parquet_schema, None).unwrap(); + let converted_arrow_schema = parquet_to_arrow_schema(&parquet_schema, None).unwrap(); let arrow_fields = Fields::from(vec![ Field::new("boolean", DataType::Boolean, false), @@ -662,8 +645,7 @@ mod tests { let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); - let converted_arrow_schema = - parquet_to_arrow_schema(&parquet_schema, None).unwrap(); + let converted_arrow_schema = parquet_to_arrow_schema(&parquet_schema, None).unwrap(); let arrow_fields = Fields::from(vec![ Field::new("decimal1", DataType::Decimal128(4, 2), false), @@ -689,8 +671,7 @@ mod tests { let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); - let converted_arrow_schema = - parquet_to_arrow_schema(&parquet_schema, None).unwrap(); + let converted_arrow_schema = parquet_to_arrow_schema(&parquet_schema, None).unwrap(); let arrow_fields = Fields::from(vec![ Field::new("binary", DataType::Binary, false), @@ -711,8 +692,7 @@ mod tests { let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); - let converted_arrow_schema = - parquet_to_arrow_schema(&parquet_schema, None).unwrap(); + let converted_arrow_schema = parquet_to_arrow_schema(&parquet_schema, None).unwrap(); let arrow_fields = Fields::from(vec![ Field::new("boolean", DataType::Boolean, false), @@ -720,12 +700,9 @@ mod tests { ]); assert_eq!(&arrow_fields, converted_arrow_schema.fields()); - let converted_arrow_schema = parquet_to_arrow_schema_by_columns( - &parquet_schema, - ProjectionMask::all(), - None, - ) - .unwrap(); + let converted_arrow_schema = + parquet_to_arrow_schema_by_columns(&parquet_schema, ProjectionMask::all(), None) + .unwrap(); assert_eq!(&arrow_fields, converted_arrow_schema.fields()); } @@ -923,8 +900,7 @@ mod tests { let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); - let converted_arrow_schema = - parquet_to_arrow_schema(&parquet_schema, None).unwrap(); + let converted_arrow_schema = parquet_to_arrow_schema(&parquet_schema, None).unwrap(); let converted_fields = converted_arrow_schema.fields(); assert_eq!(arrow_fields.len(), converted_fields.len()); @@ -1002,8 +978,7 @@ mod tests { let parquet_group_type = parse_message_type(message_type).unwrap(); let
parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); - let converted_arrow_schema = - parquet_to_arrow_schema(&parquet_schema, None).unwrap(); + let converted_arrow_schema = parquet_to_arrow_schema(&parquet_schema, None).unwrap(); let converted_fields = converted_arrow_schema.fields(); assert_eq!(arrow_fields.len(), converted_fields.len()); @@ -1097,8 +1072,7 @@ mod tests { let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); - let converted_arrow_schema = - parquet_to_arrow_schema(&parquet_schema, None).unwrap(); + let converted_arrow_schema = parquet_to_arrow_schema(&parquet_schema, None).unwrap(); let converted_fields = converted_arrow_schema.fields(); assert_eq!(arrow_fields.len(), converted_fields.len()); @@ -1115,8 +1089,7 @@ mod tests { Field::new("leaf1", DataType::Boolean, false), Field::new("leaf2", DataType::Int32, false), ]); - let group1_struct = - Field::new("group1", DataType::Struct(group1_fields), false); + let group1_struct = Field::new("group1", DataType::Struct(group1_fields), false); arrow_fields.push(group1_struct); let leaf3_field = Field::new("leaf3", DataType::Int64, false); @@ -1135,8 +1108,7 @@ mod tests { let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); - let converted_arrow_schema = - parquet_to_arrow_schema(&parquet_schema, None).unwrap(); + let converted_arrow_schema = parquet_to_arrow_schema(&parquet_schema, None).unwrap(); let converted_fields = converted_arrow_schema.fields(); assert_eq!(arrow_fields.len(), converted_fields.len()); @@ -1289,8 +1261,7 @@ mod tests { let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); - let converted_arrow_schema = - parquet_to_arrow_schema(&parquet_schema, None).unwrap(); + let converted_arrow_schema = parquet_to_arrow_schema(&parquet_schema, None).unwrap(); let converted_fields = converted_arrow_schema.fields(); assert_eq!(arrow_fields.len(), converted_fields.len()); @@ -1515,20 +1486,11 @@ mod tests { vec![ Field::new("bools", DataType::Boolean, false), Field::new("uint32", DataType::UInt32, false), - Field::new_list( - "int32", - Field::new("element", DataType::Int32, true), - false, - ), + Field::new_list("int32", Field::new("element", DataType::Int32, true), false), ], false, ), - Field::new_dictionary( - "dictionary_strings", - DataType::Int32, - DataType::Utf8, - false, - ), + Field::new_dictionary("dictionary_strings", DataType::Int32, DataType::Utf8, false), Field::new("decimal_int32", DataType::Decimal128(8, 2), false), Field::new("decimal_int64", DataType::Decimal128(16, 2), false), Field::new("decimal_fix_length", DataType::Decimal128(30, 2), false), @@ -1613,10 +1575,8 @@ mod tests { let schema = Schema::new_with_metadata( vec![ - Field::new("c1", DataType::Utf8, false).with_metadata(meta(&[ - ("Key", "Foo"), - (PARQUET_FIELD_ID_META_KEY, "2"), - ])), + Field::new("c1", DataType::Utf8, false) + .with_metadata(meta(&[("Key", "Foo"), (PARQUET_FIELD_ID_META_KEY, "2")])), Field::new("c2", DataType::Binary, false), Field::new("c3", DataType::FixedSizeBinary(3), false), Field::new("c4", DataType::Boolean, false), @@ -1634,10 +1594,7 @@ mod tests { ), Field::new( "c17", - DataType::Timestamp( - TimeUnit::Microsecond, - Some("Africa/Johannesburg".into()), - ), + DataType::Timestamp(TimeUnit::Microsecond, 
Some("Africa/Johannesburg".into())), false, ), Field::new( @@ -1649,10 +1606,8 @@ mod tests { Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), Field::new_list( "c21", - Field::new("item", DataType::Boolean, true).with_metadata(meta(&[ - ("Key", "Bar"), - (PARQUET_FIELD_ID_META_KEY, "5"), - ])), + Field::new("item", DataType::Boolean, true) + .with_metadata(meta(&[("Key", "Bar"), (PARQUET_FIELD_ID_META_KEY, "5")])), false, ) .with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "4")])), @@ -1702,10 +1657,7 @@ mod tests { // Field::new("c30", DataType::Duration(TimeUnit::Nanosecond), false), Field::new_dict( "c31", - DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::Utf8), - ), + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), true, 123, true, @@ -1740,11 +1692,7 @@ mod tests { "c39", "key_value", Field::new("key", DataType::Utf8, false), - Field::new_list( - "value", - Field::new("element", DataType::Utf8, true), - true, - ), + Field::new_list("value", Field::new("element", DataType::Utf8, true), true), false, // fails to roundtrip keys_sorted true, ), @@ -1783,11 +1731,8 @@ mod tests { // write to an empty parquet file so that schema is serialized let file = tempfile::tempfile().unwrap(); - let writer = ArrowWriter::try_new( - file.try_clone().unwrap(), - Arc::new(schema.clone()), - None, - )?; + let writer = + ArrowWriter::try_new(file.try_clone().unwrap(), Arc::new(schema.clone()), None)?; writer.close()?; // read file back @@ -1846,33 +1791,23 @@ mod tests { }; let schema = Schema::new_with_metadata( vec![ - Field::new("c1", DataType::Utf8, true).with_metadata(meta(&[ - (PARQUET_FIELD_ID_META_KEY, "1"), - ])), - Field::new("c2", DataType::Utf8, true).with_metadata(meta(&[ - (PARQUET_FIELD_ID_META_KEY, "2"), - ])), + Field::new("c1", DataType::Utf8, true) + .with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "1")])), + Field::new("c2", DataType::Utf8, true) + .with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "2")])), ], HashMap::new(), ); - let writer = ArrowWriter::try_new( - vec![], - Arc::new(schema.clone()), - None, - )?; + let writer = ArrowWriter::try_new(vec![], Arc::new(schema.clone()), None)?; let parquet_bytes = writer.into_inner()?; - let reader = crate::file::reader::SerializedFileReader::new( - bytes::Bytes::from(parquet_bytes), - )?; + let reader = + crate::file::reader::SerializedFileReader::new(bytes::Bytes::from(parquet_bytes))?; let schema_descriptor = reader.metadata().file_metadata().schema_descr_ptr(); // don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema - let arrow_schema = crate::arrow::parquet_to_arrow_schema( - &schema_descriptor, - None, - )?; + let arrow_schema = crate::arrow::parquet_to_arrow_schema(&schema_descriptor, None)?; let parq_schema_descr = crate::arrow::arrow_to_parquet_schema(&arrow_schema)?; let parq_fields = parq_schema_descr.root_schema().get_fields(); @@ -1885,19 +1820,14 @@ mod tests { #[test] fn test_arrow_schema_roundtrip_lists() -> Result<()> { - let metadata: HashMap<String, String> = - [("Key".to_string(), "Value".to_string())] - .iter() - .cloned() - .collect(); + let metadata: HashMap<String, String> = [("Key".to_string(), "Value".to_string())] + .iter() + .cloned() + .collect(); let schema = Schema::new_with_metadata( vec![ - Field::new_list( - "c21", - Field::new("array", DataType::Boolean, true), - false, - ), + Field::new_list("c21", Field::new("array", DataType::Boolean, true), false), Field::new( "c22", DataType::FixedSizeList( @@ -1928,11 +1858,8 @@ mod
tests { // write to an empty parquet file so that schema is serialized let file = tempfile::tempfile().unwrap(); - let writer = ArrowWriter::try_new( - file.try_clone().unwrap(), - Arc::new(schema.clone()), - None, - )?; + let writer = + ArrowWriter::try_new(file.try_clone().unwrap(), Arc::new(schema.clone()), None)?; writer.close()?; // read file back From ada986c7ec8f8fe4f94235c8aaeba4995392ee72 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 15 Mar 2024 20:30:08 +1300 Subject: [PATCH 08/11] Prepare arrow 51.0.0 (#5516) --- CHANGELOG-old.md | 141 ++++++++++++++++++ CHANGELOG.md | 248 +++++++++++++++---------- Cargo.toml | 32 ++-- dev/release/update_change_log.sh | 4 +- 4 files changed, 281 insertions(+), 144 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 5df29b350b85..b86431397139 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,147 @@ # Historical Changelog +## [50.0.0](https://github.com/apache/arrow-rs/tree/50.0.0) (2024-01-08) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/49.0.0...50.0.0) + +**Breaking changes:** + +- Make regexp\_match take scalar pattern and flag [\#5245](https://github.com/apache/arrow-rs/pull/5245) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Use Vec in ColumnReader \(\#5177\) [\#5193](https://github.com/apache/arrow-rs/pull/5193) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Remove SIMD Feature [\#5184](https://github.com/apache/arrow-rs/pull/5184) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use Total Ordering for Aggregates and Refactor for Better Auto-Vectorization [\#5100](https://github.com/apache/arrow-rs/pull/5100) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Allow the `zip` compute function to operate on `Scalar` values via `Datum` [\#5086](https://github.com/apache/arrow-rs/pull/5086) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Nathan-Fenner](https://github.com/Nathan-Fenner)) +- Improve C Data Interface and Add Integration Testing Entrypoints [\#5080](https://github.com/apache/arrow-rs/pull/5080) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pitrou](https://github.com/pitrou)) +- Parquet: read/write f16 for Arrow
[\#5252](https://github.com/apache/arrow-rs/issues/5252) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make regexp\_match take scalar pattern and flag [\#5246](https://github.com/apache/arrow-rs/issues/5246) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cannot access pen state website on arrow-row [\#5238](https://github.com/apache/arrow-rs/issues/5238) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- RecordBatch with\_schema's error message is hard to read [\#5227](https://github.com/apache/arrow-rs/issues/5227) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support cast between StructArray. [\#5219](https://github.com/apache/arrow-rs/issues/5219) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Remove nightly-only simd feature and related code in ArrowNumericType [\#5185](https://github.com/apache/arrow-rs/issues/5185) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use Vec instead of Slice in ColumnReader [\#5177](https://github.com/apache/arrow-rs/issues/5177) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Request to Memmap Arrow IPC files on disk [\#5153](https://github.com/apache/arrow-rs/issues/5153) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- GenericColumnReader::read\_records Yields Truncated Records [\#5150](https://github.com/apache/arrow-rs/issues/5150) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Nested Schema Projection [\#5148](https://github.com/apache/arrow-rs/issues/5148) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support specifying `quote` and `escape` in Csv `WriterBuilder` [\#5146](https://github.com/apache/arrow-rs/issues/5146) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support casting of Float16 with other numeric types [\#5138](https://github.com/apache/arrow-rs/issues/5138) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet: read parquet metadata with page index in async and with size hints [\#5129](https://github.com/apache/arrow-rs/issues/5129) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Cast from floating/timestamp to timestamp/floating [\#5122](https://github.com/apache/arrow-rs/issues/5122) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Casting List To/From LargeList in Cast Kernel [\#5113](https://github.com/apache/arrow-rs/issues/5113) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Expose a path for converting `bytes::Bytes` into `arrow_buffer::Buffer` without copy [\#5104](https://github.com/apache/arrow-rs/issues/5104) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- API inconsistency of ListBuilder make it hard to use as nested builder [\#5098](https://github.com/apache/arrow-rs/issues/5098) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet: don't truncate min/max statistics for float16 and decimal when writing file [\#5075](https://github.com/apache/arrow-rs/issues/5075) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Parquet: derive boundary order when writing columns 
[\#5074](https://github.com/apache/arrow-rs/issues/5074) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support new Arrow PyCapsule Interface for Python FFI [\#5067](https://github.com/apache/arrow-rs/issues/5067) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `48.0.1 ` arrow patch release [\#5050](https://github.com/apache/arrow-rs/issues/5050) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Binary columns do not receive truncated statistics [\#5037](https://github.com/apache/arrow-rs/issues/5037) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Re-evaluate Explicit SIMD Aggregations [\#5032](https://github.com/apache/arrow-rs/issues/5032) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Min/Max Kernels Should Use Total Ordering [\#5031](https://github.com/apache/arrow-rs/issues/5031) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Allow `zip` compute kernel to take `Scalar` / `Datum` [\#5011](https://github.com/apache/arrow-rs/issues/5011) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add Float16/Half-float logical type to Parquet [\#4986](https://github.com/apache/arrow-rs/issues/4986) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- feat: cast \(Large\)List to FixedSizeList [\#5081](https://github.com/apache/arrow-rs/pull/5081) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Update Parquet Encoding Documentation [\#5051](https://github.com/apache/arrow-rs/issues/5051) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Fixed bugs:** + +- json schema inference can't handle null field turned into object field in subsequent rows [\#5215](https://github.com/apache/arrow-rs/issues/5215) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Invalid trailing content after `Z` in timezone is ignored [\#5182](https://github.com/apache/arrow-rs/issues/5182) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Take panics on a fixed size list array when given null indices [\#5169](https://github.com/apache/arrow-rs/issues/5169) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- EnabledStatistics::Page does not take effect on ByteArrayEncoder [\#5162](https://github.com/apache/arrow-rs/issues/5162) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Parquet: ColumnOrder not being written when writing parquet files [\#5152](https://github.com/apache/arrow-rs/issues/5152) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Parquet: Interval columns shouldn't write min/max stats [\#5145](https://github.com/apache/arrow-rs/issues/5145) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- cast `Utf8` to decimal failure [\#5127](https://github.com/apache/arrow-rs/issues/5127) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- coerce\_primitive not honored when decoding from serde object [\#5095](https://github.com/apache/arrow-rs/issues/5095) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Unsound MutableArrayData Constructor 
[\#5091](https://github.com/apache/arrow-rs/issues/5091) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- RowGroupReader.get\_row\_iter\(\) fails with Path ColumnPath not found [\#5064](https://github.com/apache/arrow-rs/issues/5064) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- cast format 'yyyymmdd' to Date32 give a error [\#5044](https://github.com/apache/arrow-rs/issues/5044) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Performance improvements:** + +- ArrowArrayStreamReader imports FFI\_ArrowSchema on each iteration [\#5103](https://github.com/apache/arrow-rs/issues/5103) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- Working example of list\_flights with ObjectStore [\#5116](https://github.com/apache/arrow-rs/issues/5116) +- \(object\_store\) Error broken pipe on S3 multipart upload [\#5106](https://github.com/apache/arrow-rs/issues/5106) + +**Merged pull requests:** + +- Update parquet object\_store dependency to 0.9.0 [\#5290](https://github.com/apache/arrow-rs/pull/5290) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.75 to =1.0.76 [\#5289](https://github.com/apache/arrow-rs/pull/5289) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Enable JS tests again [\#5287](https://github.com/apache/arrow-rs/pull/5287) ([domoritz](https://github.com/domoritz)) +- Update proc-macro2 requirement from =1.0.74 to =1.0.75 [\#5279](https://github.com/apache/arrow-rs/pull/5279) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update proc-macro2 requirement from =1.0.73 to =1.0.74 [\#5271](https://github.com/apache/arrow-rs/pull/5271) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update proc-macro2 requirement from =1.0.71 to =1.0.73 [\#5265](https://github.com/apache/arrow-rs/pull/5265) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update docs for datatypes [\#5260](https://github.com/apache/arrow-rs/pull/5260) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Don't suppress errors in ArrowArrayStreamReader [\#5256](https://github.com/apache/arrow-rs/pull/5256) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add IPC FileDecoder [\#5249](https://github.com/apache/arrow-rs/pull/5249) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- optimize the next function of ArrowArrayStreamReader [\#5248](https://github.com/apache/arrow-rs/pull/5248) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([doki23](https://github.com/doki23)) +- ci: Fail Miri CI on first failure [\#5243](https://github.com/apache/arrow-rs/pull/5243) ([Jefffrey](https://github.com/Jefffrey)) +- Remove 'unwrap' from Result [\#5241](https://github.com/apache/arrow-rs/pull/5241) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) +- Update arrow-row docs URL [\#5239](https://github.com/apache/arrow-rs/pull/5239) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([thomas-k-cameron](https://github.com/thomas-k-cameron)) +- Improve regexp kernels performance by avoiding cloning Regex [\#5235](https://github.com/apache/arrow-rs/pull/5235) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update proc-macro2 requirement from =1.0.70 to =1.0.71 [\#5231](https://github.com/apache/arrow-rs/pull/5231) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Minor: Improve comments and errors for ArrowPredicate [\#5230](https://github.com/apache/arrow-rs/pull/5230) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Bump actions/upload-pages-artifact from 2 to 3 [\#5229](https://github.com/apache/arrow-rs/pull/5229) ([dependabot[bot]](https://github.com/apps/dependabot)) +- make with\_schema's error more readable [\#5228](https://github.com/apache/arrow-rs/pull/5228) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([shuoli84](https://github.com/shuoli84)) +- Use `try_new` when casting between structs to propagate error [\#5226](https://github.com/apache/arrow-rs/pull/5226) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- feat\(cast\): support cast between struct [\#5221](https://github.com/apache/arrow-rs/pull/5221) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([my-vegetable-has-exploded](https://github.com/my-vegetable-has-exploded)) +- Add `entries` to `MapBuilder` to return both key and value array builders [\#5218](https://github.com/apache/arrow-rs/pull/5218) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- fix\(json\): fix inferring object after field was null [\#5216](https://github.com/apache/arrow-rs/pull/5216) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kskalski](https://github.com/kskalski)) +- Support MapBuilder in make\_builder [\#5210](https://github.com/apache/arrow-rs/pull/5210) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- impl `From>` for `ScalarBuffer` [\#5203](https://github.com/apache/arrow-rs/pull/5203) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- impl `From>` for `Buffer` [\#5202](https://github.com/apache/arrow-rs/pull/5202) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- impl `From>` for 
`ScalarBuffer` [\#5201](https://github.com/apache/arrow-rs/pull/5201) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- feat: Support quote and escape in Csv WriterBuilder [\#5196](https://github.com/apache/arrow-rs/pull/5196) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([my-vegetable-has-exploded](https://github.com/my-vegetable-has-exploded)) +- chore: simplify cast\_string\_to\_interval [\#5195](https://github.com/apache/arrow-rs/pull/5195) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Clarify interval comparison behavior with documentation and tests [\#5192](https://github.com/apache/arrow-rs/pull/5192) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add `BooleanArray::into_parts` method [\#5191](https://github.com/apache/arrow-rs/pull/5191) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- Fix deprecated note for `Buffer::from_raw_parts` [\#5190](https://github.com/apache/arrow-rs/pull/5190) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- Fix: Ensure Timestamp Parsing Rejects Characters After 'Z [\#5189](https://github.com/apache/arrow-rs/pull/5189) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([razeghi71](https://github.com/razeghi71)) +- Simplify parquet statistics generation [\#5183](https://github.com/apache/arrow-rs/pull/5183) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Parquet: Ensure page statistics are written only when configured from the Arrow Writer [\#5181](https://github.com/apache/arrow-rs/pull/5181) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([AdamGS](https://github.com/AdamGS)) +- Blockwise IO in IPC FileReader \(\#5153\) [\#5179](https://github.com/apache/arrow-rs/pull/5179) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Replace ScalarBuffer in Parquet with Vec \(\#1849\) \(\#5177\) [\#5178](https://github.com/apache/arrow-rs/pull/5178) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Bump actions/setup-python from 4 to 5 [\#5175](https://github.com/apache/arrow-rs/pull/5175) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add `LargeListBuilder` to `make_builder` [\#5171](https://github.com/apache/arrow-rs/pull/5171) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- fix: ensure take\_fixed\_size\_list can handle null indices [\#5170](https://github.com/apache/arrow-rs/pull/5170) ([westonpace](https://github.com/westonpace)) +- Removing redundant `as casts` in parquet [\#5168](https://github.com/apache/arrow-rs/pull/5168) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([psvri](https://github.com/psvri)) +- Bump actions/labeler from 4.3.0 to 5.0.0 [\#5167](https://github.com/apache/arrow-rs/pull/5167)
([dependabot[bot]](https://github.com/apps/dependabot)) +- improve: make RunArray displayable [\#5166](https://github.com/apache/arrow-rs/pull/5166) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yukkit](https://github.com/yukkit)) +- ci: Add cargo audit CI action [\#5160](https://github.com/apache/arrow-rs/pull/5160) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Parquet: write column\_orders in FileMetaData [\#5158](https://github.com/apache/arrow-rs/pull/5158) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jefffrey](https://github.com/Jefffrey)) +- Adding `is_null` datatype shortcut method [\#5157](https://github.com/apache/arrow-rs/pull/5157) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) +- Parquet: don't truncate f16/decimal min/max stats [\#5154](https://github.com/apache/arrow-rs/pull/5154) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jefffrey](https://github.com/Jefffrey)) +- Support nested schema projection \(\#5148\) [\#5149](https://github.com/apache/arrow-rs/pull/5149) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Parquet: omit min/max for interval columns when writing stats [\#5147](https://github.com/apache/arrow-rs/pull/5147) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jefffrey](https://github.com/Jefffrey)) +- Deprecate Fields::remove and Schema::remove [\#5144](https://github.com/apache/arrow-rs/pull/5144) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support casting of Float16 with other numeric types [\#5139](https://github.com/apache/arrow-rs/pull/5139) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Parquet: Make `MetadataLoader` public [\#5137](https://github.com/apache/arrow-rs/pull/5137) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([AdamGS](https://github.com/AdamGS)) +- Add FileReaderBuilder for arrow-ipc to allow reading large no. 
of column files [\#5136](https://github.com/apache/arrow-rs/pull/5136) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Parquet: clear metadata and project fields of ParquetRecordBatchStream::schema [\#5135](https://github.com/apache/arrow-rs/pull/5135) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jefffrey](https://github.com/Jefffrey)) +- JSON: write struct array nulls as null [\#5133](https://github.com/apache/arrow-rs/pull/5133) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Update proc-macro2 requirement from =1.0.69 to =1.0.70 [\#5131](https://github.com/apache/arrow-rs/pull/5131) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix negative decimal string [\#5128](https://github.com/apache/arrow-rs/pull/5128) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Cleanup list casting and support nested lists \(\#5113\) [\#5124](https://github.com/apache/arrow-rs/pull/5124) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cast from numeric/timestamp to timestamp/numeric [\#5123](https://github.com/apache/arrow-rs/pull/5123) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Improve cast docs [\#5114](https://github.com/apache/arrow-rs/pull/5114) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update prost-build requirement from =0.12.2 to =0.12.3 [\#5112](https://github.com/apache/arrow-rs/pull/5112) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Parquet: derive boundary order when writing [\#5110](https://github.com/apache/arrow-rs/pull/5110) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jefffrey](https://github.com/Jefffrey)) +- Implementing `ArrayBuilder` for `Box<dyn ArrayBuilder>` [\#5109](https://github.com/apache/arrow-rs/pull/5109) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix 'ColumnPath not found' error reading Parquet files with nested REPEATED fields [\#5102](https://github.com/apache/arrow-rs/pull/5102) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mmaitre314](https://github.com/mmaitre314)) +- fix: coerce\_primitive for serde decoded data [\#5101](https://github.com/apache/arrow-rs/pull/5101) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fansehep](https://github.com/fansehep)) +- Extend aggregation benchmarks [\#5096](https://github.com/apache/arrow-rs/pull/5096) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Expand parquet crate overview doc [\#5093](https://github.com/apache/arrow-rs/pull/5093)
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mmaitre314](https://github.com/mmaitre314)) +- Ensure arrays passed to MutableArrayData have same type \(\#5091\) [\#5092](https://github.com/apache/arrow-rs/pull/5092) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update prost-build requirement from =0.12.1 to =0.12.2 [\#5088](https://github.com/apache/arrow-rs/pull/5088) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add FFI from\_raw [\#5082](https://github.com/apache/arrow-rs/pull/5082) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- \[fix \#5044\] Support converting 'yyyymmdd' format to date [\#5078](https://github.com/apache/arrow-rs/pull/5078) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Tangruilin](https://github.com/Tangruilin)) +- Enable truncation of binary statistics columns [\#5076](https://github.com/apache/arrow-rs/pull/5076) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([emcake](https://github.com/emcake)) ## [49.0.0](https://github.com/apache/arrow-rs/tree/49.0.0) (2023-11-07) [Full Changelog](https://github.com/apache/arrow-rs/compare/48.0.0...49.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c173bfdeda0..2eac54afaf32 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,148 +19,144 @@ # Changelog -## [50.0.0](https://github.com/apache/arrow-rs/tree/50.0.0) (2024-01-08) +## [51.0.0](https://github.com/apache/arrow-rs/tree/51.0.0) (2024-03-15) -[Full Changelog](https://github.com/apache/arrow-rs/compare/49.0.0...50.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/50.0.0...51.0.0) **Breaking changes:** -- Make regexp\_match take scalar pattern and flag [\#5245](https://github.com/apache/arrow-rs/pull/5245) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Use Vec in ColumnReader \(\#5177\) [\#5193](https://github.com/apache/arrow-rs/pull/5193) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Remove SIMD Feature [\#5184](https://github.com/apache/arrow-rs/pull/5184) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use Total Ordering for Aggregates and Refactor for Better Auto-Vectorization [\#5100](https://github.com/apache/arrow-rs/pull/5100) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- Allow the `zip` compute function to operator on `Scalar` values via `Datum` [\#5086](https://github.com/apache/arrow-rs/pull/5086) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Nathan-Fenner](https://github.com/Nathan-Fenner)) -- Improve C Data Interface and Add Integration Testing Entrypoints [\#5080](https://github.com/apache/arrow-rs/pull/5080) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pitrou](https://github.com/pitrou)) -- Parquet: read/write f16 for Arrow 
[\#5003](https://github.com/apache/arrow-rs/pull/5003) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jefffrey](https://github.com/Jefffrey)) +- Remove internal buffering from AsyncArrowWriter \(\#5484\) [\#5485](https://github.com/apache/arrow-rs/pull/5485) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Make ArrayBuilder also Sync [\#5353](https://github.com/apache/arrow-rs/pull/5353) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dvic](https://github.com/dvic)) +- Raw JSON writer \(~10x faster\) \(\#5314\) [\#5318](https://github.com/apache/arrow-rs/pull/5318) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Support get offsets or blocks info from arrow file. [\#5252](https://github.com/apache/arrow-rs/issues/5252) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Make regexp\_match take scalar pattern and flag [\#5246](https://github.com/apache/arrow-rs/issues/5246) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Cannot access pen state website on arrow-row [\#5238](https://github.com/apache/arrow-rs/issues/5238) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- RecordBatch with\_schema's error message is hard to read [\#5227](https://github.com/apache/arrow-rs/issues/5227) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support cast between StructArray. [\#5219](https://github.com/apache/arrow-rs/issues/5219) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Remove nightly-only simd feature and related code in ArrowNumericType [\#5185](https://github.com/apache/arrow-rs/issues/5185) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use Vec instead of Slice in ColumnReader [\#5177](https://github.com/apache/arrow-rs/issues/5177) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Request to Memmap Arrow IPC files on disk [\#5153](https://github.com/apache/arrow-rs/issues/5153) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- GenericColumnReader::read\_records Yields Truncated Records [\#5150](https://github.com/apache/arrow-rs/issues/5150) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Nested Schema Projection [\#5148](https://github.com/apache/arrow-rs/issues/5148) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support specifying `quote` and `escape` in Csv `WriterBuilder` [\#5146](https://github.com/apache/arrow-rs/issues/5146) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support casting of Float16 with other numeric types [\#5138](https://github.com/apache/arrow-rs/issues/5138) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Parquet: read parquet metadata with page index in async and with size hints [\#5129](https://github.com/apache/arrow-rs/issues/5129) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Cast from floating/timestamp to timestamp/floating [\#5122](https://github.com/apache/arrow-rs/issues/5122) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support Casting List To/From LargeList in Cast Kernel [\#5113](https://github.com/apache/arrow-rs/issues/5113) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Expose a path for converting `bytes::Bytes` into `arrow_buffer::Buffer` without copy [\#5104](https://github.com/apache/arrow-rs/issues/5104) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- API inconsistency of ListBuilder make it hard to use as nested builder [\#5098](https://github.com/apache/arrow-rs/issues/5098) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Parquet: don't truncate min/max statistics for float16 and decimal when writing file [\#5075](https://github.com/apache/arrow-rs/issues/5075) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Parquet: derive boundary order when writing columns [\#5074](https://github.com/apache/arrow-rs/issues/5074) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Support new Arrow PyCapsule Interface for Python FFI [\#5067](https://github.com/apache/arrow-rs/issues/5067) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `48.0.1 ` arrow patch release [\#5050](https://github.com/apache/arrow-rs/issues/5050) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Binary columns do not receive truncated statistics [\#5037](https://github.com/apache/arrow-rs/issues/5037) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Re-evaluate Explicit SIMD Aggregations [\#5032](https://github.com/apache/arrow-rs/issues/5032) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Min/Max Kernels Should Use Total Ordering [\#5031](https://github.com/apache/arrow-rs/issues/5031) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Allow `zip` compute kernel to take `Scalar` / `Datum` [\#5011](https://github.com/apache/arrow-rs/issues/5011) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add Float16/Half-float logical type to Parquet [\#4986](https://github.com/apache/arrow-rs/issues/4986) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- feat: cast \(Large\)List to FixedSizeList [\#5081](https://github.com/apache/arrow-rs/pull/5081) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) -- Update Parquet Encoding Documentation [\#5051](https://github.com/apache/arrow-rs/issues/5051) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Prototype Arrow over HTTP in Rust [\#5496](https://github.com/apache/arrow-rs/issues/5496) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add DataType::ListView and DataType::LargeListView [\#5492](https://github.com/apache/arrow-rs/issues/5492) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve documentation around handling of dictionary arrays in arrow flight [\#5487](https://github.com/apache/arrow-rs/issues/5487) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
[[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Better memory limiting in parquet `ArrowWriter` [\#5484](https://github.com/apache/arrow-rs/issues/5484) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support Creating Non-Nullable Lists and Maps within a Struct [\#5482](https://github.com/apache/arrow-rs/issues/5482) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[DISCUSSION\] Better borrow propagation \(e.g. `RecordBatch::schema()` to return `&SchemaRef` vs `SchemaRef`\) [\#5463](https://github.com/apache/arrow-rs/issues/5463) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Build Scalar with ArrayRef [\#5459](https://github.com/apache/arrow-rs/issues/5459) +- AsyncArrowWriter doesn't limit underlying ArrowWriter to respect buffer-size [\#5450](https://github.com/apache/arrow-rs/issues/5450) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Refine `Display` implementation for `FlightError` [\#5438](https://github.com/apache/arrow-rs/issues/5438) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Better ergonomics for `FixedSizeList` and `LargeList` [\#5372](https://github.com/apache/arrow-rs/issues/5372) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Update Flight proto [\#5367](https://github.com/apache/arrow-rs/issues/5367) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Support check similar datatype but with different magnitudes [\#5358](https://github.com/apache/arrow-rs/issues/5358) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Buffer memory usage for custom allocations is reported as 0 [\#5346](https://github.com/apache/arrow-rs/issues/5346) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Can the ArrayBuilder trait be made Sync? 
[\#5344](https://github.com/apache/arrow-rs/issues/5344) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- support cast 'UTF8' to `FixedSizeList` [\#5339](https://github.com/apache/arrow-rs/issues/5339) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Creating Non-Nullable Lists with ListBuilder [\#5330](https://github.com/apache/arrow-rs/issues/5330) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `ParquetRecordBatchStreamBuilder::new()` panics instead of erroring out when opening a corrupted file [\#5315](https://github.com/apache/arrow-rs/issues/5315) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Raw JSON Writer [\#5314](https://github.com/apache/arrow-rs/issues/5314) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support for more fused boolean operations [\#5297](https://github.com/apache/arrow-rs/issues/5297) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet: Allow disabling embed `ARROW_SCHEMA_META_KEY` added by the `ArrowWriter` [\#5296](https://github.com/apache/arrow-rs/issues/5296) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support casting strings like '2001-01-01 01:01:01' to Date32 [\#5280](https://github.com/apache/arrow-rs/issues/5280) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Temporal Extract/Date Part Kernel [\#5266](https://github.com/apache/arrow-rs/issues/5266) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support for extracting hours/minutes/seconds/etc. from `Time32`/`Time64` type in temporal kernels [\#5261](https://github.com/apache/arrow-rs/issues/5261) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet: add method to get both the inner writer and the file metadata when closing SerializedFileWriter [\#5253](https://github.com/apache/arrow-rs/issues/5253) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Release arrow-rs version 50.0.0 [\#5234](https://github.com/apache/arrow-rs/issues/5234) **Fixed bugs:** -- json schema inference can't handle null field turned into object field in subsequent rows [\#5215](https://github.com/apache/arrow-rs/issues/5215) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Invalid trailing content after `Z` in timezone is ignored [\#5182](https://github.com/apache/arrow-rs/issues/5182) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Take panics on a fixed size list array when given null indices [\#5169](https://github.com/apache/arrow-rs/issues/5169) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- EnabledStatistics::Page does not take effect on ByteArrayEncoder [\#5162](https://github.com/apache/arrow-rs/issues/5162) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Parquet: ColumnOrder not being written when writing parquet files [\#5152](https://github.com/apache/arrow-rs/issues/5152) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Parquet: Interval columns shouldn't write min/max stats [\#5145](https://github.com/apache/arrow-rs/issues/5145) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- cast `Utf8` to decimal failure 
[\#5127](https://github.com/apache/arrow-rs/issues/5127) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- coerce\_primitive not honored when decoding from serde object [\#5095](https://github.com/apache/arrow-rs/issues/5095) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Unsound MutableArrayData Constructor [\#5091](https://github.com/apache/arrow-rs/issues/5091) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- RowGroupReader.get\_row\_iter\(\) fails with Path ColumnPath not found [\#5064](https://github.com/apache/arrow-rs/issues/5064) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- cast format 'yyyymmdd' to Date32 give a error [\#5044](https://github.com/apache/arrow-rs/issues/5044) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Empty String Parses as Zero in Unreleased Arrow [\#5504](https://github.com/apache/arrow-rs/issues/5504) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Unused import in nightly rust [\#5476](https://github.com/apache/arrow-rs/issues/5476) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Error `The data type type List .. has no natural order` when using `arrow::compute::lexsort_to_indices` with list and more than one column [\#5454](https://github.com/apache/arrow-rs/issues/5454) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Wrong size assertion in arrow\_buffer::builder::NullBufferBuilder::new\_from\_buffer [\#5445](https://github.com/apache/arrow-rs/issues/5445) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Inconsistency between comments and code implementation [\#5430](https://github.com/apache/arrow-rs/issues/5430) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- OOB access in `Buffer::from_iter` [\#5412](https://github.com/apache/arrow-rs/issues/5412) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cast kernel doesn't return null for string to integral cases when overflowing under safe option enabled [\#5397](https://github.com/apache/arrow-rs/issues/5397) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make ffi consume variable layout arrays with empty offsets [\#5391](https://github.com/apache/arrow-rs/issues/5391) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- RecordBatch conversion from pyarrow loses Schema's metadata [\#5354](https://github.com/apache/arrow-rs/issues/5354) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Debug output of Time32/Time64 arrays with invalid values has confusing nulls [\#5336](https://github.com/apache/arrow-rs/issues/5336) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Removing a column from a `RecordBatch` drops schema metadata [\#5327](https://github.com/apache/arrow-rs/issues/5327) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Panic when read an empty parquet file [\#5304](https://github.com/apache/arrow-rs/issues/5304) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- How to enable statistics for string columns? 
[\#5270](https://github.com/apache/arrow-rs/issues/5270) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `concat::tests::test_string_dictionary_merge failure` fails on Mac / has different results in different platforms [\#5255](https://github.com/apache/arrow-rs/issues/5255) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- Minor: Add doc comments to `GenericByteViewArray` [\#5512](https://github.com/apache/arrow-rs/pull/5512) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve docs for logical and physical nulls even more [\#5434](https://github.com/apache/arrow-rs/pull/5434) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add example of converting RecordBatches to JSON objects [\#5364](https://github.com/apache/arrow-rs/pull/5364) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) **Performance improvements:** -- ArrowArrayStreamReader imports FFI\_ArrowSchema on each iteration [\#5103](https://github.com/apache/arrow-rs/issues/5103) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- improve float to string cast by ~20%-40% [\#5401](https://github.com/apache/arrow-rs/pull/5401) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) **Closed issues:** -- Working example of list\_flights with ObjectStore [\#5116](https://github.com/apache/arrow-rs/issues/5116) -- \(object\_store\) Error broken pipe on S3 multipart upload [\#5106](https://github.com/apache/arrow-rs/issues/5106) +- Add `StringViewArray` implementation and layout and basic construction + tests [\#5469](https://github.com/apache/arrow-rs/issues/5469) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `DataType::Utf8View` and `DataType::BinaryView` [\#5468](https://github.com/apache/arrow-rs/issues/5468) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** -- Update parquet object\_store dependency to 0.9.0 [\#5290](https://github.com/apache/arrow-rs/pull/5290) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Update proc-macro2 requirement from =1.0.75 to =1.0.76 [\#5289](https://github.com/apache/arrow-rs/pull/5289) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Enable JS tests again [\#5287](https://github.com/apache/arrow-rs/pull/5287) ([domoritz](https://github.com/domoritz)) -- Update proc-macro2 requirement from =1.0.74 to =1.0.75 [\#5279](https://github.com/apache/arrow-rs/pull/5279) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Update proc-macro2 requirement from =1.0.73 to =1.0.74 [\#5271](https://github.com/apache/arrow-rs/pull/5271) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Update proc-macro2 requirement from =1.0.71 to =1.0.73 [\#5265](https://github.com/apache/arrow-rs/pull/5265) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Update docs for datatypes [\#5260](https://github.com/apache/arrow-rs/pull/5260) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) -- Don't suppress errors in ArrowArrayStreamReader [\#5256](https://github.com/apache/arrow-rs/pull/5256) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add IPC FileDecoder [\#5249](https://github.com/apache/arrow-rs/pull/5249) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- optimize the next function of ArrowArrayStreamReader [\#5248](https://github.com/apache/arrow-rs/pull/5248) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([doki23](https://github.com/doki23)) -- ci: Fail Miri CI on first failure [\#5243](https://github.com/apache/arrow-rs/pull/5243) ([Jefffrey](https://github.com/Jefffrey)) -- Remove 'unwrap' from Result [\#5241](https://github.com/apache/arrow-rs/pull/5241) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) -- Update arrow-row docs URL [\#5239](https://github.com/apache/arrow-rs/pull/5239) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([thomas-k-cameron](https://github.com/thomas-k-cameron)) -- Improve regexp kernels performance by avoiding cloning Regex [\#5235](https://github.com/apache/arrow-rs/pull/5235) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Update proc-macro2 requirement from =1.0.70 to =1.0.71 [\#5231](https://github.com/apache/arrow-rs/pull/5231) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Minor: Improve comments and errors for ArrowPredicate [\#5230](https://github.com/apache/arrow-rs/pull/5230) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Bump actions/upload-pages-artifact from 2 to 3 [\#5229](https://github.com/apache/arrow-rs/pull/5229) ([dependabot[bot]](https://github.com/apps/dependabot)) -- make with\_schema's error more readable [\#5228](https://github.com/apache/arrow-rs/pull/5228) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([shuoli84](https://github.com/shuoli84)) -- Use `try_new` when casting between structs to propagate error [\#5226](https://github.com/apache/arrow-rs/pull/5226) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- feat\(cast\): support cast between struct 
[\#5221](https://github.com/apache/arrow-rs/pull/5221) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([my-vegetable-has-exploded](https://github.com/my-vegetable-has-exploded)) -- Add `entries` to `MapBuilder` to return both key and value array builders [\#5218](https://github.com/apache/arrow-rs/pull/5218) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- fix\(json\): fix inferring object after field was null [\#5216](https://github.com/apache/arrow-rs/pull/5216) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kskalski](https://github.com/kskalski)) -- Support MapBuilder in make\_builder [\#5210](https://github.com/apache/arrow-rs/pull/5210) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- impl `From<Vec<T>>` for `ScalarBuffer<T>` [\#5203](https://github.com/apache/arrow-rs/pull/5203) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- impl `From<Vec<T>>` for `Buffer` [\#5202](https://github.com/apache/arrow-rs/pull/5202) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- impl `From<BufferBuilder<T>>` for `ScalarBuffer<T>` [\#5201](https://github.com/apache/arrow-rs/pull/5201) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- feat: Support quote and escape in Csv WriterBuilder [\#5196](https://github.com/apache/arrow-rs/pull/5196) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([my-vegetable-has-exploded](https://github.com/my-vegetable-has-exploded)) -- chore: simplify cast\_string\_to\_interval [\#5195](https://github.com/apache/arrow-rs/pull/5195) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) -- Clarify interval comparison behavior with documentation and tests [\#5192](https://github.com/apache/arrow-rs/pull/5192) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Add `BooleanArray::into_parts` method [\#5191](https://github.com/apache/arrow-rs/pull/5191) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- Fix deprecated note for `Buffer::from_raw_parts` [\#5190](https://github.com/apache/arrow-rs/pull/5190) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- Fix: Ensure Timestamp Parsing Rejects Characters After 'Z [\#5189](https://github.com/apache/arrow-rs/pull/5189) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([razeghi71](https://github.com/razeghi71)) -- Simplify parquet statistics generation [\#5183](https://github.com/apache/arrow-rs/pull/5183) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Parquet: Ensure page statistics are written only when conifgured from the Arrow Writer [\#5181](https://github.com/apache/arrow-rs/pull/5181) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([AdamGS](https://github.com/AdamGS)) -- Blockwise IO in IPC FileReader \(\#5153\) 
[\#5179](https://github.com/apache/arrow-rs/pull/5179) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Replace ScalarBuffer in Parquet with Vec \(\#1849\) \(\#5177\) [\#5178](https://github.com/apache/arrow-rs/pull/5178) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Bump actions/setup-python from 4 to 5 [\#5175](https://github.com/apache/arrow-rs/pull/5175) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Add `LargeListBuilder` to `make_builder` [\#5171](https://github.com/apache/arrow-rs/pull/5171) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- fix: ensure take\_fixed\_size\_list can handle null indices [\#5170](https://github.com/apache/arrow-rs/pull/5170) ([westonpace](https://github.com/westonpace)) -- Removing redundant `as casts` in parquet [\#5168](https://github.com/apache/arrow-rs/pull/5168) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([psvri](https://github.com/psvri)) -- Bump actions/labeler from 4.3.0 to 5.0.0 [\#5167](https://github.com/apache/arrow-rs/pull/5167) ([dependabot[bot]](https://github.com/apps/dependabot)) -- improve: make RunArray displayable [\#5166](https://github.com/apache/arrow-rs/pull/5166) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yukkit](https://github.com/yukkit)) -- ci: Add cargo audit CI action [\#5160](https://github.com/apache/arrow-rs/pull/5160) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) -- Parquet: write column\_orders in FileMetaData [\#5158](https://github.com/apache/arrow-rs/pull/5158) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jefffrey](https://github.com/Jefffrey)) -- Adding `is_null` datatype shortcut method [\#5157](https://github.com/apache/arrow-rs/pull/5157) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) -- Parquet: don't truncate f16/decimal min/max stats [\#5154](https://github.com/apache/arrow-rs/pull/5154) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jefffrey](https://github.com/Jefffrey)) -- Support nested schema projection \(\#5148\) [\#5149](https://github.com/apache/arrow-rs/pull/5149) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Parquet: omit min/max for interval columns when writing stats [\#5147](https://github.com/apache/arrow-rs/pull/5147) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jefffrey](https://github.com/Jefffrey)) -- Deprecate Fields::remove and Schema::remove [\#5144](https://github.com/apache/arrow-rs/pull/5144) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support casting of Float16 with other numeric types [\#5139](https://github.com/apache/arrow-rs/pull/5139) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Parquet: Make `MetadataLoader` public [\#5137](https://github.com/apache/arrow-rs/pull/5137) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([AdamGS](https://github.com/AdamGS)) -- Add FileReaderBuilder for arrow-ipc to allow reading large no. of column files [\#5136](https://github.com/apache/arrow-rs/pull/5136) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) -- Parquet: clear metadata and project fields of ParquetRecordBatchStream::schema [\#5135](https://github.com/apache/arrow-rs/pull/5135) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jefffrey](https://github.com/Jefffrey)) -- JSON: write struct array nulls as null [\#5133](https://github.com/apache/arrow-rs/pull/5133) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) -- Update proc-macro2 requirement from =1.0.69 to =1.0.70 [\#5131](https://github.com/apache/arrow-rs/pull/5131) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Fix negative decimal string [\#5128](https://github.com/apache/arrow-rs/pull/5128) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Cleanup list casting and support nested lists \(\#5113\) [\#5124](https://github.com/apache/arrow-rs/pull/5124) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Cast from numeric/timestamp to timestamp/numeric [\#5123](https://github.com/apache/arrow-rs/pull/5123) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Improve cast docs [\#5114](https://github.com/apache/arrow-rs/pull/5114) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update prost-build requirement from =0.12.2 to =0.12.3 [\#5112](https://github.com/apache/arrow-rs/pull/5112) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Parquet: derive boundary order when writing [\#5110](https://github.com/apache/arrow-rs/pull/5110) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jefffrey](https://github.com/Jefffrey)) -- Implementing `ArrayBuilder` for `Box<dyn ArrayBuilder>` [\#5109](https://github.com/apache/arrow-rs/pull/5109) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Fix 'ColumnPath not found' error reading Parquet files with nested REPEATED fields [\#5102](https://github.com/apache/arrow-rs/pull/5102) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mmaitre314](https://github.com/mmaitre314)) -- fix: coerce\_primitive for serde decoded data [\#5101](https://github.com/apache/arrow-rs/pull/5101) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fansehep](https://github.com/fansehep)) -- Extend aggregation benchmarks [\#5096](https://github.com/apache/arrow-rs/pull/5096) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([jhorstmann](https://github.com/jhorstmann)) -- Expand parquet crate overview doc [\#5093](https://github.com/apache/arrow-rs/pull/5093) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mmaitre314](https://github.com/mmaitre314)) -- Ensure arrays passed to MutableArrayData have same type \(\#5091\) [\#5092](https://github.com/apache/arrow-rs/pull/5092) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update prost-build requirement from =0.12.1 to =0.12.2 [\#5088](https://github.com/apache/arrow-rs/pull/5088) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Add FFI from\_raw [\#5082](https://github.com/apache/arrow-rs/pull/5082) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- \[fix \#5044\] Support converting 'yyyymmdd' format to date [\#5078](https://github.com/apache/arrow-rs/pull/5078) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Tangruilin](https://github.com/Tangruilin)) -- Enable truncation of binary statistics columns [\#5076](https://github.com/apache/arrow-rs/pull/5076) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([emcake](https://github.com/emcake)) -- IPC writer truncated sliced list/map values [\#5071](https://github.com/apache/arrow-rs/pull/5071) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) -- Implement Arrow PyCapsule Interface [\#5070](https://github.com/apache/arrow-rs/pull/5070) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kylebarron](https://github.com/kylebarron)) -- Remove ByteBufferPtr and replace with Bytes [\#5055](https://github.com/apache/arrow-rs/pull/5055) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jefffrey](https://github.com/Jefffrey)) -- Support multiple GZip members in parquet page [\#4951](https://github.com/apache/arrow-rs/pull/4951) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Deprecate array\_to\_json\_array [\#5515](https://github.com/apache/arrow-rs/pull/5515) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix integer parsing of empty strings \(\#5504\) [\#5505](https://github.com/apache/arrow-rs/pull/5505) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: clarifying comments in struct\_builder.rs \#5494 [\#5499](https://github.com/apache/arrow-rs/pull/5499) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([istvan-fodor](https://github.com/istvan-fodor)) +- Update proc-macro2 requirement from =1.0.78 to =1.0.79 [\#5498](https://github.com/apache/arrow-rs/pull/5498) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add DataType::ListView and DataType::LargeListView 
[\#5493](https://github.com/apache/arrow-rs/pull/5493) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Kikkon](https://github.com/Kikkon)) +- Better document parquet pushdown [\#5491](https://github.com/apache/arrow-rs/pull/5491) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Fix NullBufferBuilder::new\_from\_buffer wrong size assertion [\#5489](https://github.com/apache/arrow-rs/pull/5489) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Kikkon](https://github.com/Kikkon)) +- Support dictionary encoding in structures for `FlightDataEncoder`, add documentation for `arrow_flight::encode::Dictionary` [\#5488](https://github.com/apache/arrow-rs/pull/5488) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([thinkharderdev](https://github.com/thinkharderdev)) +- Add MapBuilder::with\_values\_field to support non-nullable values \(\#5482\) [\#5483](https://github.com/apache/arrow-rs/pull/5483) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([lasantosr](https://github.com/lasantosr)) +- feat: initial support string\_view and binary\_view, supports layout and basic construction + tests [\#5481](https://github.com/apache/arrow-rs/pull/5481) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ariesdevil](https://github.com/ariesdevil)) +- Add more comprehensive documentation on testing and benchmarking to CONTRIBUTING.md [\#5478](https://github.com/apache/arrow-rs/pull/5478) ([monkwire](https://github.com/monkwire)) +- Remove unused import detected by nightly rust [\#5477](https://github.com/apache/arrow-rs/pull/5477) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([XiangpengHao](https://github.com/XiangpengHao)) +- Add RecordBatch::schema\_ref [\#5474](https://github.com/apache/arrow-rs/pull/5474) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([monkwire](https://github.com/monkwire)) +- Provide access to inner Write for parquet writers [\#5471](https://github.com/apache/arrow-rs/pull/5471) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add DataType::Utf8View and DataType::BinaryView [\#5470](https://github.com/apache/arrow-rs/pull/5470) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([XiangpengHao](https://github.com/XiangpengHao)) +- Update base64 requirement from 0.21 to 0.22 [\#5467](https://github.com/apache/arrow-rs/pull/5467) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- 
Minor: Fix formatting typo in `Field::new_list_field` [\#5464](https://github.com/apache/arrow-rs/pull/5464) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Fix test\_string\_dictionary\_merge \(\#5255\) [\#5461](https://github.com/apache/arrow-rs/pull/5461) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use Vec::from\_iter in Buffer::from\_iter [\#5460](https://github.com/apache/arrow-rs/pull/5460) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Kikkon](https://github.com/Kikkon)) +- Document parquet writer memory limiting \(\#5450\) [\#5457](https://github.com/apache/arrow-rs/pull/5457) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Document UnionArray Panics [\#5456](https://github.com/apache/arrow-rs/pull/5456) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Kikkon](https://github.com/Kikkon)) +- fix: lexsort\_to\_indices unsupported mixed types with list [\#5455](https://github.com/apache/arrow-rs/pull/5455) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Refine `Display` and `Source` implementation for error types [\#5439](https://github.com/apache/arrow-rs/pull/5439) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([BugenZhao](https://github.com/BugenZhao)) +- Improve debug output of Time32/Time64 arrays [\#5428](https://github.com/apache/arrow-rs/pull/5428) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([monkwire](https://github.com/monkwire)) +- Miri fix: Rename invalid\_mut to without\_provenance\_mut [\#5418](https://github.com/apache/arrow-rs/pull/5418) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Ensure addition/multiplications in when allocating buffers don't overflow [\#5417](https://github.com/apache/arrow-rs/pull/5417) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Update Flight proto: PollFlightInfo & expiration time [\#5413](https://github.com/apache/arrow-rs/pull/5413) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([Jefffrey](https://github.com/Jefffrey)) +- Add tests for serializing lists of dictionary encoded values to json [\#5399](https://github.com/apache/arrow-rs/pull/5399) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Return null for overflow when casting string to integer under safe option enabled [\#5398](https://github.com/apache/arrow-rs/pull/5398) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Propagate error instead of panic for `take_bytes` [\#5395](https://github.com/apache/arrow-rs/pull/5395) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Improve like kernel by ~2% 
[\#5390](https://github.com/apache/arrow-rs/pull/5390) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Enable running arrow-array and arrow-arith with miri and avoid strict provenance warning [\#5387](https://github.com/apache/arrow-rs/pull/5387) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Update to chrono 0.4.34 [\#5385](https://github.com/apache/arrow-rs/pull/5385) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Return error instead of panic when reading invalid Parquet metadata [\#5382](https://github.com/apache/arrow-rs/pull/5382) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mmaitre314](https://github.com/mmaitre314)) +- Update tonic requirement from 0.10.0 to 0.11.0 [\#5380](https://github.com/apache/arrow-rs/pull/5380) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update tonic-build requirement from =0.10.2 to =0.11.0 [\#5379](https://github.com/apache/arrow-rs/pull/5379) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix latest clippy lints [\#5376](https://github.com/apache/arrow-rs/pull/5376) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: utility functions for creating `FixedSizeList` and `LargeList` dtypes [\#5373](https://github.com/apache/arrow-rs/pull/5373) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([universalmind303](https://github.com/universalmind303)) +- Minor\(docs\): update master to main for DataFusion/Ballista [\#5363](https://github.com/apache/arrow-rs/pull/5363) ([caicancai](https://github.com/caicancai)) +- Return an error instead of a panic when reading a corrupted Parquet file with mismatched column counts [\#5362](https://github.com/apache/arrow-rs/pull/5362) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mmaitre314](https://github.com/mmaitre314)) +- feat: support casting FixedSizeList with new child type [\#5360](https://github.com/apache/arrow-rs/pull/5360) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Add more debugging info to StructBuilder validate\_content [\#5357](https://github.com/apache/arrow-rs/pull/5357) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- pyarrow: Preserve RecordBatch's schema metadata [\#5355](https://github.com/apache/arrow-rs/pull/5355) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([atwam](https://github.com/atwam)) +- Mark Encoding::BIT\_PACKED as deprecated and document its compatibility issues [\#5348](https://github.com/apache/arrow-rs/pull/5348) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jhorstmann](https://github.com/jhorstmann)) +- Track the size of custom allocations for use via 
Array::get\_buffer\_memory\_size [\#5347](https://github.com/apache/arrow-rs/pull/5347) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- fix: Return an error on type mismatch rather than panic \(\#4995\) [\#5341](https://github.com/apache/arrow-rs/pull/5341) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([carols10cents](https://github.com/carols10cents)) +- Minor: support cast values to fixedsizelist [\#5340](https://github.com/apache/arrow-rs/pull/5340) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Enhance Time32/Time64 support in date\_part [\#5337](https://github.com/apache/arrow-rs/pull/5337) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- feat: add `take_record_batch`. [\#5333](https://github.com/apache/arrow-rs/pull/5333) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([RinChanNOWWW](https://github.com/RinChanNOWWW)) +- Add ListBuilder::with\_field to support non nullable list fields \(\#5330\) [\#5331](https://github.com/apache/arrow-rs/pull/5331) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Don't omit schema metadata when removing column [\#5328](https://github.com/apache/arrow-rs/pull/5328) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kylebarron](https://github.com/kylebarron)) +- Update proc-macro2 requirement from =1.0.76 to =1.0.78 [\#5324](https://github.com/apache/arrow-rs/pull/5324) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Enhance Date64 type documentation [\#5323](https://github.com/apache/arrow-rs/pull/5323) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- fix panic when decode a group with no child [\#5322](https://github.com/apache/arrow-rs/pull/5322) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Liyixin95](https://github.com/Liyixin95)) +- Minor/Doc Expand FlightSqlServiceClient::handshake doc [\#5321](https://github.com/apache/arrow-rs/pull/5321) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([devinjdangelo](https://github.com/devinjdangelo)) +- Refactor temporal extract date part kernels [\#5319](https://github.com/apache/arrow-rs/pull/5319) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Add JSON writer benchmarks \(\#5314\) [\#5317](https://github.com/apache/arrow-rs/pull/5317) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Bump actions/cache from 3 to 4 [\#5308](https://github.com/apache/arrow-rs/pull/5308) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Avro block decompression [\#5306](https://github.com/apache/arrow-rs/pull/5306) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([tustvold](https://github.com/tustvold)) +- Result into error in case of endianness mismatches [\#5301](https://github.com/apache/arrow-rs/pull/5301) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pangiole](https://github.com/pangiole)) +- parquet: Add ArrowWriterOptions to skip embedding the arrow metadata [\#5299](https://github.com/apache/arrow-rs/pull/5299) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([evenyag](https://github.com/evenyag)) +- Add support for more fused boolean operations [\#5298](https://github.com/apache/arrow-rs/pull/5298) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([RTEnzyme](https://github.com/RTEnzyme)) +- Support Parquet Byte Stream Split Encoding [\#5293](https://github.com/apache/arrow-rs/pull/5293) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mwlon](https://github.com/mwlon)) +- Extend string parsing support for Date32 [\#5282](https://github.com/apache/arrow-rs/pull/5282) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya)) +- Bring some methods over from ArrowWriter to the async version [\#5251](https://github.com/apache/arrow-rs/pull/5251) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([AdamGS](https://github.com/AdamGS)) + + + +\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)*
diff --git a/Cargo.toml b/Cargo.toml
index bbbf907ba7f0..e09660941d60 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -62,7 +62,7 @@ exclude = [
 ]
 
 [workspace.package]
-version = "50.0.0"
+version = "51.0.0"
 homepage = "https://github.com/apache/arrow-rs"
 repository = "https://github.com/apache/arrow-rs"
 authors = ["Apache Arrow <dev@arrow.apache.org>"]
@@ -77,20 +77,20 @@
 edition = "2021"
 rust-version = "1.62"
 
 [workspace.dependencies]
-arrow = { version = "50.0.0", path = "./arrow", default-features = false }
-arrow-arith = { version = "50.0.0", path = "./arrow-arith" }
-arrow-array = { version = "50.0.0", path = "./arrow-array" }
-arrow-buffer = { version = "50.0.0", path = "./arrow-buffer" }
-arrow-cast = { version = "50.0.0", path = "./arrow-cast" }
-arrow-csv = { version = "50.0.0", path = "./arrow-csv" }
-arrow-data = { version = "50.0.0", path = "./arrow-data" }
-arrow-ipc = { version = "50.0.0", path = "./arrow-ipc" }
-arrow-json = { version = "50.0.0", path = "./arrow-json" }
-arrow-ord = { version = "50.0.0", path = "./arrow-ord" }
-arrow-row = { version = "50.0.0", path = "./arrow-row" }
-arrow-schema = { version = "50.0.0", path = "./arrow-schema" }
-arrow-select = { version = "50.0.0", path = "./arrow-select" }
-arrow-string = { version = "50.0.0", path = "./arrow-string" }
-parquet = { version = "50.0.0", path = "./parquet", default-features = false }
+arrow = { version = "51.0.0", path = "./arrow", default-features = false }
+arrow-arith = { version = "51.0.0", path = "./arrow-arith" }
+arrow-array = { version = "51.0.0", path = "./arrow-array" }
+arrow-buffer = { version = "51.0.0", path = "./arrow-buffer" }
+arrow-cast = { version = "51.0.0", path = "./arrow-cast" }
+arrow-csv = { version = "51.0.0", path = "./arrow-csv" }
+arrow-data = { version = "51.0.0", path = "./arrow-data" }
+arrow-ipc = { version = "51.0.0", path = "./arrow-ipc" }
+arrow-json = { version = "51.0.0", path = "./arrow-json" }
"./arrow-json" } +arrow-ord = { version = "51.0.0", path = "./arrow-ord" } +arrow-row = { version = "51.0.0", path = "./arrow-row" } +arrow-schema = { version = "51.0.0", path = "./arrow-schema" } +arrow-select = { version = "51.0.0", path = "./arrow-select" } +arrow-string = { version = "51.0.0", path = "./arrow-string" } +parquet = { version = "51.0.0", path = "./parquet", default-features = false } chrono = { version = "0.4.34", default-features = false, features = ["clock"] } diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index e8590ddd6788..20d6d55615b1 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="49.0.0" -FUTURE_RELEASE="50.0.0" +SINCE_TAG="50.0.0" +FUTURE_RELEASE="51.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From ae42b3b6883d79ff25537e4b1837a9d9bfb0b088 Mon Sep 17 00:00:00 2001 From: Val Lorentz Date: Sat, 16 Mar 2024 23:14:50 +0100 Subject: [PATCH 09/11] parquet: Use specific error variant when codec is disabled (#5521) * Use specific error variant when codec is disabled Instead of reporting it as 'not yet implemented' * Replace Disabled with General --- parquet/src/compression.rs | 47 ++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs index 89f4b64d48b5..10560210e4e8 100644 --- a/parquet/src/compression.rs +++ b/parquet/src/compression.rs @@ -145,21 +145,40 @@ pub(crate) trait CompressionLevel { /// bytes for the compression type. /// This returns `None` if the codec type is `UNCOMPRESSED`. pub fn create_codec(codec: CodecType, _options: &CodecOptions) -> Result>> { + #[allow(unreachable_code, unused_variables)] match codec { - #[cfg(any(feature = "brotli", test))] - CodecType::BROTLI(level) => Ok(Some(Box::new(BrotliCodec::new(level)))), - #[cfg(any(feature = "flate2", test))] - CodecType::GZIP(level) => Ok(Some(Box::new(GZipCodec::new(level)))), - #[cfg(any(feature = "snap", test))] - CodecType::SNAPPY => Ok(Some(Box::new(SnappyCodec::new()))), - #[cfg(any(feature = "lz4", test))] - CodecType::LZ4 => Ok(Some(Box::new(LZ4HadoopCodec::new( - _options.backward_compatible_lz4, - )))), - #[cfg(any(feature = "zstd", test))] - CodecType::ZSTD(level) => Ok(Some(Box::new(ZSTDCodec::new(level)))), - #[cfg(any(feature = "lz4", test))] - CodecType::LZ4_RAW => Ok(Some(Box::new(LZ4RawCodec::new()))), + CodecType::BROTLI(level) => { + #[cfg(any(feature = "brotli", test))] + return Ok(Some(Box::new(BrotliCodec::new(level)))); + Err(ParquetError::General("Disabled feature at compile time: brotli".into())) + }, + CodecType::GZIP(level) => { + #[cfg(any(feature = "flate2", test))] + return Ok(Some(Box::new(GZipCodec::new(level)))); + Err(ParquetError::General("Disabled feature at compile time: flate2".into())) + }, + CodecType::SNAPPY => { + #[cfg(any(feature = "snap", test))] + return Ok(Some(Box::new(SnappyCodec::new()))); + Err(ParquetError::General("Disabled feature at compile time: snap".into())) + }, + CodecType::LZ4 => { + #[cfg(any(feature = "lz4", test))] + return Ok(Some(Box::new(LZ4HadoopCodec::new( + _options.backward_compatible_lz4, + )))); + Err(ParquetError::General("Disabled feature at compile time: lz4".into())) + }, + CodecType::ZSTD(level) => { + #[cfg(any(feature = "zstd", test))] + return Ok(Some(Box::new(ZSTDCodec::new(level)))); + Err(ParquetError::General("Disabled feature at compile 
From 7e5f523a17444a3da01e87e9d1778315295065e9 Mon Sep 17 00:00:00 2001
From: Matthijs Brobbel
Date: Sat, 16 Mar 2024 23:15:13 +0100
Subject: [PATCH 10/11] impl `From<ScalarBuffer<T>>` for `Vec<T>` (#5518)

* impl `From<ScalarBuffer<T>>` for `Vec<T>`

* Remove layout test, prevented by `miri`

---
 arrow-buffer/src/buffer/scalar.rs | 52 +++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs
index 3826d74e43bd..2019cc79830d 100644
--- a/arrow-buffer/src/buffer/scalar.rs
+++ b/arrow-buffer/src/buffer/scalar.rs
@@ -160,6 +160,15 @@ impl<T: ArrowNativeType> From<Vec<T>> for ScalarBuffer<T> {
     }
 }
 
+impl<T: ArrowNativeType> From<ScalarBuffer<T>> for Vec<T> {
+    fn from(value: ScalarBuffer<T>) -> Self {
+        value
+            .buffer
+            .into_vec()
+            .unwrap_or_else(|buffer| buffer.typed_data::<T>().into())
+    }
+}
+
 impl<T: ArrowNativeType> From<BufferBuilder<T>> for ScalarBuffer<T> {
     fn from(mut value: BufferBuilder<T>) -> Self {
         let len = value.len();
@@ -208,6 +217,8 @@ impl<T: ArrowNativeType> PartialEq<ScalarBuffer<T>> for Vec<T> {
 
 #[cfg(test)]
 mod tests {
+    use std::{ptr::NonNull, sync::Arc};
+
     use super::*;
 
     #[test]
@@ -284,4 +295,45 @@ mod tests {
         let scalar_buffer = ScalarBuffer::from(buffer_builder);
         assert_eq!(scalar_buffer.as_ref(), input);
     }
+
+    #[test]
+    fn into_vec() {
+        let input = vec![1u8, 2, 3, 4];
+
+        // No copy
+        let input_buffer = Buffer::from_vec(input.clone());
+        let input_ptr = input_buffer.as_ptr();
+        let input_len = input_buffer.len();
+        let scalar_buffer = ScalarBuffer::<u8>::new(input_buffer, 0, input_len);
+        let vec = Vec::from(scalar_buffer);
+        assert_eq!(vec.as_slice(), input.as_slice());
+        assert_eq!(vec.as_ptr(), input_ptr);
+
+        // Custom allocation - makes a copy
+        let mut input_clone = input.clone();
+        let input_ptr = NonNull::new(input_clone.as_mut_ptr()).unwrap();
+        let dealloc = Arc::new(());
+        let buffer =
+            unsafe { Buffer::from_custom_allocation(input_ptr, input_clone.len(), dealloc as _) };
+        let scalar_buffer = ScalarBuffer::<u8>::new(buffer, 0, input.len());
+        let vec = Vec::from(scalar_buffer);
+        assert_eq!(vec, input.as_slice());
+        assert_ne!(vec.as_ptr(), input_ptr.as_ptr());
+
+        // Offset - makes a copy
+        let input_buffer = Buffer::from_vec(input.clone());
+        let input_ptr = input_buffer.as_ptr();
+        let input_len = input_buffer.len();
+        let scalar_buffer = ScalarBuffer::<u8>::new(input_buffer, 1, input_len - 1);
+        let vec = Vec::from(scalar_buffer);
+        assert_eq!(vec.as_slice(), &input[1..]);
+        assert_ne!(vec.as_ptr(), input_ptr);
+
+        // Inner buffer Arc ref count != 0 - makes a copy
+        let buffer = Buffer::from_slice_ref(input.as_slice());
+        let scalar_buffer = ScalarBuffer::<u8>::new(buffer, 0, input.len());
+        let vec = Vec::from(scalar_buffer);
+        assert_eq!(vec, input.as_slice());
+        assert_ne!(vec.as_ptr(), input.as_ptr());
+    }
 }
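This conversion completes the round trip between `Vec<T>` and `ScalarBuffer<T>`: `From<Vec<T>>` was already zero-copy, and the new impl reclaims the original allocation on the way back whenever the buffer is uniquely owned, vec-allocated, and not offset, falling back to a copy otherwise, which is exactly what the new tests exercise. A small usage sketch, for illustration only and not part of the patch:

    use arrow_buffer::ScalarBuffer;

    fn main() {
        // Vec -> ScalarBuffer is zero-copy; ScalarBuffer -> Vec (added here)
        // is zero-copy when the allocation can be reclaimed, else it copies.
        let values: Vec<i32> = vec![1, 2, 3, 4];
        let buffer: ScalarBuffer<i32> = values.into();
        let roundtrip: Vec<i32> = buffer.into();
        assert_eq!(roundtrip, vec![1, 2, 3, 4]);
    }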
From f41c2a4e5a33e482e12351051d77d0e059f28e33 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Sat, 16 Mar 2024 18:41:14 -0400
Subject: [PATCH 11/11] Minor: add additional documentation about `BufWriter` (#5519)

* Minor: add additional documentation about BufWriter

* Update object_store/src/buffered.rs

* Apply suggestions from code review

Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>

* Format

---------

Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>
Co-authored-by: Raphael Taylor-Davies

---
 object_store/src/buffered.rs |  7 ++++++-
 object_store/src/lib.rs      | 13 +++++++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/object_store/src/buffered.rs b/object_store/src/buffered.rs
index fdefe599f79e..9299e1147bc1 100644
--- a/object_store/src/buffered.rs
+++ b/object_store/src/buffered.rs
@@ -207,6 +207,10 @@ impl AsyncBufRead for BufReader {
 
 /// An async buffered writer compatible with the tokio IO traits
 ///
+/// This writer adaptively uses [`ObjectStore::put`] or
+/// [`ObjectStore::put_multipart`] depending on the amount of data that has
+/// been written.
+///
 /// Up to `capacity` bytes will be buffered in memory, and flushed on shutdown
 /// using [`ObjectStore::put`]. If `capacity` is exceeded, data will instead be
 /// streamed using [`ObjectStore::put_multipart`]
@@ -255,7 +259,8 @@ impl BufWriter {
         }
     }
 
-    /// Returns the [`MultipartId`] if multipart upload
+    /// Returns the [`MultipartId`] of the multipart upload created by this
+    /// writer, if any.
     pub fn multipart_id(&self) -> Option<&MultipartId> {
         self.multipart_id.as_ref()
     }
diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs
index 8132002b6e01..4960f3ba390a 100644
--- a/object_store/src/lib.rs
+++ b/object_store/src/lib.rs
@@ -88,11 +88,11 @@
 //!
 //! # Why not a Filesystem Interface?
 //!
-//! Whilst this crate does provide a [`BufReader`], the [`ObjectStore`] interface mirrors the APIs
-//! of object stores and not filesystems, opting to provide stateless APIs instead of the cursor
-//! based interfaces such as [`Read`] or [`Seek`] favoured by filesystems.
+//! The [`ObjectStore`] interface is designed to mirror the APIs
+//! of object stores and *not* filesystems, and thus has stateless APIs instead
+//! of cursor based interfaces such as [`Read`] or [`Seek`] available in filesystems.
 //!
-//! This provides some compelling advantages:
+//! This design provides the following advantages:
 //!
 //! * All operations are atomic, and readers cannot observe partial and/or failed writes
 //! * Methods map directly to object store APIs, providing both efficiency and predictability
@@ -100,7 +100,12 @@
 //! * Allows for functionality not native to filesystems, such as operation preconditions
 //!   and atomic multipart uploads
 //!
+//! This crate does provide [`BufReader`] and [`BufWriter`] adapters
+//! which provide a more filesystem-like API for working with the
+//! [`ObjectStore`] trait, however, they should be used with care
+//!
 //! [`BufReader`]: buffered::BufReader
+//! [`BufWriter`]: buffered::BufWriter
 //!
 //! # Adapters
 //!