diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index ff8b82a5d943..6c9436819038 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -194,7 +194,10 @@ where return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE))); } - let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()); + let nulls = NullBuffer::union( + a.logical_nulls().as_ref().map(|n| n.as_ref()), + b.logical_nulls().as_ref().map(|n| n.as_ref()), + ); let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r)); // JUSTIFICATION @@ -244,7 +247,10 @@ where )))); } - let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()); + let nulls = NullBuffer::union( + a.logical_nulls().as_ref().map(|n| n.as_ref()), + b.logical_nulls().as_ref().map(|n| n.as_ref()), + ); let mut builder = a.into_builder()?; @@ -292,8 +298,11 @@ where if a.null_count() == 0 && b.null_count() == 0 { try_binary_no_nulls(len, a, b, op) } else { - let nulls = - NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()).unwrap(); + let nulls = NullBuffer::union( + a.logical_nulls().as_ref().map(|n| n.as_ref()), + b.logical_nulls().as_ref().map(|n| n.as_ref()), + ) + .unwrap(); let mut buffer = BufferBuilder::::new(len); buffer.append_n_zeroed(len); @@ -351,8 +360,11 @@ where if a.null_count() == 0 && b.null_count() == 0 { try_binary_no_nulls_mut(len, a, b, op) } else { - let nulls = - NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()).unwrap(); + let nulls = NullBuffer::union( + a.logical_nulls().as_ref().map(|n| n.as_ref()), + b.logical_nulls().as_ref().map(|n| n.as_ref()), + ) + .unwrap(); let mut builder = a.into_builder()?; diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index fe374d965714..ff4cad44e1e9 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -214,7 +214,7 @@ impl BooleanArray { where F: FnMut(T::Item) -> bool, { - let nulls = left.logical_nulls(); + let nulls = left.logical_nulls().map(|n| n.into_owned()); let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe { // SAFETY: i in range 0..len op(left.value_unchecked(i)) @@ -245,8 +245,8 @@ impl BooleanArray { assert_eq!(left.len(), right.len()); let nulls = NullBuffer::union( - left.logical_nulls().as_ref(), - right.logical_nulls().as_ref(), + left.logical_nulls().as_ref().map(|n| n.as_ref()), + right.logical_nulls().as_ref().map(|n| n.as_ref()), ); let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe { // SAFETY: i in range 0..len diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 1f4d83b1c5d0..4fb59e32378d 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -29,6 +29,7 @@ use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; use std::any::Any; +use std::borrow::Cow; use std::sync::Arc; /// A [`DictionaryArray`] indexed by `i8` @@ -720,8 +721,8 @@ impl Array for DictionaryArray { self.keys.nulls() } - fn logical_nulls(&self) -> Option { - match self.values.nulls() { + fn logical_nulls(&self) -> Option> { + let logical_nulls = match self.values.nulls() { None => self.nulls().cloned(), Some(value_nulls) => { let mut builder = BooleanBufferBuilder::new(self.len()); @@ -738,7 +739,8 @@ impl Array for DictionaryArray { } Some(builder.finish().into()) } - } + }; + logical_nulls.map(Cow::Owned) } fn is_nullable(&self) -> bool { @@ -854,7 +856,7 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'a, self.dictionary.nulls() } - fn logical_nulls(&self) -> Option { + fn logical_nulls(&self) -> Option> { self.dictionary.logical_nulls() } diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index f19406c1610b..64468fd6b96b 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -17,55 +17,45 @@ //! The concrete array definitions -mod binary_array; +use std::any::Any; +use std::borrow::Cow; +use std::sync::Arc; -use crate::types::*; use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow_data::ArrayData; use arrow_schema::{DataType, IntervalUnit, TimeUnit}; -use std::any::Any; -use std::sync::Arc; - pub use binary_array::*; - -mod boolean_array; pub use boolean_array::*; - -mod byte_array; pub use byte_array::*; - -mod dictionary_array; pub use dictionary_array::*; - -mod fixed_size_binary_array; pub use fixed_size_binary_array::*; - -mod fixed_size_list_array; pub use fixed_size_list_array::*; - -mod list_array; pub use list_array::*; - -mod map_array; pub use map_array::*; - -mod null_array; pub use null_array::*; - -mod primitive_array; pub use primitive_array::*; - -mod string_array; +pub use run_array::*; pub use string_array::*; - -mod struct_array; pub use struct_array::*; - -mod union_array; pub use union_array::*; +use crate::types::*; + +mod binary_array; + +mod boolean_array; +mod byte_array; +mod dictionary_array; +mod fixed_size_binary_array; +mod fixed_size_list_array; +mod list_array; +mod map_array; +mod null_array; +mod primitive_array; mod run_array; -pub use run_array::*; +mod string_array; +mod struct_array; +mod union_array; /// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html) pub trait Array: std::fmt::Debug + Send + Sync { @@ -182,7 +172,8 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// you can use the slower [`Array::logical_nulls`] to obtain a computed mask . fn nulls(&self) -> Option<&NullBuffer>; - /// Returns a potentially computed [`NullBuffer`] that represent the logical null values of this array, if any. + /// Returns a potentially computed [`NullBuffer`] that represent the logical + /// null values of this array, if any. /// /// In most cases this will be the same as [`Array::nulls`], except for: /// @@ -192,8 +183,26 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// /// In these cases a logical [`NullBuffer`] will be computed, encoding the logical nullability /// of these arrays, beyond what is encoded in [`Array::nulls`] - fn logical_nulls(&self) -> Option { - self.nulls().cloned() + /// + /// Returns a [`Cow<'_, NullBuffer>`] to avoid a copy of the null buffer + /// when it is already present. + /// + /// # Example: + /// ``` + /// # use arrow_array::{Array, Int32Array}; + /// # use arrow_buffer::NullBuffer; + /// # let array: Int32Array = [Some(1), None, Some(3)].into_iter().collect(); + /// + /// // to get a Option<&NullBuffer> requires using `as_ref`: + /// let logical_nulls = array.logical_nulls(); + /// let logical_nulls_ref: Option<&NullBuffer> = logical_nulls.as_ref().map(|n| n.as_ref()); + /// + /// // use into_owned to get an owned `NullBuffer` + /// let logical_nulls = array.logical_nulls(); + /// let logical_nulls_owned: Option = logical_nulls.map(|n| n.into_owned()); + /// ``` + fn logical_nulls(&self) -> Option> { + self.nulls().map(Cow::Borrowed) } /// Returns whether the element at `index` is null according to [`Array::nulls`] @@ -321,7 +330,7 @@ impl Array for ArrayRef { self.as_ref().nulls() } - fn logical_nulls(&self) -> Option { + fn logical_nulls(&self) -> Option> { self.as_ref().logical_nulls() } @@ -387,7 +396,7 @@ impl<'a, T: Array> Array for &'a T { T::nulls(self) } - fn logical_nulls(&self) -> Option { + fn logical_nulls(&self) -> Option> { T::logical_nulls(self) } @@ -705,12 +714,14 @@ where #[cfg(test)] mod tests { - use super::*; - use crate::cast::{as_union_array, downcast_array}; - use crate::downcast_run_array; use arrow_buffer::MutableBuffer; use arrow_schema::{Field, Fields, UnionFields, UnionMode}; + use crate::cast::{as_union_array, downcast_array}; + use crate::downcast_run_array; + + use super::*; + #[test] fn test_empty_primitive() { let array = new_empty_array(&DataType::Int32); diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index af3ec0b57d27..177cfc355655 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -23,6 +23,7 @@ use arrow_buffer::buffer::NullBuffer; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; use std::any::Any; +use std::borrow::Cow; use std::sync::Arc; /// An array of [null values](https://arrow.apache.org/docs/format/Columnar.html#null-layout) @@ -109,8 +110,8 @@ impl Array for NullArray { None } - fn logical_nulls(&self) -> Option { - (self.len != 0).then(|| NullBuffer::new_null(self.len)) + fn logical_nulls(&self) -> Option> { + (self.len != 0).then(|| Cow::Owned(NullBuffer::new_null(self.len))) } fn is_nullable(&self) -> bool { diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 4877f9f850a3..3ad9d6aad2c8 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -16,6 +16,7 @@ // under the License. use std::any::Any; +use std::borrow::Cow; use std::sync::Arc; use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, RunEndBuffer}; @@ -339,7 +340,7 @@ impl Array for RunArray { None } - fn logical_nulls(&self) -> Option { + fn logical_nulls(&self) -> Option> { let len = self.len(); let nulls = self.values.logical_nulls()?; let mut out = BooleanBufferBuilder::new(len); @@ -369,7 +370,7 @@ impl Array for RunArray { } // Sanity check assert_eq!(out.len(), len); - Some(out.finish().into()) + Some(Cow::Owned(out.finish().into())) } fn is_nullable(&self) -> bool { @@ -593,7 +594,7 @@ impl<'a, R: RunEndIndexType, V: Sync> Array for TypedRunArray<'a, R, V> { self.run_array.nulls() } - fn logical_nulls(&self) -> Option { + fn logical_nulls(&self) -> Option> { self.run_array.logical_nulls() } @@ -660,12 +661,13 @@ mod tests { use rand::thread_rng; use rand::Rng; - use super::*; use crate::builder::PrimitiveRunBuilder; use crate::cast::AsArray; use crate::types::{Int16Type, Int32Type, Int8Type, UInt32Type}; use crate::{Array, Int32Array, StringArray}; + use super::*; + fn build_input_array(size: usize) -> Vec> { // The input array is created by shuffling and repeating // the seed values random number of times. diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs index 3f9cc0d525c1..f5cacbeb3227 100644 --- a/arrow-array/src/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -17,12 +17,13 @@ //! Idiomatic iterators for [`Array`](crate::Array) +use arrow_buffer::NullBuffer; + use crate::array::{ ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, GenericListArray, GenericStringArray, PrimitiveArray, }; use crate::{FixedSizeListArray, MapArray}; -use arrow_buffer::NullBuffer; /// An iterator that returns Some(T) or None, that can be used on any [`ArrayAccessor`] /// @@ -56,7 +57,7 @@ impl ArrayIter { /// create a new iterator pub fn new(array: T) -> Self { let len = array.len(); - let logical_nulls = array.logical_nulls(); + let logical_nulls = array.logical_nulls().map(|n| n.into_owned()); ArrayIter { array, logical_nulls, diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index bfb1f64e2eb8..aff48c1fa266 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -252,8 +252,8 @@ fn compare_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result { // Scalar is null, other side is non-scalar and nullable match op { - Op::Distinct => a.into_inner().into(), - Op::NotDistinct => a.into_inner().not().into(), + Op::Distinct => a.into_owned().into_inner().into(), + Op::NotDistinct => a.into_owned().into_inner().not().into(), _ => BooleanArray::new_null(len), } } @@ -276,7 +276,7 @@ fn compare_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result (nulls.inner() & &values()).into(), - _ => BooleanArray::new(values(), Some(nulls)), + _ => BooleanArray::new(values(), Some(nulls.into_owned())), }, } } diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 7d749da51327..09cd4a8f0c60 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -770,7 +770,7 @@ impl LexicographicalComparator { // flatten and convert build comparators let values = column.values.as_ref(); Ok(( - values.logical_nulls(), + values.logical_nulls().map(|n| n.into_owned()), build_compare(values, values)?, column.options.unwrap_or_default(), )) diff --git a/arrow-select/src/dictionary.rs b/arrow-select/src/dictionary.rs index d0b6fcfc3ac9..535a732280e4 100644 --- a/arrow-select/src/dictionary.rs +++ b/arrow-select/src/dictionary.rs @@ -152,7 +152,7 @@ pub fn merge_dictionary_values( for (idx, dictionary) in dictionaries.iter().enumerate() { let mask = masks.and_then(|m| m.get(idx)); let key_mask = match (dictionary.logical_nulls(), mask) { - (Some(n), None) => Some(n.into_inner()), + (Some(n), None) => Some(n.into_owned().into_inner()), (None, Some(n)) => Some(n.clone()), (Some(n), Some(m)) => Some(n.inner() & m), (None, None) => None,