Skip to content

Commit

Permalink
Logical Nullability (#4691)
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold authored Aug 14, 2023
1 parent 820e40a commit 979a070
Show file tree
Hide file tree
Showing 19 changed files with 333 additions and 76 deletions.
13 changes: 9 additions & 4 deletions arrow-arith/src/arity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ where
return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE)));
}

let nulls = NullBuffer::union(a.nulls(), b.nulls());
let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref());

let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r));
// JUSTIFICATION
Expand Down Expand Up @@ -248,7 +248,7 @@ where
))));
}

let nulls = NullBuffer::union(a.nulls(), b.nulls());
let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref());

let mut builder = a.into_builder()?;

Expand Down Expand Up @@ -296,7 +296,9 @@ where
if a.null_count() == 0 && b.null_count() == 0 {
try_binary_no_nulls(len, a, b, op)
} else {
let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap();
let nulls =
NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref())
.unwrap();

let mut buffer = BufferBuilder::<O::Native>::new(len);
buffer.append_n_zeroed(len);
Expand Down Expand Up @@ -355,7 +357,10 @@ where
if a.null_count() == 0 && b.null_count() == 0 {
try_binary_no_nulls_mut(len, a, b, op)
} else {
let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap();
let nulls =
NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref())
.unwrap();

let mut builder = a.into_builder()?;

let slice = builder.values_slice_mut();
Expand Down
14 changes: 3 additions & 11 deletions arrow-arith/src/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
use arrow_array::*;
use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper};
use arrow_buffer::{BooleanBuffer, NullBuffer};
use arrow_schema::{ArrowError, DataType};
use arrow_schema::ArrowError;

/// Logical 'and' boolean values with Kleene logic
///
Expand Down Expand Up @@ -311,11 +311,7 @@ pub fn not(left: &BooleanArray) -> Result<BooleanArray, ArrowError> {
/// assert_eq!(a_is_null, BooleanArray::from(vec![false, false, true]));
/// ```
pub fn is_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
let values = match input.nulls() {
// NullArray has no nulls buffer yet all values are null
None if input.data_type() == &DataType::Null => {
BooleanBuffer::new_set(input.len())
}
let values = match input.logical_nulls() {
None => BooleanBuffer::new_unset(input.len()),
Some(nulls) => !nulls.inner(),
};
Expand All @@ -335,11 +331,7 @@ pub fn is_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
/// assert_eq!(a_is_not_null, BooleanArray::from(vec![true, true, false]));
/// ```
pub fn is_not_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
let values = match input.nulls() {
// NullArray has no nulls buffer yet all values are null
None if input.data_type() == &DataType::Null => {
BooleanBuffer::new_unset(input.len())
}
let values = match input.logical_nulls() {
None => BooleanBuffer::new_set(input.len()),
Some(n) => n.inner().clone(),
};
Expand Down
7 changes: 5 additions & 2 deletions arrow-array/src/array/boolean_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ impl BooleanArray {
where
F: FnMut(T::Item) -> bool,
{
let nulls = left.nulls().cloned();
let nulls = left.logical_nulls();
let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe {
// SAFETY: i in range 0..len
op(left.value_unchecked(i))
Expand Down Expand Up @@ -239,7 +239,10 @@ impl BooleanArray {
{
assert_eq!(left.len(), right.len());

let nulls = NullBuffer::union(left.nulls(), right.nulls());
let nulls = NullBuffer::union(
left.logical_nulls().as_ref(),
right.logical_nulls().as_ref(),
);
let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe {
// SAFETY: i in range 0..len
op(left.value_unchecked(i), right.value_unchecked(i))
Expand Down
49 changes: 49 additions & 0 deletions arrow-array/src/array/dictionary_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,31 @@ impl<T: ArrowDictionaryKeyType> Array for DictionaryArray<T> {
self.keys.nulls()
}

fn logical_nulls(&self) -> Option<NullBuffer> {
match self.values.nulls() {
None => self.nulls().cloned(),
Some(value_nulls) => {
let mut builder = BooleanBufferBuilder::new(self.len());
match self.keys.nulls() {
Some(n) => builder.append_buffer(n.inner()),
None => builder.append_n(self.len(), true),
}
for (idx, k) in self.keys.values().iter().enumerate() {
let k = k.as_usize();
// Check range to allow for nulls
if k < value_nulls.len() && value_nulls.is_null(k) {
builder.set_bit(idx, false);
}
}
Some(builder.finish().into())
}
}
}

fn is_nullable(&self) -> bool {
!self.is_empty() && (self.nulls().is_some() || self.values.is_nullable())
}

fn get_buffer_memory_size(&self) -> usize {
self.keys.get_buffer_memory_size() + self.values.get_buffer_memory_size()
}
Expand Down Expand Up @@ -843,6 +868,14 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'a,
self.dictionary.nulls()
}

fn logical_nulls(&self) -> Option<NullBuffer> {
self.dictionary.logical_nulls()
}

fn is_nullable(&self) -> bool {
self.dictionary.is_nullable()
}

fn get_buffer_memory_size(&self) -> usize {
self.dictionary.get_buffer_memory_size()
}
Expand Down Expand Up @@ -1253,4 +1286,20 @@ mod tests {
assert_eq!(v, expected, "{idx}");
}
}

#[test]
fn test_iterator_nulls() {
let keys = Int32Array::new(
vec![0, 700, 1, 2].into(),
Some(NullBuffer::from(vec![true, false, true, true])),
);
let values = Int32Array::from(vec![Some(50), None, Some(2)]);
let dict = DictionaryArray::new(keys, Arc::new(values));
let values: Vec<_> = dict
.downcast_dict::<Int32Array>()
.unwrap()
.into_iter()
.collect();
assert_eq!(values, &[Some(50), None, None, Some(2)])
}
}
6 changes: 3 additions & 3 deletions arrow-array/src/array/fixed_size_list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ impl FixedSizeListArray {
/// * `size < 0`
/// * `values.len() / size != nulls.len()`
/// * `values.data_type() != field.data_type()`
/// * `!field.is_nullable() && !nulls.expand(size).contains(values.nulls())`
/// * `!field.is_nullable() && !nulls.expand(size).contains(values.logical_nulls())`
pub fn try_new(
field: FieldRef,
size: i32,
Expand Down Expand Up @@ -181,11 +181,11 @@ impl FixedSizeListArray {
)));
}

if let Some(a) = values.nulls() {
if let Some(a) = values.logical_nulls() {
let nulls_valid = field.is_nullable()
|| nulls
.as_ref()
.map(|n| n.expand(size as _).contains(a))
.map(|n| n.expand(size as _).contains(&a))
.unwrap_or_default();

if !nulls_valid {
Expand Down
4 changes: 2 additions & 2 deletions arrow-array/src/array/list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
///
/// * `offsets.len() - 1 != nulls.len()`
/// * `offsets.last() > values.len()`
/// * `!field.is_nullable() && values.null_count() != 0`
/// * `!field.is_nullable() && values.is_nullable()`
/// * `field.data_type() != values.data_type()`
pub fn try_new(
field: FieldRef,
Expand Down Expand Up @@ -189,7 +189,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
)));
}
}
if !field.is_nullable() && values.null_count() != 0 {
if !field.is_nullable() && values.is_nullable() {
return Err(ArrowError::InvalidArgumentError(format!(
"Non-nullable field of {}ListArray {:?} cannot contain nulls",
OffsetSize::PREFIX,
Expand Down
60 changes: 58 additions & 2 deletions arrow-array/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -173,12 +173,33 @@ pub trait Array: std::fmt::Debug + Send + Sync {
/// ```
fn offset(&self) -> usize;

/// Returns the null buffers of this array if any
/// Returns the null buffer of this array if any
///
/// Note: some arrays can encode their nullability in their children, for example,
/// [`DictionaryArray::values`] values or [`RunArray::values`], or without a null buffer,
/// such as [`NullArray`]. Use [`Array::logical_nulls`] to obtain a computed mask encoding this
fn nulls(&self) -> Option<&NullBuffer>;

/// Returns the logical null buffer of this array if any
///
/// In most cases this will be the same as [`Array::nulls`], except for:
///
/// * DictionaryArray where [`DictionaryArray::values`] contains nulls
/// * RunArray where [`RunArray::values`] contains nulls
/// * NullArray where all indices are nulls
///
/// In these cases a logical [`NullBuffer`] will be computed, encoding the logical nullability
/// of these arrays, beyond what is encoded in [`Array::nulls`]
fn logical_nulls(&self) -> Option<NullBuffer> {
self.nulls().cloned()
}

/// Returns whether the element at `index` is null.
/// When using this function on a slice, the index is relative to the slice.
///
/// Note: this method returns the physical nullability, i.e. that encoded in [`Array::nulls`]
/// see [`Array::logical_nulls`] for logical nullability
///
/// # Example:
///
/// ```
Expand All @@ -196,6 +217,9 @@ pub trait Array: std::fmt::Debug + Send + Sync {
/// Returns whether the element at `index` is not null.
/// When using this function on a slice, the index is relative to the slice.
///
/// Note: this method returns the physical nullability, i.e. that encoded in [`Array::nulls`]
/// see [`Array::logical_nulls`] for logical nullability
///
/// # Example:
///
/// ```
Expand All @@ -210,7 +234,10 @@ pub trait Array: std::fmt::Debug + Send + Sync {
!self.is_null(index)
}

/// Returns the total number of null values in this array.
/// Returns the total number of physical null values in this array.
///
/// Note: this method returns the physical null count, i.e. that encoded in [`Array::nulls`],
/// see [`Array::logical_nulls`] for logical nullability
///
/// # Example:
///
Expand All @@ -226,6 +253,19 @@ pub trait Array: std::fmt::Debug + Send + Sync {
self.nulls().map(|n| n.null_count()).unwrap_or_default()
}

/// Returns `false` if the array is guaranteed to not contain any logical nulls
///
/// In general this will be equivalent to `Array::null_count() != 0` but may differ in the
/// presence of logical nullability, see [`Array::logical_nulls`].
///
/// Implementations will return `true` unless they can cheaply prove no logical nulls
/// are present. For example a [`DictionaryArray`] with nullable values will still return true,
/// even if the nulls present in [`DictionaryArray::values`] are not referenced by any key,
/// and therefore would not appear in [`Array::logical_nulls`].
fn is_nullable(&self) -> bool {
self.null_count() != 0
}

/// Returns the total number of bytes of memory pointed to by this array.
/// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map.
fn get_buffer_memory_size(&self) -> usize;
Expand Down Expand Up @@ -277,6 +317,10 @@ impl Array for ArrayRef {
self.as_ref().nulls()
}

fn logical_nulls(&self) -> Option<NullBuffer> {
self.as_ref().logical_nulls()
}

fn is_null(&self, index: usize) -> bool {
self.as_ref().is_null(index)
}
Expand All @@ -289,6 +333,10 @@ impl Array for ArrayRef {
self.as_ref().null_count()
}

fn is_nullable(&self) -> bool {
self.as_ref().is_nullable()
}

fn get_buffer_memory_size(&self) -> usize {
self.as_ref().get_buffer_memory_size()
}
Expand Down Expand Up @@ -335,6 +383,10 @@ impl<'a, T: Array> Array for &'a T {
T::nulls(self)
}

fn logical_nulls(&self) -> Option<NullBuffer> {
T::logical_nulls(self)
}

fn is_null(&self, index: usize) -> bool {
T::is_null(self, index)
}
Expand All @@ -347,6 +399,10 @@ impl<'a, T: Array> Array for &'a T {
T::null_count(self)
}

fn is_nullable(&self) -> bool {
T::is_nullable(self)
}

fn get_buffer_memory_size(&self) -> usize {
T::get_buffer_memory_size(self)
}
Expand Down
33 changes: 15 additions & 18 deletions arrow-array/src/array/null_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@ use std::sync::Arc;
///
/// let array = NullArray::new(10);
///
/// assert!(array.is_nullable());
/// assert_eq!(array.len(), 10);
/// assert_eq!(array.null_count(), 10);
/// assert_eq!(array.null_count(), 0);
/// assert_eq!(array.logical_nulls().unwrap().null_count(), 10);
/// ```
#[derive(Clone)]
pub struct NullArray {
Expand Down Expand Up @@ -107,22 +109,12 @@ impl Array for NullArray {
None
}

/// Returns whether the element at `index` is null.
/// All elements of a `NullArray` are always null.
fn is_null(&self, _index: usize) -> bool {
true
fn logical_nulls(&self) -> Option<NullBuffer> {
(self.len != 0).then(|| NullBuffer::new_null(self.len))
}

/// Returns whether the element at `index` is valid.
/// All elements of a `NullArray` are always invalid.
fn is_valid(&self, _index: usize) -> bool {
false
}

/// Returns the total number of null values in this array.
/// The null count of a `NullArray` always equals its length.
fn null_count(&self) -> usize {
self.len()
fn is_nullable(&self) -> bool {
!self.is_empty()
}

fn get_buffer_memory_size(&self) -> usize {
Expand Down Expand Up @@ -176,8 +168,10 @@ mod tests {
let null_arr = NullArray::new(32);

assert_eq!(null_arr.len(), 32);
assert_eq!(null_arr.null_count(), 32);
assert!(!null_arr.is_valid(0));
assert_eq!(null_arr.null_count(), 0);
assert_eq!(null_arr.logical_nulls().unwrap().null_count(), 32);
assert!(null_arr.is_valid(0));
assert!(null_arr.is_nullable());
}

#[test]
Expand All @@ -186,7 +180,10 @@ mod tests {

let array2 = array1.slice(8, 16);
assert_eq!(array2.len(), 16);
assert_eq!(array2.null_count(), 16);
assert_eq!(array2.null_count(), 0);
assert_eq!(array2.logical_nulls().unwrap().null_count(), 16);
assert!(array2.is_valid(0));
assert!(array2.is_nullable());
}

#[test]
Expand Down
Loading

0 comments on commit 979a070

Please sign in to comment.