From 51ea05928d89428ca349e88d77df408bb0ff8f47 Mon Sep 17 00:00:00 2001 From: ask Date: Wed, 18 Jan 2023 08:39:44 -0500 Subject: [PATCH 01/20] Add `RunEndEncodedArray` --- arrow-array/src/array/mod.rs | 18 + .../src/array/run_end_encoded_array.rs | 488 ++++++++++++++++++ .../builder/generic_byte_ree_array_builder.rs | 419 +++++++++++++++ .../src/builder/generic_bytes_builder.rs | 9 + arrow-array/src/builder/mod.rs | 4 + .../builder/primitive_ree_array_builder.rs | 218 ++++++++ arrow-array/src/types.rs | 11 + arrow-data/src/data.rs | 59 ++- arrow-data/src/equal/mod.rs | 1 + arrow-data/src/transform/mod.rs | 16 + arrow-integration-test/src/datatype.rs | 1 + arrow-ipc/src/convert.rs | 1 + arrow-schema/src/datatype.rs | 23 + arrow-schema/src/error.rs | 4 + arrow-schema/src/field.rs | 1 + parquet/src/arrow/arrow_writer/mod.rs | 2 +- parquet/src/arrow/schema/mod.rs | 1 + 17 files changed, 1272 insertions(+), 4 deletions(-) create mode 100644 arrow-array/src/array/run_end_encoded_array.rs create mode 100644 arrow-array/src/builder/generic_byte_ree_array_builder.rs create mode 100644 arrow-array/src/builder/primitive_ree_array_builder.rs diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 1e17e35d0f6d..1699282caa5d 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -64,6 +64,9 @@ pub use struct_array::*; mod union_array; pub use union_array::*; +mod run_end_encoded_array; +pub use run_end_encoded_array::*; + /// Trait for dealing with different types of array at runtime when the type of the /// array is not known in advance. 
pub trait Array: std::fmt::Debug + Send + Sync { @@ -579,6 +582,20 @@ pub fn make_array(data: ArrayData) -> ArrayRef { } dt => panic!("Unexpected dictionary key type {:?}", dt), }, + DataType::RunEndEncoded(ref run_ends_type, _) => { + match run_ends_type.data_type() { + DataType::Int16 => { + Arc::new(RunEndEncodedArray::::from(data)) as ArrayRef + } + DataType::Int32 => { + Arc::new(RunEndEncodedArray::::from(data)) as ArrayRef + } + DataType::Int64 => { + Arc::new(RunEndEncodedArray::::from(data)) as ArrayRef + } + _ => unreachable!(), + } + } DataType::Null => Arc::new(NullArray::from(data)) as ArrayRef, DataType::Decimal128(_, _) => Arc::new(Decimal128Array::from(data)) as ArrayRef, DataType::Decimal256(_, _) => Arc::new(Decimal256Array::from(data)) as ArrayRef, @@ -737,6 +754,7 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { new_null_sized_decimal(data_type, length, std::mem::size_of::()) } DataType::Decimal256(_, _) => new_null_sized_decimal(data_type, length, 32), + DataType::RunEndEncoded(_, _) => todo!(), } } diff --git a/arrow-array/src/array/run_end_encoded_array.rs b/arrow-array/src/array/run_end_encoded_array.rs new file mode 100644 index 000000000000..ce18c3b19811 --- /dev/null +++ b/arrow-array/src/array/run_end_encoded_array.rs @@ -0,0 +1,488 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; + +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType, Field}; + +use crate::{ + builder::StringREEArrayBuilder, + make_array, + types::{ArrowRunEndIndexType, Int16Type, Int32Type, Int64Type}, + Array, ArrayRef, PrimitiveArray, +}; + +/// +/// A run-end encoding (REE) is a variation of [run-length encoding (RLE)](https://en.wikipedia.org/wiki/Run-length_encoding). +/// This encoding is good for representing data containing same values repeated consecutively +/// called runs. Each run is represented by the value of data and the index at which the run ends. +/// +/// [`RunEndEncodedArray`] has `run_ends` array and `values` array of same length. +/// The `run_ends` array stores the indexes at which the run ends. The `values` array +/// stores the value of the run. 
Below example illustrates how a logical array is represented in +/// [`RunEndEncodedArray`] +/// +/// +/// ```text +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┐ +/// ┌─────────────────┐ ┌─────────┐ ┌─────────────────┐ +/// │ │ A │ │ 2 │ │ │ A │ +/// ├─────────────────┤ ├─────────┤ ├─────────────────┤ +/// │ │ D │ │ 3 │ │ │ A │ run length of 'A' = keys[0] - 0 = 2 +/// ├─────────────────┤ ├─────────┤ ├─────────────────┤ +/// │ │ B │ │ 6 │ │ │ D │ run length of 'D' = keys[1] - keys[0] = 1 +/// └─────────────────┘ └─────────┘ ├─────────────────┤ +/// │ values run_ends │ │ B │ +/// ├─────────────────┤ +/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┘ │ B │ +/// ├─────────────────┤ +/// RunEndEncodedArray │ B │ run length of 'B' = keys[2] - keys[1] = 3 +/// length = 3 └─────────────────┘ +/// +/// Logical array +/// Contents +/// ``` + +pub struct RunEndEncodedArray { + data: ArrayData, + run_ends: PrimitiveArray, + values: ArrayRef, +} + +impl RunEndEncodedArray { + /// Attempts to create RunEndEncodedArray using given run_ends (index where a run ends) + /// and the values (value of the run). Returns an error if the given data is not compatible + /// with RunEndEncoded specification. + pub fn try_new( + run_ends: &PrimitiveArray, + values: &dyn Array, + ) -> Result { + let run_ends_type = run_ends.data_type().clone(); + let values_type = values.data_type().clone(); + let ree_array_type = DataType::RunEndEncoded( + Box::new(Field::new("run_ends", run_ends_type, false)), + Box::new(Field::new("values", values_type, true)), + ); + let builder = ArrayDataBuilder::new(ree_array_type) + .add_child_data(run_ends.data().clone()) + .add_child_data(values.data().clone()); + + // `build_unchecked` is used to avoid recursive validation of child arrays. + let array_data = unsafe { builder.build_unchecked() }; + + // Safety: `validate_data` checks below + // 1. run_ends array does not have null values + // 2. run_ends array has non-zero and strictly increasing values. + // 3. 
The length of run_ends array and values array are the same. + array_data.validate_data()?; + + Ok(array_data.into()) + } + /// Returns a reference to run_ends array + pub fn run_ends(&self) -> &PrimitiveArray { + &self.run_ends + } + + /// Returns a reference to values array + pub fn values(&self) -> &ArrayRef { + &self.values + } +} + +impl From for RunEndEncodedArray { + fn from(data: ArrayData) -> Self { + match data.data_type() { + DataType::RunEndEncoded(run_ends_data_type, _) => { + assert_eq!( + &R::DATA_TYPE, + run_ends_data_type.data_type(), + "Data type mismatch for run_ends array, expected {} got {}", + R::DATA_TYPE, + run_ends_data_type.data_type() + ); + } + _ => { + panic!("Invalid data type for RunEndEncodedArray. The data type should be DataType::RunEndEncoded"); + } + } + + // Safety: `validate_data` checks below + // 1. The given array data has exactly two child arrays. + // 2. The first child array (run_ends) has valid data type. + // 3. run_ends array does not have null values + // 4. run_ends array has non-zero and strictly increasing values. + // 5. The length of run_ends array and values array are the same. + data.validate_data().unwrap(); + + let run_ends = PrimitiveArray::::from(data.child_data()[0].clone()); + let values = make_array(data.child_data()[1].clone()); + Self { + data, + run_ends, + values, + } + } +} + +impl From> for ArrayData { + fn from(array: RunEndEncodedArray) -> Self { + array.data + } +} + +impl Array for RunEndEncodedArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn data(&self) -> &ArrayData { + &self.data + } + + fn into_data(self) -> ArrayData { + self.into() + } +} + +impl std::fmt::Debug for RunEndEncodedArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + writeln!( + f, + "RunEndEncodedArray {{run_ends: {:?}, values: {:?}}}", + self.run_ends, self.values + ) + } +} + +/// Constructs a `RunEndEncodedArray` from an iterator of optional strings. 
+/// +/// # Example: +/// ``` +/// use arrow_array::{RunEndEncodedArray, PrimitiveArray, StringArray, types::Int8Type}; +/// +/// let test = vec!["a", "a", "b", "c", "c"]; +/// let array: RunEndEncodedArray = test +/// .iter() +/// .map(|&x| if x == "b" { None } else { Some(x) }) +/// .collect(); +/// assert_eq!( +/// "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 5,\n], values: StringArray\n[\n \"a\",\n null,\n \"c\",\n]}\n", +/// format!("{:?}", array) +/// ); +/// ``` +impl<'a, T: ArrowRunEndIndexType> FromIterator> + for RunEndEncodedArray +{ + fn from_iter>>(iter: I) -> Self { + let it = iter.into_iter(); + let (lower, _) = it.size_hint(); + let mut builder = StringREEArrayBuilder::with_capacity(lower, 256); + it.for_each(|i| { + if let Some(i) = i { + builder + .append_value(i) + .expect("Unable to append a value to a run end encoded array."); + } else { + builder + .append_null() + .expect("Unable to append null value to run end encoded array."); + } + }); + + builder.finish() + } +} + +/// Constructs a `RunEndEncodedArray` from an iterator of strings. 
+/// +/// # Example: +/// +/// ``` +/// use arrow_array::{RunEndEncodedArray, PrimitiveArray, StringArray, types::Int8Type}; +/// +/// let test = vec!["a", "a", "b", "c"]; +/// let array: RunEndEncodedArray = test.into_iter().collect(); +/// assert_eq!( +/// "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", +/// format!("{:?}", array) +/// ); +/// ``` +impl<'a, T: ArrowRunEndIndexType> FromIterator<&'a str> for RunEndEncodedArray { + fn from_iter>(iter: I) -> Self { + let it = iter.into_iter(); + let (lower, _) = it.size_hint(); + let mut builder = StringREEArrayBuilder::with_capacity(lower, 256); + it.for_each(|i| { + builder + .append_value(i) + .expect("Unable to append a value to a dictionary array."); + }); + + builder.finish() + } +} +/// +/// A [`RunEndEncodedArray`] array where indexes of run ends are defined using `i16` data type. +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::{Array, Int16RunEndEncodedArray, Int16Array, StringArray}; +/// # use std::sync::Arc; +/// +/// let array: Int16RunEndEncodedArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); +/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); +/// assert_eq!(array.run_ends(), &Int16Array::from(vec![2, 3, 5])); +/// assert_eq!(array.values(), &values); +/// ``` +pub type Int16RunEndEncodedArray = RunEndEncodedArray; + +/// +/// A [`RunEndEncodedArray`] array where indexes of run ends are defined using `i32` data type.
+/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::{Array, Int32RunEndEncodedArray, Int32Array, StringArray}; +/// # use std::sync::Arc; +/// +/// let array: Int32RunEndEncodedArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); +/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); +/// assert_eq!(array.run_ends(), &Int32Array::from(vec![2, 3, 5])); +/// assert_eq!(array.values(), &values); +/// ``` +pub type Int32RunEndEncodedArray = RunEndEncodedArray; + +/// +/// A [`RunEndEncodedArray`] array where indexes of run ends are defined using `i64` data type. +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::{Array, Int64RunEndEncodedArray, Int64Array, StringArray}; +/// # use std::sync::Arc; +/// +/// let array: Int64RunEndEncodedArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); +/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); +/// assert_eq!(array.run_ends(), &Int64Array::from(vec![2, 3, 5])); +/// assert_eq!(array.values(), &values); +/// ``` +pub type Int64RunEndEncodedArray = RunEndEncodedArray; + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use super::*; + use crate::builder::PrimitiveREEArrayBuilder; + use crate::types::{Int16Type, Int32Type, UInt32Type}; + use crate::{Array, Int16Array, Int32Array, StringArray}; + use arrow_buffer::{Buffer, ToByteSlice}; + use arrow_schema::Field; + + #[test] + fn test_ree_array() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int8) + .len(8) + .add_buffer(Buffer::from( + &[10_i8, 11, 12, 13, 14, 15, 16, 17].to_byte_slice(), + )) + .build() + .unwrap(); + + // Construct a run_ends array: + let run_ends_data = ArrayData::builder(DataType::Int16) + .len(8) + .add_buffer(Buffer::from( + &[4_i16, 6, 7, 9, 13, 18, 20, 22].to_byte_slice(), + )) + .build() + .unwrap(); + + // Construct a run ends encoded array from the above two + let run_ends_type = Field::new("run_ends", DataType::Int16,
false); + let value_type = Field::new("values", DataType::Int8, true); + let ree_array_type = + DataType::RunEndEncoded(Box::new(run_ends_type), Box::new(value_type)); + let dict_data = ArrayData::builder(ree_array_type.clone()) + .add_child_data(run_ends_data.clone()) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let ree_array = Int16RunEndEncodedArray::from(dict_data); + + let values = ree_array.values(); + assert_eq!(&value_data, values.data()); + assert_eq!(&DataType::Int8, values.data_type()); + + let run_ends = ree_array.run_ends(); + assert_eq!(&run_ends_data, run_ends.data()); + assert_eq!(&DataType::Int16, run_ends.data_type()); + } + + #[test] + fn test_ree_array_fmt_debug() { + let mut builder = + PrimitiveREEArrayBuilder::::with_capacity(3); + builder.append_value(12345678).unwrap(); + builder.append_null().unwrap(); + builder.append_value(22345678).unwrap(); + let array = builder.finish(); + assert_eq!( + "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 1,\n 2,\n 3,\n], values: PrimitiveArray\n[\n 12345678,\n null,\n 22345678,\n]}\n", + format!("{:?}", array) + ); + + let mut builder = + PrimitiveREEArrayBuilder::::with_capacity(20); + for _ in 0..20 { + builder.append_value(1).unwrap(); + } + let array = builder.finish(); + assert_eq!( + "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 20,\n], values: PrimitiveArray\n[\n 1,\n]}\n", + format!("{:?}", array) + ); + } + + #[test] + fn test_ree_array_from_iter() { + let test = vec!["a", "a", "b", "c"]; + let array: RunEndEncodedArray = test + .iter() + .map(|&x| if x == "b" { None } else { Some(x) }) + .collect(); + assert_eq!( + "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n null,\n \"c\",\n]}\n", + format!("{:?}", array) + ); + + let array: RunEndEncodedArray = test.into_iter().collect(); + assert_eq!( + "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", 
+ format!("{:?}", array) + ); + } + + #[test] + fn test_ree_array_run_ends_as_primitive_array() { + let test = vec!["a", "b", "c", "a"]; + let array: RunEndEncodedArray = test.into_iter().collect(); + + let run_ends = array.run_ends(); + assert_eq!(&DataType::Int16, run_ends.data_type()); + assert_eq!(0, run_ends.null_count()); + assert_eq!(&[1, 2, 3, 4], run_ends.values()); + } + + #[test] + fn test_ree_array_as_primitive_array_with_null() { + let test = vec![Some("a"), None, Some("b"), None, None, Some("a")]; + let array: RunEndEncodedArray = test.into_iter().collect(); + + let run_ends = array.run_ends(); + assert_eq!(&DataType::Int32, run_ends.data_type()); + assert_eq!(0, run_ends.null_count()); + assert_eq!(5, run_ends.len()); + assert_eq!(&[1, 2, 3, 5, 6], run_ends.values()); + + let values_data = array.values(); + assert_eq!(2, values_data.null_count()); + assert_eq!(5, values_data.len()); + } + + #[test] + fn test_ree_array_all_nulls() { + let test = vec![None, None, None]; + let array: RunEndEncodedArray = test.into_iter().collect(); + + let run_ends = array.run_ends(); + assert_eq!(1, run_ends.len()); + assert_eq!(&[3], run_ends.values()); + + let values_data = array.values(); + assert_eq!(1, values_data.null_count()); + } + + #[test] + fn test_ree_array_try_new() { + let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")] + .into_iter() + .collect(); + let run_ends: Int32Array = + [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); + + let array = RunEndEncodedArray::::try_new(&run_ends, &values).unwrap(); + assert_eq!(array.run_ends().data_type(), &DataType::Int32); + assert_eq!(array.values().data_type(), &DataType::Utf8); + + assert_eq!(array.run_ends.null_count(), 0); + assert_eq!(array.values().null_count(), 1); + + assert_eq!( + "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 1,\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"foo\",\n \"bar\",\n null,\n \"baz\",\n]}\n", + format!("{:?}", array) + ); + } + + #[test] + fn 
test_ree_array_int16_type_definition() { + let array: Int16RunEndEncodedArray = + vec!["a", "a", "b", "c", "c"].into_iter().collect(); + let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); + assert_eq!(array.run_ends(), &Int16Array::from(vec![2, 3, 5])); + assert_eq!(array.values(), &values); + } + + #[test] + fn test_ree_array_length_mismatch() { + let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")] + .into_iter() + .collect(); + let run_ends: Int32Array = [Some(1), Some(2), Some(3)].into_iter().collect(); + + let actual = RunEndEncodedArray::::try_new(&run_ends, &values); + let expected = ArrowError::InvalidArgumentError("The run_ends array length should be the same as values array length. Run_ends array length is 3, values array length is 4".to_string()); + assert_eq!( + format!("{}", expected), + format!("{}", actual.err().unwrap()) + ); + } + + #[test] + fn test_ree_array_run_ends_with_null() { + let values: StringArray = [Some("foo"), Some("bar"), Some("baz")] + .into_iter() + .collect(); + let run_ends: Int32Array = [Some(1), None, Some(3)].into_iter().collect(); + + let actual = RunEndEncodedArray::::try_new(&run_ends, &values); + let expected = ArrowError::InvalidArgumentError("Found null values in run_ends array. 
The run_ends array should not have null values.".to_string()); + assert_eq!( + format!("{}", expected), + format!("{}", actual.err().unwrap()) + ); + } + + #[test] + #[should_panic( + expected = "Data type mismatch for run_ends array, expected Int64 got Int32" + )] + fn test_ree_array_run_ends_data_type_mismatch() { + let a = RunEndEncodedArray::::from_iter(["32"]); + let _ = RunEndEncodedArray::::from(a.into_data()); + } +} diff --git a/arrow-array/src/builder/generic_byte_ree_array_builder.rs b/arrow-array/src/builder/generic_byte_ree_array_builder.rs new file mode 100644 index 000000000000..7cd291f7c739 --- /dev/null +++ b/arrow-array/src/builder/generic_byte_ree_array_builder.rs @@ -0,0 +1,419 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::{ + types::{ + ArrowRunEndIndexType, BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, + Utf8Type, + }, + ArrowPrimitiveType, RunEndEncodedArray, +}; + +use super::{GenericByteBuilder, PrimitiveBuilder}; + +use arrow_buffer::ArrowNativeType; +use arrow_schema::ArrowError; + +/// Array builder for [`RunEndEncodedArray`] for String and Binary types.
+/// +/// # Example: +/// +/// ``` +/// +/// # use arrow_array::builder::GenericByteREEArrayBuilder; +/// # use arrow_array::{GenericByteArray, BinaryArray}; +/// # use arrow_array::types::{BinaryType, Int16Type}; +/// # use arrow_array::{Array, Int16Array}; +/// +/// let mut builder = +/// GenericByteREEArrayBuilder::::new(); +/// builder.append_value(b"abc").unwrap(); +/// builder.append_value(b"abc").unwrap(); +/// builder.append_null().unwrap(); +/// builder.append_value(b"def").unwrap(); +/// let array = builder.finish(); +/// +/// assert_eq!( +/// array.run_ends(), +/// &Int16Array::from(vec![Some(2), Some(3), Some(4)]) +/// ); +/// +/// let av = array.values(); +/// +/// assert!(!av.is_null(0)); +/// assert!(av.is_null(1)); +/// assert!(!av.is_null(2)); +/// +/// // Values are polymorphic and so require a downcast. +/// let ava: &BinaryArray = av.as_any().downcast_ref::().unwrap(); +/// +/// assert_eq!(ava.value(0), b"abc"); +/// assert_eq!(ava.value(2), b"def"); +/// ``` +#[derive(Debug)] +pub struct GenericByteREEArrayBuilder +where + R: ArrowPrimitiveType, + V: ByteArrayType, +{ + run_ends_builder: PrimitiveBuilder, + values_builder: GenericByteBuilder, + current_value: Option>, + current_run_end_index: usize, +} + +impl Default for GenericByteREEArrayBuilder +where + R: ArrowPrimitiveType, + V: ByteArrayType, +{ + fn default() -> Self { + Self::new() + } +} + +impl GenericByteREEArrayBuilder +where + R: ArrowPrimitiveType, + V: ByteArrayType, +{ + /// Creates a new `GenericByteREEArrayBuilder` + pub fn new() -> Self { + Self { + run_ends_builder: PrimitiveBuilder::new(), + values_builder: GenericByteBuilder::::new(), + current_value: None, + current_run_end_index: 0, + } + } + + /// Creates a new `GenericByteREEArrayBuilder` with the provided capacity + /// + /// `capacity`: the expected number of run-end encoded values. 
+ /// `data_capacity`: the expected number of bytes of run end encoded values + pub fn with_capacity(capacity: usize, data_capacity: usize) -> Self { + Self { + run_ends_builder: PrimitiveBuilder::with_capacity(capacity), + values_builder: GenericByteBuilder::::with_capacity( + capacity, + data_capacity, + ), + current_value: None, + current_run_end_index: 0, + } + } +} + +impl GenericByteREEArrayBuilder +where + R: ArrowRunEndIndexType, + V: ByteArrayType, +{ + /// Appends Option to the logical array encoded by the RunEndEncodedArray. + pub fn append_option( + &mut self, + input_value: Option>, + ) -> Result<(), ArrowError> { + match input_value { + Some(value) => self.append_value(value)?, + None => self.append_null()?, + } + Ok(()) + } + /// Appends value to the logical array encoded by the RunEndEncodedArray. + pub fn append_value( + &mut self, + input_value: impl AsRef, + ) -> Result<(), ArrowError> { + let value: &[u8] = input_value.as_ref().as_ref(); + match self.current_value.as_deref() { + None if self.current_run_end_index > 0 => { + self.append_run_end()?; + self.current_value = Some(value.to_owned()); + } + None if self.current_run_end_index == 0 => { + self.current_value = Some(value.to_owned()); + } + Some(current_value) if current_value != value => { + self.append_run_end()?; + self.current_value = Some(value.to_owned()); + } + _ => {} + } + self.current_run_end_index = self + .current_run_end_index + .checked_add(1) + .ok_or(ArrowError::RunEndIndexOverflowError)?; + Ok(()) + } + /// Appends null to the logical array encoded by the RunEndEncodedArray. + pub fn append_null(&mut self) -> Result<(), ArrowError> { + if self.current_value.is_some() { + self.append_run_end()?; + self.current_value = None; + } + self.current_run_end_index = self + .current_run_end_index + .checked_add(1) + .ok_or(ArrowError::RunEndIndexOverflowError)?; + Ok(()) + } + /// Creates the RunEndEncodedArray and resets the builder. 
+ /// Panics if RunEndEncodedArray cannot be built. + pub fn finish(&mut self) -> RunEndEncodedArray { + //write the last run end to the array. + self.append_run_end().unwrap(); + + //reset the run end index to zero. + self.current_value = None; + self.current_run_end_index = 0; + + //build the run encoded array by adding run_ends and values array as its children. + let run_ends_array = self.run_ends_builder.finish(); + let values_array = self.values_builder.finish(); + RunEndEncodedArray::::try_new(&run_ends_array, &values_array).unwrap() + } + /// Creates the RunEndEncodedArray and without resetting the builder. + /// Panics if RunEndEncodedArray cannot be built. + pub fn finish_cloned(&mut self) -> RunEndEncodedArray { + //write the last run end to the array. + self.append_run_end().unwrap(); + + //build the run encoded array by adding run_ends and values array as its children. + let run_ends_array = self.run_ends_builder.finish_cloned(); + let values_array = self.values_builder.finish_cloned(); + RunEndEncodedArray::::try_new(&run_ends_array, &values_array).unwrap() + } + + //Appends the current run to the array + fn append_run_end(&mut self) -> Result<(), ArrowError> { + let run_end_index = R::Native::from_usize(self.current_run_end_index) + .ok_or_else(|| { + ArrowError::ParseError(format!( + "Cannot convert the value {} from `usize` to native form of arrow datatype {}", + self.current_run_end_index, + R::DATA_TYPE + )) + })?; + self.run_ends_builder.append_value(run_end_index); + match self.current_value.as_deref() { + Some(value) => self.values_builder.append_slice(value), + None => self.values_builder.append_null(), + } + Ok(()) + } +} + +/// Array builder for [`RunEndEncodedArray`] that encodes strings ([`Utf8Type`]). +/// +/// ``` +/// // Create a run-end encoded array with run-end indexes data type as `i16`. +/// // The encoded values are Strings. 
+/// +/// # use arrow_array::builder::StringREEArrayBuilder; +/// # use arrow_array::{Int16Array, StringArray}; +/// # use arrow_array::types::Int16Type; +/// +/// let mut builder = StringREEArrayBuilder::::new(); +/// +/// // The builder builds the run-end encoded array value by value +/// builder.append_value("abc").unwrap(); +/// builder.append_null().unwrap(); +/// builder.append_value("def").unwrap(); +/// builder.append_value("def").unwrap(); +/// builder.append_value("abc").unwrap(); +/// let array = builder.finish(); +/// +/// assert_eq!( +/// array.run_ends(), +/// &Int16Array::from(vec![Some(1), Some(2), Some(4), Some(5)]) +/// ); +/// +/// // Values are polymorphic and so require a downcast. +/// let av = array.values(); +/// let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); +/// +/// assert_eq!(ava.value(0), "abc"); +/// assert!(av.is_null(1)); +/// assert_eq!(ava.value(2), "def"); +/// assert_eq!(ava.value(3), "abc"); +/// +/// ``` +pub type StringREEArrayBuilder = GenericByteREEArrayBuilder; + +/// Array builder for [`RunEndEncodedArray`] that encodes large strings ([`LargeUtf8Type`]). See [`StringREEArrayBuilder`] for an example. +pub type LargeStringREEArrayBuilder = GenericByteREEArrayBuilder; + +/// Array builder for [`RunEndEncodedArray`] that encodes binary values ([`BinaryType`]). +/// +/// ``` +/// // Create a run-end encoded array with run-end indexes data type as `i16`. +/// // The encoded data is binary values.
+/// +/// # use arrow_array::builder::BinaryREEArrayBuilder; +/// # use arrow_array::{BinaryArray, Int16Array}; +/// # use arrow_array::types::Int16Type; +/// +/// let mut builder = BinaryREEArrayBuilder::::new(); +/// +/// // The builder builds the run-end encoded array value by value +/// builder.append_value(b"abc").unwrap(); +/// builder.append_null().unwrap(); +/// builder.append_value(b"def").unwrap(); +/// builder.append_value(b"def").unwrap(); +/// builder.append_value(b"abc").unwrap(); +/// let array = builder.finish(); +/// +/// assert_eq!( +/// array.run_ends(), +/// &Int16Array::from(vec![Some(1), Some(2), Some(4), Some(5)]) +/// ); +/// +/// // Values are polymorphic and so require a downcast. +/// let av = array.values(); +/// let ava: &BinaryArray = av.as_any().downcast_ref::().unwrap(); +/// +/// assert_eq!(ava.value(0), b"abc"); +/// assert!(av.is_null(1)); +/// assert_eq!(ava.value(2), b"def"); +/// assert_eq!(ava.value(3), b"abc"); +/// +/// ``` +pub type BinaryREEArrayBuilder = GenericByteREEArrayBuilder; + +/// Array builder for [`RunEndEncodedArray`] that encodes large binary values ([`LargeBinaryType`]). +/// See documentation of [`BinaryREEArrayBuilder`] for an example.
+pub type LargeBinaryREEArrayBuilder = GenericByteREEArrayBuilder; + +#[cfg(test)] +mod tests { + use super::*; + + use crate::array::Array; + use crate::types::Int16Type; + use crate::GenericByteArray; + use crate::Int16Array; + + fn test_bytes_ree_array_buider(values: Vec<&T::Native>) + where + T: ByteArrayType, + ::Native: PartialEq, + ::Native: AsRef<::Native>, + { + let mut builder = GenericByteREEArrayBuilder::::new(); + builder.append_value(values[0]).unwrap(); + builder.append_value(values[0]).unwrap(); + builder.append_value(values[0]).unwrap(); + builder.append_null().unwrap(); + builder.append_null().unwrap(); + builder.append_value(values[1]).unwrap(); + builder.append_value(values[1]).unwrap(); + let array = builder.finish(); + + assert_eq!( + array.run_ends(), + &Int16Array::from(vec![Some(3), Some(5), Some(7)]) + ); + + // Values are polymorphic and so require a downcast. + let av = array.values(); + let ava: &GenericByteArray = + av.as_any().downcast_ref::>().unwrap(); + + assert_eq!(*ava.value(0), *values[0]); + assert!(ava.is_null(1)); + assert_eq!(*ava.value(2), *values[1]); + } + + #[test] + fn test_string_ree_array_buider() { + test_bytes_ree_array_buider::(vec!["abc", "def"]); + } + + #[test] + fn test_binary_ree_array_buider() { + test_bytes_ree_array_buider::(vec![b"abc", b"def"]); + } + + fn test_bytes_ree_array_buider_finish_cloned(values: Vec<&T::Native>) + where + T: ByteArrayType, + ::Native: PartialEq, + ::Native: AsRef<::Native>, + { + let mut builder = GenericByteREEArrayBuilder::::new(); + + builder.append_value(values[0]).unwrap(); + builder.append_null().unwrap(); + builder.append_value(values[1]).unwrap(); + builder.append_value(values[1]).unwrap(); + builder.append_value(values[0]).unwrap(); + let mut array = builder.finish_cloned(); + + assert_eq!( + array.run_ends(), + &Int16Array::from(vec![Some(1), Some(2), Some(4), Some(5)]) + ); + + // Values are polymorphic and so require a downcast. 
+ let av = array.values(); + let ava: &GenericByteArray = + av.as_any().downcast_ref::>().unwrap(); + + assert_eq!(ava.value(0), values[0]); + assert!(ava.is_null(1)); + assert_eq!(ava.value(2), values[1]); + assert_eq!(ava.value(3), values[0]); + + builder.append_value(values[0]).unwrap(); + builder.append_value(values[0]).unwrap(); + builder.append_value(values[1]).unwrap(); + + array = builder.finish(); + + assert_eq!( + array.run_ends(), + &Int16Array::from( + vec![Some(1), Some(2), Some(4), Some(5), Some(7), Some(8),] + ) + ); + + // Values are polymorphic and so require a downcast. + let av2 = array.values(); + let ava2: &GenericByteArray = + av2.as_any().downcast_ref::>().unwrap(); + + assert_eq!(ava2.value(0), values[0]); + assert!(ava2.is_null(1)); + assert_eq!(ava2.value(2), values[1]); + assert_eq!(ava2.value(3), values[0]); + assert_eq!(ava2.value(4), values[0]); + assert_eq!(ava2.value(5), values[1]); + } + + #[test] + fn test_string_ree_array_buider_finish_cloned() { + test_bytes_ree_array_buider_finish_cloned::(vec!["abc", "def", "ghi"]); + } + + #[test] + fn test_binary_ree_array_buider_finish_cloned() { + test_bytes_ree_array_buider_finish_cloned::(vec![ + b"abc", b"def", b"ghi", + ]); + } +} diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index 73600d9e0a38..d4718cf1e443 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -99,6 +99,15 @@ impl GenericByteBuilder { self.offsets_builder.append(self.next_offset()); } + /// Appends a byte array slice into the builder. + #[inline] + pub fn append_slice(&mut self, value: &[u8]) { + self.value_builder.append_slice(value); + self.null_buffer_builder.append(true); + self.offsets_builder + .append(T::Offset::from_usize(self.value_builder.len()).unwrap()); + } + /// Append an `Option` value into the builder. 
#[inline] pub fn append_option(&mut self, value: Option>) { diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 820ecd23bc5e..f08676bd0bdd 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -39,10 +39,14 @@ mod primitive_builder; pub use primitive_builder::*; mod primitive_dictionary_builder; pub use primitive_dictionary_builder::*; +mod primitive_ree_array_builder; +pub use primitive_ree_array_builder::*; mod struct_builder; pub use struct_builder::*; mod generic_bytes_dictionary_builder; pub use generic_bytes_dictionary_builder::*; +mod generic_byte_ree_array_builder; +pub use generic_byte_ree_array_builder::*; mod union_builder; pub use union_builder::*; diff --git a/arrow-array/src/builder/primitive_ree_array_builder.rs b/arrow-array/src/builder/primitive_ree_array_builder.rs new file mode 100644 index 000000000000..5f07c910e71e --- /dev/null +++ b/arrow-array/src/builder/primitive_ree_array_builder.rs @@ -0,0 +1,218 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::{types::ArrowRunEndIndexType, ArrowPrimitiveType, RunEndEncodedArray}; + +use super::PrimitiveBuilder; + +use arrow_buffer::ArrowNativeType; +use arrow_schema::ArrowError; + +/// Array builder for [`RunEndEncodedArray`] that encodes primitive values. +/// +/// # Example: +/// +/// ``` +/// +/// # use arrow_array::builder::PrimitiveREEArrayBuilder; +/// # use arrow_array::types::{UInt32Type, Int16Type}; +/// # use arrow_array::{Array, UInt32Array, Int16Array}; +/// +/// let mut builder = +/// PrimitiveREEArrayBuilder::::new(); +/// builder.append_value(1234).unwrap(); +/// builder.append_value(1234).unwrap(); +/// builder.append_value(1234).unwrap(); +/// builder.append_null().unwrap(); +/// builder.append_value(5678).unwrap(); +/// builder.append_value(5678).unwrap(); +/// let array = builder.finish(); +/// +/// assert_eq!( +/// array.run_ends(), +/// &Int16Array::from(vec![Some(3), Some(4), Some(6)]) +/// ); +/// +/// let av = array.values(); +/// +/// assert!(!av.is_null(0)); +/// assert!(av.is_null(1)); +/// assert!(!av.is_null(2)); +/// +/// // Values are polymorphic and so require a downcast. 
+/// let ava: &UInt32Array = av.as_any().downcast_ref::().unwrap(); +/// +/// assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)])); +/// ``` +#[derive(Debug)] +pub struct PrimitiveREEArrayBuilder +where + R: ArrowRunEndIndexType, + V: ArrowPrimitiveType, +{ + run_ends_builder: PrimitiveBuilder, + values_builder: PrimitiveBuilder, + current_value: Option, + current_run_end_index: usize, +} + +impl Default for PrimitiveREEArrayBuilder +where + R: ArrowRunEndIndexType, + V: ArrowPrimitiveType, +{ + fn default() -> Self { + Self::new() + } +} + +impl PrimitiveREEArrayBuilder +where + R: ArrowRunEndIndexType, + V: ArrowPrimitiveType, +{ + /// Creates a new `PrimitiveREEArrayBuilder` + pub fn new() -> Self { + Self { + run_ends_builder: PrimitiveBuilder::new(), + values_builder: PrimitiveBuilder::new(), + current_value: None, + current_run_end_index: 0, + } + } + + /// Creates a new `PrimitiveREEArrayBuilder` with the provided capacity + /// + /// `capacity`: the expected number of run-end encoded values. + pub fn with_capacity(capacity: usize) -> Self { + Self { + run_ends_builder: PrimitiveBuilder::with_capacity(capacity), + values_builder: PrimitiveBuilder::with_capacity(capacity), + current_value: None, + current_run_end_index: 0, + } + } +} + +impl PrimitiveREEArrayBuilder +where + R: ArrowRunEndIndexType, + V: ArrowPrimitiveType, +{ + /// Appends Option to the logical array encoded by the RunEndEncodedArray. + pub fn append_option(&mut self, value: Option) -> Result<(), ArrowError> { + if self.current_run_end_index == 0 { + self.current_run_end_index = 1; + self.current_value = value; + return Ok(()); + } + if self.current_value != value { + self.append_run_end()?; + self.current_value = value; + } + + self.current_run_end_index = self + .current_run_end_index + .checked_add(1) + .ok_or(ArrowError::RunEndIndexOverflowError)?; + + Ok(()) + } + /// Appends value to the logical array encoded by the run-ends array. 
+ pub fn append_value(&mut self, value: V::Native) -> Result<(), ArrowError> { + self.append_option(Some(value)) + } + /// Appends null to the logical array encoded by the run-ends array. + pub fn append_null(&mut self) -> Result<(), ArrowError> { + self.append_option(None) + } + /// Creates the RunEndEncodedArray and resets the builder. + /// Panics if RunEndEncodedArray cannot be built. + pub fn finish(&mut self) -> RunEndEncodedArray { + //write the last run end to the array. + self.append_run_end().unwrap(); + + //reset the run index to zero. + self.current_value = None; + self.current_run_end_index = 0; + + //build the run encoded array by adding run_ends and values array as its children. + let run_ends_array = self.run_ends_builder.finish(); + let values_array = self.values_builder.finish(); + RunEndEncodedArray::::try_new(&run_ends_array, &values_array).unwrap() + } + /// Creates the RunEndEncodedArray and without resetting the builder. + /// Panics if RunEndEncodedArray cannot be built. + pub fn finish_cloned(&mut self) -> RunEndEncodedArray { + //write the last run end to the array. + self.append_run_end().unwrap(); + + //build the run encoded array by adding run_ends and values array as its children. 
+ let run_ends_array = self.run_ends_builder.finish_cloned(); + let values_array = self.values_builder.finish_cloned(); + RunEndEncodedArray::::try_new(&run_ends_array, &values_array).unwrap() + } + + //Appends the current run to the array + fn append_run_end(&mut self) -> Result<(), ArrowError> { + let run_end_index = R::Native::from_usize(self.current_run_end_index) + .ok_or_else(|| { + ArrowError::ParseError(format!( + "Cannot convert the value {} from `usize` to native form of arrow datatype {}", + self.current_run_end_index, + R::DATA_TYPE + )) + })?; + self.run_ends_builder.append_value(run_end_index); + self.values_builder.append_option(self.current_value); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::builder::PrimitiveREEArrayBuilder; + use crate::types::{Int16Type, UInt32Type}; + use crate::{Int16Array, UInt32Array}; + #[test] + fn test_primitive_ree_array_builder() { + let mut builder = PrimitiveREEArrayBuilder::::new(); + builder.append_value(1234).unwrap(); + builder.append_value(1234).unwrap(); + builder.append_value(1234).unwrap(); + builder.append_null().unwrap(); + builder.append_value(5678).unwrap(); + builder.append_value(5678).unwrap(); + let array = builder.finish(); + + assert_eq!( + array.run_ends(), + &Int16Array::from(vec![Some(3), Some(4), Some(6)]) + ); + + let av = array.values(); + + assert!(!av.is_null(0)); + assert!(av.is_null(1)); + assert!(!av.is_null(2)); + + // Values are polymorphic and so require a downcast. 
+ let ava: &UInt32Array = av.as_any().downcast_ref::().unwrap(); + + assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)])); + } +} diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 7c41a469e30e..8f7dfed34808 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -240,6 +240,17 @@ impl ArrowDictionaryKeyType for UInt32Type {} impl ArrowDictionaryKeyType for UInt64Type {} +/// A subtype of primitive type that is used as run-ends index +/// in RunEndEncodedArray. +/// See +pub trait ArrowRunEndIndexType: ArrowPrimitiveType {} + +impl ArrowRunEndIndexType for Int16Type {} + +impl ArrowRunEndIndexType for Int32Type {} + +impl ArrowRunEndIndexType for Int64Type {} + /// A subtype of primitive type that represents temporal values. pub trait ArrowTemporalType: ArrowPrimitiveType {} diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 258ee082da1b..ee0099009109 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -198,9 +198,9 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff ], _ => unreachable!(), }, - DataType::FixedSizeList(_, _) | DataType::Struct(_) => { - [empty_buffer, MutableBuffer::new(0)] - } + DataType::FixedSizeList(_, _) + | DataType::Struct(_) + | DataType::RunEndEncoded(_, _) => [empty_buffer, MutableBuffer::new(0)], DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => [ MutableBuffer::new(capacity * mem::size_of::()), empty_buffer, @@ -724,6 +724,12 @@ impl ArrayData { DataType::Dictionary(_, data_type) => { vec![Self::new_empty(data_type)] } + DataType::RunEndEncoded(run_ends, values) => { + vec![ + Self::new_empty(run_ends.data_type()), + Self::new_empty(values.data_type()), + ] + } }; // Data was constructed correctly above @@ -853,6 +859,19 @@ impl ArrayData { ))); } } + DataType::RunEndEncoded(run_ends_type, _) => { + if run_ends_type.is_nullable() { + return Err(ArrowError::InvalidArgumentError( + "The nullable should be set to 
false for the field defining run_ends array.".to_string() + )); + } + if !DataType::is_ree_run_ends_type(run_ends_type.data_type()) { + return Err(ArrowError::InvalidArgumentError(format!( + "RunEndEncodedArray run_ends types must be Int16, Int32 or Int64, but was {}", + run_ends_type.data_type() + ))); + } + } _ => {} }; @@ -998,6 +1017,25 @@ impl ArrayData { } Ok(()) } + DataType::RunEndEncoded(run_ends_field, values_field) => { + self.validate_num_child_data(2)?; + let run_ends_data = + self.get_valid_child_data(0, run_ends_field.data_type())?; + let values_data = + self.get_valid_child_data(1, values_field.data_type())?; + if run_ends_data.len != values_data.len { + return Err(ArrowError::InvalidArgumentError(format!( + "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}", + run_ends_data.len, values_data.len + ))); + } + if run_ends_data.null_count() > 0 { + return Err(ArrowError::InvalidArgumentError( + "Found null values in run_ends array. The run_ends array should not have null values.".to_string(), + )); + } + Ok(()) + } DataType::Union(fields, _, mode) => { self.validate_num_child_data(fields.len())?; @@ -1286,6 +1324,12 @@ impl ArrayData { _ => unreachable!(), } } + DataType::RunEndEncoded(run_ends, _values) => match run_ends.data_type() { + DataType::Int16 => self.check_run_ends::(), + DataType::Int32 => self.check_run_ends::(), + DataType::Int64 => self.check_run_ends::(), + _ => unreachable!(), + }, _ => { // No extra validation check required for other types Ok(()) @@ -1446,6 +1490,14 @@ impl ArrayData { }) } + /// Validates that each value in run_ends array is posittive and strictly increasing. + fn check_run_ends(&self) -> Result<(), ArrowError> + where + T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, + { + //todo!(); + Ok(()) + } /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons /// to determine buffer equality. 
This is cheaper than `PartialEq::eq` but may /// return false when the arrays are logically equal @@ -1542,6 +1594,7 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data DataType::LargeList(_) => DataTypeLayout::new_fixed_width(size_of::()), DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data, + DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data, DataType::Union(_, _, mode) => { let type_ids = BufferSpec::FixedWidth { byte_width: size_of::(), diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index 85c595cfed1c..aff61e3d37e5 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -137,6 +137,7 @@ fn equal_values( }, DataType::Float16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Map(_, _) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::RunEndEncoded(_, _) => todo!(), } } diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index 6a8c89d25a22..2a24b1cc2662 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -230,6 +230,7 @@ fn build_extend(array: &ArrayData) -> Extend { UnionMode::Sparse => union::build_extend_sparse(array), UnionMode::Dense => union::build_extend_dense(array), }, + DataType::RunEndEncoded(_, _) => todo!(), } } @@ -281,6 +282,7 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { UnionMode::Sparse => union::extend_nulls_sparse, UnionMode::Dense => union::extend_nulls_dense, }, + DataType::RunEndEncoded(_, _) => todo!(), }) } @@ -473,6 +475,20 @@ impl<'a> MutableArrayData<'a> { }) .collect::>(), }, + DataType::RunEndEncoded(_, _) => { + let run_ends_child = arrays + .iter() + .map(|array| &array.child_data()[0]) + .collect::>(); + let value_child = arrays + .iter() + .map(|array| &array.child_data()[1]) + .collect::>(); + vec![ + 
MutableArrayData::new(run_ends_child, false, array_capacity), + MutableArrayData::new(value_child, use_nulls, array_capacity), + ] + } DataType::FixedSizeList(_, _) => { let childs = arrays .iter() diff --git a/arrow-integration-test/src/datatype.rs b/arrow-integration-test/src/datatype.rs index dd0b95b0a836..c2e326b4f2f3 100644 --- a/arrow-integration-test/src/datatype.rs +++ b/arrow-integration-test/src/datatype.rs @@ -357,6 +357,7 @@ pub fn data_type_to_json(data_type: &DataType) -> serde_json::Value { DataType::Map(_, keys_sorted) => { json!({"name": "map", "keysSorted": keys_sorted}) } + DataType::RunEndEncoded(_, _) => todo!(), } } diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index a60a19b866cb..305bb943cbbf 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -711,6 +711,7 @@ pub(crate) fn get_fb_field_type<'a>( children: Some(fbb.create_vector(&children[..])), } } + RunEndEncoded(_, _) => todo!(), Map(map_field, keys_sorted) => { let child = build_field(fbb, map_field); let mut field_type = crate::MapBuilder::new(fbb); diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index da1c20ddbd38..765f7f8e7874 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -242,6 +242,18 @@ pub enum DataType { /// child fields may be respectively "entries", "key", and "value", but this is /// not enforced. Map(Box, bool), + /// A run-end encoding (REE) is a variation of run-length encoding (RLE). These + /// encodings are well-suited for representing data containing sequences of the + /// same value, called runs. Each run is represented as a value and an integer giving + /// the index in the array where the run ends. + /// + /// A run-end encoded array has no buffers by itself, but has two child arrays. The + /// first child array, called the run ends array, holds either 16, 32, or 64-bit + /// signed integers. The actual values of each run are held in the second child array. 
+ /// + /// These child arrays are prescribed the standard names of "run_ends" and "values" + /// respectively. + RunEndEncoded(Box, Box), } /// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds. @@ -346,6 +358,13 @@ impl DataType { ) } + /// Returns true if this type is valid for run-ends array in RunEndEncodedArray + #[inline] + pub fn is_ree_run_ends_type(&self) -> bool { + use DataType::*; + matches!(self, Int16 | Int32 | Int64) + } + /// Returns true if this type is nested (List, FixedSizeList, LargeList, Struct, Union, /// or Map), or a dictionary of a nested type pub fn is_nested(&self) -> bool { @@ -438,6 +457,10 @@ impl DataType { + (std::mem::size_of::() * fields.capacity()) } DataType::Dictionary(dt1, dt2) => dt1.size() + dt2.size(), + DataType::RunEndEncoded(run_ends, values) => { + run_ends.size() - std::mem::size_of_val(run_ends) + values.size() + - std::mem::size_of_val(values) + } } } } diff --git a/arrow-schema/src/error.rs b/arrow-schema/src/error.rs index ea60572b3d4d..6213af8bcf10 100644 --- a/arrow-schema/src/error.rs +++ b/arrow-schema/src/error.rs @@ -41,6 +41,7 @@ pub enum ArrowError { /// Error during import or export to/from the C Data Interface CDataInterface(String), DictionaryKeyOverflowError, + RunEndIndexOverflowError, } impl ArrowError { @@ -96,6 +97,9 @@ impl Display for ArrowError { ArrowError::DictionaryKeyOverflowError => { write!(f, "Dictionary key bigger than the key type") } + ArrowError::RunEndIndexOverflowError => { + write!(f, "Run end encoded array index overflow error") + } } } } diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index a3275dcb3355..dc3ab3d6237f 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -410,6 +410,7 @@ impl Field { | DataType::List(_) | DataType::Map(_, _) | DataType::Dictionary(_, _) + | DataType::RunEndEncoded(_, _) | DataType::FixedSizeList(_, _) | DataType::FixedSizeBinary(_) | DataType::Utf8 diff --git 
a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 311981593718..c459d40d73b9 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -360,7 +360,7 @@ fn write_leaves( ArrowDataType::Float16 => Err(ParquetError::ArrowError( "Float16 arrays not supported".to_string(), )), - ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Union(_, _, _) => { + ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Union(_, _, _) | ArrowDataType::RunEndEncoded(_, _) => { Err(ParquetError::NYI( format!( "Attempting to write an Arrow type {:?} to parquet that is not yet implemented", diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 2ca4b7ef8a79..d81d6a69bbb9 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -507,6 +507,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { let dict_field = Field::new(name, *value.clone(), field.is_nullable()); arrow_to_parquet_type(&dict_field) } + DataType::RunEndEncoded(_, _) => Err(arrow_err!("Converting RunEndEncodedType to parquet not supported",)) } } From 98c9cb0c0b8a1db58a3245c7f50cc3c4bbf646f7 Mon Sep 17 00:00:00 2001 From: ask Date: Wed, 18 Jan 2023 09:11:55 -0500 Subject: [PATCH 02/20] fix doctest and clippy issues --- arrow-array/src/array/run_end_encoded_array.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arrow-array/src/array/run_end_encoded_array.rs b/arrow-array/src/array/run_end_encoded_array.rs index ce18c3b19811..f118df3bd8e3 100644 --- a/arrow-array/src/array/run_end_encoded_array.rs +++ b/arrow-array/src/array/run_end_encoded_array.rs @@ -173,7 +173,7 @@ impl std::fmt::Debug for RunEndEncodedArray { /// /// # Example: /// ``` -/// use arrow_array::{RunEndEncodedArray, PrimitiveArray, StringArray, types::Int8Type}; +/// use arrow_array::{RunEndEncodedArray, PrimitiveArray, StringArray, types::Int16Type}; /// /// let test = vec!["a", "a", "b", 
"c", "c"]; /// let array: RunEndEncodedArray = test @@ -213,12 +213,12 @@ impl<'a, T: ArrowRunEndIndexType> FromIterator> /// # Example: /// /// ``` -/// use arrow_array::{RunEndEncodedArray, PrimitiveArray, StringArray, types::Int8Type}; +/// use arrow_array::{RunEndEncodedArray, PrimitiveArray, StringArray, types::Int16Type}; /// /// let test = vec!["a", "a", "b", "c"]; /// let array: RunEndEncodedArray = test.into_iter().collect(); /// assert_eq!( -/// "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 5,\n], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", +/// "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", /// format!("{:?}", array) /// ); /// ``` @@ -237,7 +237,7 @@ impl<'a, T: ArrowRunEndIndexType> FromIterator<&'a str> for RunEndEncodedArray FromIterator<&'a str> for RunEndEncodedArray; /// -/// A [`RunEndEncodedArray`] array where indexes of run ends is defined using `i32` data type. +/// A [`RunEndEncodedArray`] array where run ends are stored using `i32` data type. /// /// # Example: Using `collect` /// ``` @@ -267,7 +267,7 @@ pub type Int16RunEndEncodedArray = RunEndEncodedArray; pub type Int32RunEndEncodedArray = RunEndEncodedArray; /// -/// A [`RunEndEncodedArray`] array where indexes of run ends is defined using `i64` data type. +/// A [`RunEndEncodedArray`] array where run ends are stored using `i64` data type. 
/// /// # Example: Using `collect` /// ``` @@ -276,7 +276,7 @@ pub type Int32RunEndEncodedArray = RunEndEncodedArray; /// /// let array: Int64RunEndEncodedArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); /// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); -/// assert_eq!(array.run_ends(), &Int16Array::from(vec![2, 3, 5])); +/// assert_eq!(array.run_ends(), &Int64Array::from(vec![2, 3, 5])); /// assert_eq!(array.values(), &values); /// ``` pub type Int64RunEndEncodedArray = RunEndEncodedArray; @@ -317,7 +317,7 @@ mod tests { let value_type = Field::new("values", DataType::Int8, true); let ree_array_type = DataType::RunEndEncoded(Box::new(run_ends_type), Box::new(value_type)); - let dict_data = ArrayData::builder(ree_array_type.clone()) + let dict_data = ArrayData::builder(ree_array_type) .add_child_data(run_ends_data.clone()) .add_child_data(value_data.clone()) .build() From 88b3637029f130e65412df8f05f76ce1e0fc52d1 Mon Sep 17 00:00:00 2001 From: ask Date: Wed, 18 Jan 2023 09:19:36 -0500 Subject: [PATCH 03/20] fix doc issues --- arrow-array/src/builder/generic_byte_ree_array_builder.rs | 2 +- arrow-array/src/builder/primitive_ree_array_builder.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/builder/generic_byte_ree_array_builder.rs b/arrow-array/src/builder/generic_byte_ree_array_builder.rs index 7cd291f7c739..02385b2c3349 100644 --- a/arrow-array/src/builder/generic_byte_ree_array_builder.rs +++ b/arrow-array/src/builder/generic_byte_ree_array_builder.rs @@ -123,7 +123,7 @@ where R: ArrowRunEndIndexType, V: ByteArrayType, { - /// Appends Option to the logical array encoded by the RunEndEncodedArray. + /// Appends optional value to the logical array encoded by the RunEndEncodedArray. 
pub fn append_option( &mut self, input_value: Option>, diff --git a/arrow-array/src/builder/primitive_ree_array_builder.rs b/arrow-array/src/builder/primitive_ree_array_builder.rs index 5f07c910e71e..7bb6ddb06cf7 100644 --- a/arrow-array/src/builder/primitive_ree_array_builder.rs +++ b/arrow-array/src/builder/primitive_ree_array_builder.rs @@ -113,7 +113,7 @@ where R: ArrowRunEndIndexType, V: ArrowPrimitiveType, { - /// Appends Option to the logical array encoded by the RunEndEncodedArray. + /// Appends optional value to the logical array encoded by the RunEndEncodedArray. pub fn append_option(&mut self, value: Option) -> Result<(), ArrowError> { if self.current_run_end_index == 0 { self.current_run_end_index = 1; From fbb2b5fcc6a19c6c7c88b640b482004dade3fdb7 Mon Sep 17 00:00:00 2001 From: ask Date: Wed, 18 Jan 2023 09:27:44 -0500 Subject: [PATCH 04/20] fix doc issue --- arrow-array/src/builder/generic_byte_ree_array_builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/builder/generic_byte_ree_array_builder.rs b/arrow-array/src/builder/generic_byte_ree_array_builder.rs index 02385b2c3349..68b022d856bf 100644 --- a/arrow-array/src/builder/generic_byte_ree_array_builder.rs +++ b/arrow-array/src/builder/generic_byte_ree_array_builder.rs @@ -28,7 +28,7 @@ use super::{GenericByteBuilder, PrimitiveBuilder}; use arrow_buffer::ArrowNativeType; use arrow_schema::ArrowError; -/// Array builder for [`RunEndEndEncodedArray`] for String and Binary types. +/// Array builder for [`RunEndEncodedArray`] for String and Binary types. 
/// /// # Example: /// From a9893ba8f19d15931a6c426d88dbe08a9e73e467 Mon Sep 17 00:00:00 2001 From: ask Date: Wed, 18 Jan 2023 11:18:27 -0500 Subject: [PATCH 05/20] add validation for run_ends array and corresponding tests --- .../src/array/run_end_encoded_array.rs | 30 +++++++++++++ arrow-data/src/data.rs | 45 +++++++++++++++---- 2 files changed, 67 insertions(+), 8 deletions(-) diff --git a/arrow-array/src/array/run_end_encoded_array.rs b/arrow-array/src/array/run_end_encoded_array.rs index f118df3bd8e3..5509f0656994 100644 --- a/arrow-array/src/array/run_end_encoded_array.rs +++ b/arrow-array/src/array/run_end_encoded_array.rs @@ -477,6 +477,36 @@ mod tests { ); } + #[test] + fn test_ree_array_run_ends_with_zeroes() { + let values: StringArray = [Some("foo"), Some("bar"), Some("baz")] + .into_iter() + .collect(); + let run_ends: Int32Array = [Some(0), Some(1), Some(3)].into_iter().collect(); + + let actual = RunEndEncodedArray::::try_new(&run_ends, &values); + let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly positive. Found value 0 at index 0 that does not match the criteria.".to_string()); + assert_eq!( + format!("{}", expected), + format!("{}", actual.err().unwrap()) + ); + } + + #[test] + fn test_ree_array_run_ends_non_increasing() { + let values: StringArray = [Some("foo"), Some("bar"), Some("baz")] + .into_iter() + .collect(); + let run_ends: Int32Array = [Some(1), Some(4), Some(4)].into_iter().collect(); + + let actual = RunEndEncodedArray::::try_new(&run_ends, &values); + let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly increasing. 
Found value 4 at index 2 with previous value 4 that does not match the criteria.".to_string()); + assert_eq!( + format!("{}", expected), + format!("{}", actual.err().unwrap()) + ); + } + #[test] #[should_panic( expected = "Data type mismatch for run_ends array, expected Int64 got Int32" diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index ee0099009109..a720e17f35b4 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -1324,12 +1324,15 @@ impl ArrayData { _ => unreachable!(), } } - DataType::RunEndEncoded(run_ends, _values) => match run_ends.data_type() { - DataType::Int16 => self.check_run_ends::(), - DataType::Int32 => self.check_run_ends::(), - DataType::Int64 => self.check_run_ends::(), - _ => unreachable!(), - }, + DataType::RunEndEncoded(run_ends, _values) => { + let run_ends_data = self.child_data()[0].clone(); + match run_ends.data_type() { + DataType::Int16 => run_ends_data.check_run_ends::(), + DataType::Int32 => run_ends_data.check_run_ends::(), + DataType::Int64 => run_ends_data.check_run_ends::(), + _ => unreachable!(), + } + } _ => { // No extra validation check required for other types Ok(()) @@ -1495,8 +1498,34 @@ impl ArrayData { where T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, { - //todo!(); - Ok(()) + let values = self.typed_buffer::(0, self.len())?; + let mut prev_value: i64 = 0_i64; + values.iter().enumerate().try_for_each(|(ix, &inp_value)| { + let value: i64 = inp_value.try_into().map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "Value at position {} out of bounds: {} (can not convert to i64)", + ix, inp_value + )) + })?; + if value <= 0_i64 { + return Err(ArrowError::InvalidArgumentError(format!( + "The values in run_ends array should be strictly positive. 
Found value {} at index {} that does not match the criteria.", + value, + ix + ))); + } + if ix > 0 && value <= prev_value { + return Err(ArrowError::InvalidArgumentError(format!( + "The values in run_ends array should be strictly increasing. Found value {} at index {} with previous value {} that does not match the criteria.", + value, + ix, + prev_value + ))); + } + + prev_value = value; + Ok(()) + }) } /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may From 85904cc999787564dc8ea103471e3fac24808744 Mon Sep 17 00:00:00 2001 From: ask Date: Wed, 18 Jan 2023 14:54:12 -0500 Subject: [PATCH 06/20] PR comments --- arrow-array/src/array/mod.rs | 2 +- arrow-array/src/array/run_end_encoded_array.rs | 7 ++++--- .../builder/generic_byte_ree_array_builder.rs | 16 ++++++++++------ .../src/builder/primitive_ree_array_builder.rs | 16 ++++++++++------ arrow-data/src/data.rs | 2 +- 5 files changed, 26 insertions(+), 17 deletions(-) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 1699282caa5d..bda5bbdd97b1 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -593,7 +593,7 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::Int64 => { Arc::new(RunEndEncodedArray::::from(data)) as ArrayRef } - _ => unreachable!(), + dt => panic!("Unexpected data type for run_ends array {:?}", dt), } } DataType::Null => Arc::new(NullArray::from(data)) as ArrayRef, diff --git a/arrow-array/src/array/run_end_encoded_array.rs b/arrow-array/src/array/run_end_encoded_array.rs index 5509f0656994..fb95c6f2ab1e 100644 --- a/arrow-array/src/array/run_end_encoded_array.rs +++ b/arrow-array/src/array/run_end_encoded_array.rs @@ -43,15 +43,15 @@ use crate::{ /// ┌─────────────────┐ ┌─────────┐ ┌─────────────────┐ /// │ │ A │ │ 2 │ │ │ A │ /// ├─────────────────┤ ├─────────┤ ├─────────────────┤ -/// │ │ D │ │ 3 │ │ │ A │ run length of 
'A' = keys[0] - 0 = 2 +/// │ │ D │ │ 3 │ │ │ A │ run length of 'A' = runs_ends[0] - 0 = 2 /// ├─────────────────┤ ├─────────┤ ├─────────────────┤ -/// │ │ B │ │ 6 │ │ │ D │ run length of 'D' = keys[1] - keys[0] = 1 +/// │ │ B │ │ 6 │ │ │ D │ run length of 'D' = run_ends[1] - run_ends[0] = 1 /// └─────────────────┘ └─────────┘ ├─────────────────┤ /// │ values run_ends │ │ B │ /// ├─────────────────┤ /// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┘ │ B │ /// ├─────────────────┤ -/// RunEndEncodedArray │ B │ run length of 'B' = keys[2] - keys[1] = 3 +/// RunEndEncodedArray │ B │ run length of 'B' = run_ends[2] - run_ends[1] = 3 /// length = 3 └─────────────────┘ /// /// Logical array @@ -236,6 +236,7 @@ impl<'a, T: ArrowRunEndIndexType> FromIterator<&'a str> for RunEndEncodedArray Result<(), ArrowError> { if self.current_value.is_some() { @@ -172,34 +174,36 @@ where .ok_or(ArrowError::RunEndIndexOverflowError)?; Ok(()) } + /// Creates the RunEndEncodedArray and resets the builder. /// Panics if RunEndEncodedArray cannot be built. pub fn finish(&mut self) -> RunEndEncodedArray { - //write the last run end to the array. + // write the last run end to the array. self.append_run_end().unwrap(); - //reset the run end index to zero. + // reset the run end index to zero. self.current_value = None; self.current_run_end_index = 0; - //build the run encoded array by adding run_ends and values array as its children. + // build the run encoded array by adding run_ends and values array as its children. let run_ends_array = self.run_ends_builder.finish(); let values_array = self.values_builder.finish(); RunEndEncodedArray::::try_new(&run_ends_array, &values_array).unwrap() } + /// Creates the RunEndEncodedArray and without resetting the builder. /// Panics if RunEndEncodedArray cannot be built. pub fn finish_cloned(&mut self) -> RunEndEncodedArray { - //write the last run end to the array. + // write the last run end to the array. 
self.append_run_end().unwrap(); - //build the run encoded array by adding run_ends and values array as its children. + // build the run encoded array by adding run_ends and values array as its children. let run_ends_array = self.run_ends_builder.finish_cloned(); let values_array = self.values_builder.finish_cloned(); RunEndEncodedArray::::try_new(&run_ends_array, &values_array).unwrap() } - //Appends the current run to the array + // Appends the current run to the array fn append_run_end(&mut self) -> Result<(), ArrowError> { let run_end_index = R::Native::from_usize(self.current_run_end_index) .ok_or_else(|| { diff --git a/arrow-array/src/builder/primitive_ree_array_builder.rs b/arrow-array/src/builder/primitive_ree_array_builder.rs index 7bb6ddb06cf7..6001d4be274a 100644 --- a/arrow-array/src/builder/primitive_ree_array_builder.rs +++ b/arrow-array/src/builder/primitive_ree_array_builder.rs @@ -132,42 +132,46 @@ where Ok(()) } + /// Appends value to the logical array encoded by the run-ends array. pub fn append_value(&mut self, value: V::Native) -> Result<(), ArrowError> { self.append_option(Some(value)) } + /// Appends null to the logical array encoded by the run-ends array. pub fn append_null(&mut self) -> Result<(), ArrowError> { self.append_option(None) } + /// Creates the RunEndEncodedArray and resets the builder. /// Panics if RunEndEncodedArray cannot be built. pub fn finish(&mut self) -> RunEndEncodedArray { - //write the last run end to the array. + // write the last run end to the array. self.append_run_end().unwrap(); - //reset the run index to zero. + // reset the run index to zero. self.current_value = None; self.current_run_end_index = 0; - //build the run encoded array by adding run_ends and values array as its children. + // build the run encoded array by adding run_ends and values array as its children. 
let run_ends_array = self.run_ends_builder.finish(); let values_array = self.values_builder.finish(); RunEndEncodedArray::::try_new(&run_ends_array, &values_array).unwrap() } + /// Creates the RunEndEncodedArray and without resetting the builder. /// Panics if RunEndEncodedArray cannot be built. pub fn finish_cloned(&mut self) -> RunEndEncodedArray { - //write the last run end to the array. + // write the last run end to the array. self.append_run_end().unwrap(); - //build the run encoded array by adding run_ends and values array as its children. + // build the run encoded array by adding run_ends and values array as its children. let run_ends_array = self.run_ends_builder.finish_cloned(); let values_array = self.values_builder.finish_cloned(); RunEndEncodedArray::::try_new(&run_ends_array, &values_array).unwrap() } - //Appends the current run to the array + // Appends the current run to the array fn append_run_end(&mut self) -> Result<(), ArrowError> { let run_end_index = R::Native::from_usize(self.current_run_end_index) .ok_or_else(|| { diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index a720e17f35b4..9a54e9ae9a69 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -1493,7 +1493,7 @@ impl ArrayData { }) } - /// Validates that each value in run_ends array is posittive and strictly increasing. + /// Validates that each value in run_ends array is positive and strictly increasing. 
fn check_run_ends(&self) -> Result<(), ArrowError> where T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, From d5c1089bd3b05bea7544ab7bfc475fc129805a82 Mon Sep 17 00:00:00 2001 From: ask Date: Wed, 18 Jan 2023 15:39:04 -0500 Subject: [PATCH 07/20] seal ArrowRunEndIndexType per PR suggestion --- arrow-array/src/types.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 8f7dfed34808..f8dcfaaf35e2 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -32,6 +32,10 @@ use half::f16; use std::marker::PhantomData; use std::ops::{Add, Sub}; +mod private { + pub trait Sealed {} +} + // BooleanType is special: its bit-width is not the size of the primitive type, and its `index` // operation assumes bit-packing. /// A boolean datatype @@ -243,12 +247,17 @@ impl ArrowDictionaryKeyType for UInt64Type {} /// A subtype of primitive type that is used as run-ends index /// in RunEndEncodedArray. /// See -pub trait ArrowRunEndIndexType: ArrowPrimitiveType {} +/// +/// # Sealed: The implementation of this trait is sealed to avoid accidental misuse. +pub trait ArrowRunEndIndexType: ArrowPrimitiveType + private::Sealed {} +impl private::Sealed for Int16Type {} impl ArrowRunEndIndexType for Int16Type {} +impl private::Sealed for Int32Type {} impl ArrowRunEndIndexType for Int32Type {} +impl private::Sealed for Int64Type {} impl ArrowRunEndIndexType for Int64Type {} /// A subtype of primitive type that represents temporal values. 
From 74806a196b7f948dd2cea920a8ad0865c4eb9a87 Mon Sep 17 00:00:00 2001 From: ask Date: Sun, 22 Jan 2023 14:21:22 -0500 Subject: [PATCH 08/20] Fix PR suggestions --- .../src/array/run_end_encoded_array.rs | 69 ++++++++----------- .../builder/generic_byte_ree_array_builder.rs | 4 +- .../builder/primitive_ree_array_builder.rs | 10 +-- arrow-array/src/types.rs | 29 ++++---- 4 files changed, 51 insertions(+), 61 deletions(-) diff --git a/arrow-array/src/array/run_end_encoded_array.rs b/arrow-array/src/array/run_end_encoded_array.rs index fb95c6f2ab1e..8f869aad3416 100644 --- a/arrow-array/src/array/run_end_encoded_array.rs +++ b/arrow-array/src/array/run_end_encoded_array.rs @@ -23,18 +23,18 @@ use arrow_schema::{ArrowError, DataType, Field}; use crate::{ builder::StringREEArrayBuilder, make_array, - types::{ArrowRunEndIndexType, Int16Type, Int32Type, Int64Type}, + types::{Int16Type, Int32Type, Int64Type, RunEndIndexType}, Array, ArrayRef, PrimitiveArray, }; /// /// A run-end encoding (REE) is a variation of [run-length encoding (RLE)](https://en.wikipedia.org/wiki/Run-length_encoding). -/// This encoding is good for representing data containing same values repeated consecutively -/// called runs. Each run is represented by the value of data and the index at which the run ends. /// -/// [`RunEndEncodedArray`] has `run_ends` array and `values` array of same length. +/// This encoding is good for representing data containing same values repeated consecutively. +/// +/// [`RunEndEncodedArray`] contains `run_ends` array and `values` array of same length. /// The `run_ends` array stores the indexes at which the run ends. The `values` array -/// stores the value of the run. Below example illustrates how a logical array is represented in +/// stores the value of each run. 
Below example illustrates how a logical array is represented in /// [`RunEndEncodedArray`] /// /// @@ -58,13 +58,13 @@ use crate::{ /// Contents /// ``` -pub struct RunEndEncodedArray { +pub struct RunEndEncodedArray { data: ArrayData, run_ends: PrimitiveArray, values: ArrayRef, } -impl RunEndEncodedArray { +impl RunEndEncodedArray { /// Attempts to create RunEndEncodedArray using given run_ends (index where a run ends) /// and the values (value of the run). Returns an error if the given data is not compatible /// with RunEndEncoded specification. @@ -86,9 +86,11 @@ impl RunEndEncodedArray { let array_data = unsafe { builder.build_unchecked() }; // Safety: `validate_data` checks below - // 1. run_ends array does not have null values - // 2. run_ends array has non-zero and strictly increasing values. - // 3. The length of run_ends array and values array are the same. + // 1. The given array data has exactly two child arrays. + // 2. The first child array (run_ends) has valid data type. + // 3. run_ends array does not have null values + // 4. run_ends array has non-zero and strictly increasing values. + // 5. The length of run_ends array and values array are the same. array_data.validate_data()?; Ok(array_data.into()) @@ -104,7 +106,7 @@ impl RunEndEncodedArray { } } -impl From for RunEndEncodedArray { +impl From for RunEndEncodedArray { fn from(data: ArrayData) -> Self { match data.data_type() { DataType::RunEndEncoded(run_ends_data_type, _) => { @@ -121,14 +123,6 @@ impl From for RunEndEncodedArray { } } - // Safety: `validate_data` checks below - // 1. The given array data has exactly two child arrays. - // 2. The first child array (run_ends) has valid data type. - // 3. run_ends array does not have null values - // 4. run_ends array has non-zero and strictly increasing values. - // 5. The length of run_ends array and values array are the same. 
- data.validate_data().unwrap(); - let run_ends = PrimitiveArray::::from(data.child_data()[0].clone()); let values = make_array(data.child_data()[1].clone()); Self { @@ -139,13 +133,13 @@ impl From for RunEndEncodedArray { } } -impl From> for ArrayData { +impl From> for ArrayData { fn from(array: RunEndEncodedArray) -> Self { array.data } } -impl Array for RunEndEncodedArray { +impl Array for RunEndEncodedArray { fn as_any(&self) -> &dyn Any { self } @@ -159,7 +153,7 @@ impl Array for RunEndEncodedArray { } } -impl std::fmt::Debug for RunEndEncodedArray { +impl std::fmt::Debug for RunEndEncodedArray { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { writeln!( f, @@ -185,9 +179,7 @@ impl std::fmt::Debug for RunEndEncodedArray { /// format!("{:?}", array) /// ); /// ``` -impl<'a, T: ArrowRunEndIndexType> FromIterator> - for RunEndEncodedArray -{ +impl<'a, T: RunEndIndexType> FromIterator> for RunEndEncodedArray { fn from_iter>>(iter: I) -> Self { let it = iter.into_iter(); let (lower, _) = it.size_hint(); @@ -222,7 +214,7 @@ impl<'a, T: ArrowRunEndIndexType> FromIterator> /// format!("{:?}", array) /// ); /// ``` -impl<'a, T: ArrowRunEndIndexType> FromIterator<&'a str> for RunEndEncodedArray { +impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunEndEncodedArray { fn from_iter>(iter: I) -> Self { let it = iter.into_iter(); let (lower, _) = it.size_hint(); @@ -288,30 +280,23 @@ mod tests { use super::*; use crate::builder::PrimitiveREEArrayBuilder; - use crate::types::{Int16Type, Int32Type, UInt32Type}; + use crate::types::{Int16Type, Int32Type, Int8Type, UInt32Type}; use crate::{Array, Int16Array, Int32Array, StringArray}; - use arrow_buffer::{Buffer, ToByteSlice}; use arrow_schema::Field; #[test] fn test_ree_array() { // Construct a value array - let value_data = ArrayData::builder(DataType::Int8) - .len(8) - .add_buffer(Buffer::from( - &[10_i8, 11, 12, 13, 14, 15, 16, 17].to_byte_slice(), - )) - .build() - .unwrap(); + let value_data = 
PrimitiveArray::::from_iter_values([ + 10_i8, 11, 12, 13, 14, 15, 16, 17, + ]) + .into_data(); // Construct a run_ends array: - let run_ends_data = ArrayData::builder(DataType::Int16) - .len(8) - .add_buffer(Buffer::from( - &[4_i16, 6, 7, 9, 13, 18, 20, 22].to_byte_slice(), - )) - .build() - .unwrap(); + let run_ends_data = PrimitiveArray::::from_iter_values([ + 4_i16, 6, 7, 9, 13, 18, 20, 22, + ]) + .into_data(); // Construct a run ends encoded array from the above two let run_ends_type = Field::new("run_ends", DataType::Int16, false); diff --git a/arrow-array/src/builder/generic_byte_ree_array_builder.rs b/arrow-array/src/builder/generic_byte_ree_array_builder.rs index 39116e0750a4..4adcad243ee1 100644 --- a/arrow-array/src/builder/generic_byte_ree_array_builder.rs +++ b/arrow-array/src/builder/generic_byte_ree_array_builder.rs @@ -17,7 +17,7 @@ use crate::{ types::{ - ArrowRunEndIndexType, BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, + BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, RunEndIndexType, Utf8Type, }, ArrowPrimitiveType, RunEndEncodedArray, @@ -120,7 +120,7 @@ where impl GenericByteREEArrayBuilder where - R: ArrowRunEndIndexType, + R: RunEndIndexType, V: ByteArrayType, { /// Appends optional value to the logical array encoded by the RunEndEncodedArray. diff --git a/arrow-array/src/builder/primitive_ree_array_builder.rs b/arrow-array/src/builder/primitive_ree_array_builder.rs index 6001d4be274a..8660b99f97e9 100644 --- a/arrow-array/src/builder/primitive_ree_array_builder.rs +++ b/arrow-array/src/builder/primitive_ree_array_builder.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::{types::ArrowRunEndIndexType, ArrowPrimitiveType, RunEndEncodedArray}; +use crate::{types::RunEndIndexType, ArrowPrimitiveType, RunEndEncodedArray}; use super::PrimitiveBuilder; @@ -61,7 +61,7 @@ use arrow_schema::ArrowError; #[derive(Debug)] pub struct PrimitiveREEArrayBuilder where - R: ArrowRunEndIndexType, + R: RunEndIndexType, V: ArrowPrimitiveType, { run_ends_builder: PrimitiveBuilder, @@ -72,7 +72,7 @@ where impl Default for PrimitiveREEArrayBuilder where - R: ArrowRunEndIndexType, + R: RunEndIndexType, V: ArrowPrimitiveType, { fn default() -> Self { @@ -82,7 +82,7 @@ where impl PrimitiveREEArrayBuilder where - R: ArrowRunEndIndexType, + R: RunEndIndexType, V: ArrowPrimitiveType, { /// Creates a new `PrimitiveREEArrayBuilder` @@ -110,7 +110,7 @@ where impl PrimitiveREEArrayBuilder where - R: ArrowRunEndIndexType, + R: RunEndIndexType, V: ArrowPrimitiveType, { /// Appends optional value to the logical array encoded by the RunEndEncodedArray. diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index f8dcfaaf35e2..e9c1b8305346 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -32,10 +32,6 @@ use half::f16; use std::marker::PhantomData; use std::ops::{Add, Sub}; -mod private { - pub trait Sealed {} -} - // BooleanType is special: its bit-width is not the size of the primitive type, and its `index` // operation assumes bit-packing. /// A boolean datatype @@ -244,21 +240,30 @@ impl ArrowDictionaryKeyType for UInt32Type {} impl ArrowDictionaryKeyType for UInt64Type {} +mod run { + use super::*; + + pub trait RunEndTypeSealed {} + + impl RunEndTypeSealed for Int16Type {} + + impl RunEndTypeSealed for Int32Type {} + + impl RunEndTypeSealed for Int64Type {} +} + /// A subtype of primitive type that is used as run-ends index /// in RunEndEncodedArray. /// See /// -/// # Sealed: The implementation of this trait is sealed to avoid accidental misuse. 
-pub trait ArrowRunEndIndexType: ArrowPrimitiveType + private::Sealed {} +/// Note: The implementation of this trait is sealed to avoid accidental misuse. +pub trait RunEndIndexType: ArrowPrimitiveType + run::RunEndTypeSealed {} -impl private::Sealed for Int16Type {} -impl ArrowRunEndIndexType for Int16Type {} +impl RunEndIndexType for Int16Type {} -impl private::Sealed for Int32Type {} -impl ArrowRunEndIndexType for Int32Type {} +impl RunEndIndexType for Int32Type {} -impl private::Sealed for Int64Type {} -impl ArrowRunEndIndexType for Int64Type {} +impl RunEndIndexType for Int64Type {} /// A subtype of primitive type that represents temporal values. pub trait ArrowTemporalType: ArrowPrimitiveType {} From 353d72d2bb3d55fd18878db96d760df6451eeffb Mon Sep 17 00:00:00 2001 From: ask Date: Mon, 23 Jan 2023 06:30:09 -0500 Subject: [PATCH 09/20] few more PR coments --- arrow-array/src/array/mod.rs | 6 +- .../src/array/run_end_encoded_array.rs | 170 ++++++------ ...builder.rs => generic_byte_run_builder.rs} | 257 ++++++++++++------ .../src/builder/generic_bytes_builder.rs | 9 - arrow-array/src/builder/mod.rs | 8 +- ...ay_builder.rs => primitive_run_builder.rs} | 142 ++++++++-- arrow-array/src/types.rs | 2 +- arrow-data/src/data.rs | 4 +- arrow-schema/src/datatype.rs | 4 +- 9 files changed, 384 insertions(+), 218 deletions(-) rename arrow-array/src/builder/{generic_byte_ree_array_builder.rs => generic_byte_run_builder.rs} (56%) rename arrow-array/src/builder/{primitive_ree_array_builder.rs => primitive_run_builder.rs} (57%) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index bda5bbdd97b1..df74fcf128b0 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -585,13 +585,13 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::RunEndEncoded(ref run_ends_type, _) => { match run_ends_type.data_type() { DataType::Int16 => { - Arc::new(RunEndEncodedArray::::from(data)) as ArrayRef + Arc::new(RunArray::::from(data)) as 
ArrayRef } DataType::Int32 => { - Arc::new(RunEndEncodedArray::::from(data)) as ArrayRef + Arc::new(RunArray::::from(data)) as ArrayRef } DataType::Int64 => { - Arc::new(RunEndEncodedArray::::from(data)) as ArrayRef + Arc::new(RunArray::::from(data)) as ArrayRef } dt => panic!("Unexpected data type for run_ends array {:?}", dt), } diff --git a/arrow-array/src/array/run_end_encoded_array.rs b/arrow-array/src/array/run_end_encoded_array.rs index 8f869aad3416..9c7811348771 100644 --- a/arrow-array/src/array/run_end_encoded_array.rs +++ b/arrow-array/src/array/run_end_encoded_array.rs @@ -21,7 +21,7 @@ use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; use crate::{ - builder::StringREEArrayBuilder, + builder::StringRunBuilder, make_array, types::{Int16Type, Int32Type, Int64Type, RunEndIndexType}, Array, ArrayRef, PrimitiveArray, @@ -32,10 +32,10 @@ use crate::{ /// /// This encoding is good for representing data containing same values repeated consecutively. /// -/// [`RunEndEncodedArray`] contains `run_ends` array and `values` array of same length. +/// [`RunArray`] contains `run_ends` array and `values` array of same length. /// The `run_ends` array stores the indexes at which the run ends. The `values` array /// stores the value of each run. 
Below example illustrates how a logical array is represented in -/// [`RunEndEncodedArray`] +/// [`RunArray`] /// /// /// ```text @@ -51,21 +51,21 @@ use crate::{ /// ├─────────────────┤ /// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┘ │ B │ /// ├─────────────────┤ -/// RunEndEncodedArray │ B │ run length of 'B' = run_ends[2] - run_ends[1] = 3 +/// RunArray │ B │ run length of 'B' = run_ends[2] - run_ends[1] = 3 /// length = 3 └─────────────────┘ /// /// Logical array /// Contents /// ``` -pub struct RunEndEncodedArray { +pub struct RunArray { data: ArrayData, run_ends: PrimitiveArray, values: ArrayRef, } -impl RunEndEncodedArray { - /// Attempts to create RunEndEncodedArray using given run_ends (index where a run ends) +impl RunArray { + /// Attempts to create RunArray using given run_ends (index where a run ends) /// and the values (value of the run). Returns an error if the given data is not compatible /// with RunEndEncoded specification. pub fn try_new( @@ -106,20 +106,13 @@ impl RunEndEncodedArray { } } -impl From for RunEndEncodedArray { +impl From for RunArray { + // The method assumes the caller already validated the data using `ArrayData::validate_data()` fn from(data: ArrayData) -> Self { match data.data_type() { - DataType::RunEndEncoded(run_ends_data_type, _) => { - assert_eq!( - &R::DATA_TYPE, - run_ends_data_type.data_type(), - "Data type mismatch for run_ends array, expected {} got {}", - R::DATA_TYPE, - run_ends_data_type.data_type() - ); - } + DataType::RunEndEncoded(_, _) => {} _ => { - panic!("Invalid data type for RunEndEncodedArray. The data type should be DataType::RunEndEncoded"); + panic!("Invalid data type for RunArray. 
The data type should be DataType::RunEndEncoded"); } } @@ -133,13 +126,13 @@ impl From for RunEndEncodedArray { } } -impl From> for ArrayData { - fn from(array: RunEndEncodedArray) -> Self { +impl From> for ArrayData { + fn from(array: RunArray) -> Self { array.data } } -impl Array for RunEndEncodedArray { +impl Array for RunArray { fn as_any(&self) -> &dyn Any { self } @@ -153,37 +146,37 @@ impl Array for RunEndEncodedArray { } } -impl std::fmt::Debug for RunEndEncodedArray { +impl std::fmt::Debug for RunArray { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { writeln!( f, - "RunEndEncodedArray {{run_ends: {:?}, values: {:?}}}", + "RunArray {{run_ends: {:?}, values: {:?}}}", self.run_ends, self.values ) } } -/// Constructs a `RunEndEncodedArray` from an iterator of optional strings. +/// Constructs a `RunArray` from an iterator of optional strings. /// /// # Example: /// ``` -/// use arrow_array::{RunEndEncodedArray, PrimitiveArray, StringArray, types::Int16Type}; +/// use arrow_array::{RunArray, PrimitiveArray, StringArray, types::Int16Type}; /// /// let test = vec!["a", "a", "b", "c", "c"]; -/// let array: RunEndEncodedArray = test +/// let array: RunArray = test /// .iter() /// .map(|&x| if x == "b" { None } else { Some(x) }) /// .collect(); /// assert_eq!( -/// "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 5,\n], values: StringArray\n[\n \"a\",\n null,\n \"c\",\n]}\n", +/// "RunArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 5,\n], values: StringArray\n[\n \"a\",\n null,\n \"c\",\n]}\n", /// format!("{:?}", array) /// ); /// ``` -impl<'a, T: RunEndIndexType> FromIterator> for RunEndEncodedArray { +impl<'a, T: RunEndIndexType> FromIterator> for RunArray { fn from_iter>>(iter: I) -> Self { let it = iter.into_iter(); let (lower, _) = it.size_hint(); - let mut builder = StringREEArrayBuilder::with_capacity(lower, 256); + let mut builder = StringRunBuilder::with_capacity(lower, 256); it.for_each(|i| { if let Some(i) = i { builder @@ 
-200,25 +193,25 @@ impl<'a, T: RunEndIndexType> FromIterator> for RunEndEncodedArra } } -/// Constructs a `RunEndEncodedArray` from an iterator of strings. +/// Constructs a `RunArray` from an iterator of strings. /// /// # Example: /// /// ``` -/// use arrow_array::{RunEndEncodedArray, PrimitiveArray, StringArray, types::Int16Type}; +/// use arrow_array::{RunArray, PrimitiveArray, StringArray, types::Int16Type}; /// /// let test = vec!["a", "a", "b", "c"]; -/// let array: RunEndEncodedArray = test.into_iter().collect(); +/// let array: RunArray = test.into_iter().collect(); /// assert_eq!( -/// "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", +/// "RunArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", /// format!("{:?}", array) /// ); /// ``` -impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunEndEncodedArray { +impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunArray { fn from_iter>(iter: I) -> Self { let it = iter.into_iter(); let (lower, _) = it.size_hint(); - let mut builder = StringREEArrayBuilder::with_capacity(lower, 256); + let mut builder = StringRunBuilder::with_capacity(lower, 256); it.for_each(|i| { builder .append_value(i) @@ -230,62 +223,62 @@ impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunEndEncodedArray { } /// -/// A [`RunEndEncodedArray`] array where run ends are stored using `i16` data type. +/// A [`RunArray`] array where run ends are stored using `i16` data type. 
/// /// # Example: Using `collect` /// ``` -/// # use arrow_array::{Array, Int16RunEndEncodedArray, Int16Array, StringArray}; +/// # use arrow_array::{Array, Int16RunArray, Int16Array, StringArray}; /// # use std::sync::Arc; /// -/// let array: Int16RunEndEncodedArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); +/// let array: Int16RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); /// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); /// assert_eq!(array.run_ends(), &Int16Array::from(vec![2, 3, 5])); /// assert_eq!(array.values(), &values); /// ``` -pub type Int16RunEndEncodedArray = RunEndEncodedArray; +pub type Int16RunArray = RunArray; /// -/// A [`RunEndEncodedArray`] array where run ends are stored using `i32` data type. +/// A [`RunArray`] array where run ends are stored using `i32` data type. /// /// # Example: Using `collect` /// ``` -/// # use arrow_array::{Array, Int32RunEndEncodedArray, Int32Array, StringArray}; +/// # use arrow_array::{Array, Int32RunArray, Int32Array, StringArray}; /// # use std::sync::Arc; /// -/// let array: Int32RunEndEncodedArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); +/// let array: Int32RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); /// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); /// assert_eq!(array.run_ends(), &Int32Array::from(vec![2, 3, 5])); /// assert_eq!(array.values(), &values); /// ``` -pub type Int32RunEndEncodedArray = RunEndEncodedArray; +pub type Int32RunArray = RunArray; /// -/// A [`RunEndEncodedArray`] array where run ends are stored using `i64` data type. +/// A [`RunArray`] array where run ends are stored using `i64` data type. 
/// /// # Example: Using `collect` /// ``` -/// # use arrow_array::{Array, Int64RunEndEncodedArray, Int64Array, StringArray}; +/// # use arrow_array::{Array, Int64RunArray, Int64Array, StringArray}; /// # use std::sync::Arc; /// -/// let array: Int64RunEndEncodedArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); +/// let array: Int64RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); /// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); /// assert_eq!(array.run_ends(), &Int64Array::from(vec![2, 3, 5])); /// assert_eq!(array.values(), &values); /// ``` -pub type Int64RunEndEncodedArray = RunEndEncodedArray; +pub type Int64RunArray = RunArray; #[cfg(test)] mod tests { use std::sync::Arc; use super::*; - use crate::builder::PrimitiveREEArrayBuilder; + use crate::builder::PrimitiveRunBuilder; use crate::types::{Int16Type, Int32Type, Int8Type, UInt32Type}; use crate::{Array, Int16Array, Int32Array, StringArray}; use arrow_schema::Field; #[test] - fn test_ree_array() { + fn test_run_array() { // Construct a value array let value_data = PrimitiveArray::::from_iter_values([ 10_i8, 11, 12, 13, 14, 15, 16, 17, @@ -308,7 +301,7 @@ mod tests { .add_child_data(value_data.clone()) .build() .unwrap(); - let ree_array = Int16RunEndEncodedArray::from(dict_data); + let ree_array = Int16RunArray::from(dict_data); let values = ree_array.values(); assert_eq!(&value_data, values.data()); @@ -320,53 +313,51 @@ mod tests { } #[test] - fn test_ree_array_fmt_debug() { - let mut builder = - PrimitiveREEArrayBuilder::::with_capacity(3); + fn test_run_array_fmt_debug() { + let mut builder = PrimitiveRunBuilder::::with_capacity(3); builder.append_value(12345678).unwrap(); builder.append_null().unwrap(); builder.append_value(22345678).unwrap(); let array = builder.finish(); assert_eq!( - "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 1,\n 2,\n 3,\n], values: PrimitiveArray\n[\n 12345678,\n null,\n 22345678,\n]}\n", + "RunArray {run_ends: 
PrimitiveArray\n[\n 1,\n 2,\n 3,\n], values: PrimitiveArray\n[\n 12345678,\n null,\n 22345678,\n]}\n", format!("{:?}", array) ); - let mut builder = - PrimitiveREEArrayBuilder::::with_capacity(20); + let mut builder = PrimitiveRunBuilder::::with_capacity(20); for _ in 0..20 { builder.append_value(1).unwrap(); } let array = builder.finish(); assert_eq!( - "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 20,\n], values: PrimitiveArray\n[\n 1,\n]}\n", + "RunArray {run_ends: PrimitiveArray\n[\n 20,\n], values: PrimitiveArray\n[\n 1,\n]}\n", format!("{:?}", array) ); } #[test] - fn test_ree_array_from_iter() { + fn test_run_array_from_iter() { let test = vec!["a", "a", "b", "c"]; - let array: RunEndEncodedArray = test + let array: RunArray = test .iter() .map(|&x| if x == "b" { None } else { Some(x) }) .collect(); assert_eq!( - "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n null,\n \"c\",\n]}\n", + "RunArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n null,\n \"c\",\n]}\n", format!("{:?}", array) ); - let array: RunEndEncodedArray = test.into_iter().collect(); + let array: RunArray = test.into_iter().collect(); assert_eq!( - "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", + "RunArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", format!("{:?}", array) ); } #[test] - fn test_ree_array_run_ends_as_primitive_array() { + fn test_run_array_run_ends_as_primitive_array() { let test = vec!["a", "b", "c", "a"]; - let array: RunEndEncodedArray = test.into_iter().collect(); + let array: RunArray = test.into_iter().collect(); let run_ends = array.run_ends(); assert_eq!(&DataType::Int16, run_ends.data_type()); @@ -375,9 +366,9 @@ mod tests { } #[test] - fn test_ree_array_as_primitive_array_with_null() { + fn test_run_array_as_primitive_array_with_null() 
{ let test = vec![Some("a"), None, Some("b"), None, None, Some("a")]; - let array: RunEndEncodedArray = test.into_iter().collect(); + let array: RunArray = test.into_iter().collect(); let run_ends = array.run_ends(); assert_eq!(&DataType::Int32, run_ends.data_type()); @@ -391,9 +382,9 @@ mod tests { } #[test] - fn test_ree_array_all_nulls() { + fn test_run_array_all_nulls() { let test = vec![None, None, None]; - let array: RunEndEncodedArray = test.into_iter().collect(); + let array: RunArray = test.into_iter().collect(); let run_ends = array.run_ends(); assert_eq!(1, run_ends.len()); @@ -404,14 +395,14 @@ mod tests { } #[test] - fn test_ree_array_try_new() { + fn test_run_array_try_new() { let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")] .into_iter() .collect(); let run_ends: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); - let array = RunEndEncodedArray::::try_new(&run_ends, &values).unwrap(); + let array = RunArray::::try_new(&run_ends, &values).unwrap(); assert_eq!(array.run_ends().data_type(), &DataType::Int32); assert_eq!(array.values().data_type(), &DataType::Utf8); @@ -419,28 +410,35 @@ mod tests { assert_eq!(array.values().null_count(), 1); assert_eq!( - "RunEndEncodedArray {run_ends: PrimitiveArray\n[\n 1,\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"foo\",\n \"bar\",\n null,\n \"baz\",\n]}\n", + "RunArray {run_ends: PrimitiveArray\n[\n 1,\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"foo\",\n \"bar\",\n null,\n \"baz\",\n]}\n", format!("{:?}", array) ); } #[test] - fn test_ree_array_int16_type_definition() { - let array: Int16RunEndEncodedArray = - vec!["a", "a", "b", "c", "c"].into_iter().collect(); + fn test_run_array_int16_type_definition() { + let array: Int16RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); assert_eq!(array.run_ends(), &Int16Array::from(vec![2, 3, 5])); assert_eq!(array.values(), &values); } #[test] - 
fn test_ree_array_length_mismatch() { + fn test_run_array_empty_string() { + let array: Int16RunArray = vec!["a", "a", "", "", "c"].into_iter().collect(); + let values: Arc = Arc::new(StringArray::from(vec!["a", "", "c"])); + assert_eq!(array.run_ends(), &Int16Array::from(vec![2, 4, 5])); + assert_eq!(array.values(), &values); + } + + #[test] + fn test_run_array_length_mismatch() { let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")] .into_iter() .collect(); let run_ends: Int32Array = [Some(1), Some(2), Some(3)].into_iter().collect(); - let actual = RunEndEncodedArray::::try_new(&run_ends, &values); + let actual = RunArray::::try_new(&run_ends, &values); let expected = ArrowError::InvalidArgumentError("The run_ends array length should be the same as values array length. Run_ends array length is 3, values array length is 4".to_string()); assert_eq!( format!("{}", expected), @@ -449,13 +447,13 @@ mod tests { } #[test] - fn test_ree_array_run_ends_with_null() { + fn test_run_array_run_ends_with_null() { let values: StringArray = [Some("foo"), Some("bar"), Some("baz")] .into_iter() .collect(); let run_ends: Int32Array = [Some(1), None, Some(3)].into_iter().collect(); - let actual = RunEndEncodedArray::::try_new(&run_ends, &values); + let actual = RunArray::::try_new(&run_ends, &values); let expected = ArrowError::InvalidArgumentError("Found null values in run_ends array. 
The run_ends array should not have null values.".to_string()); assert_eq!( format!("{}", expected), @@ -464,13 +462,13 @@ mod tests { } #[test] - fn test_ree_array_run_ends_with_zeroes() { + fn test_run_array_run_ends_with_zeroes() { let values: StringArray = [Some("foo"), Some("bar"), Some("baz")] .into_iter() .collect(); let run_ends: Int32Array = [Some(0), Some(1), Some(3)].into_iter().collect(); - let actual = RunEndEncodedArray::::try_new(&run_ends, &values); + let actual = RunArray::::try_new(&run_ends, &values); let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly positive. Found value 0 at index 0 that does not match the criteria.".to_string()); assert_eq!( format!("{}", expected), @@ -479,13 +477,13 @@ mod tests { } #[test] - fn test_ree_array_run_ends_non_increasing() { + fn test_run_array_run_ends_non_increasing() { let values: StringArray = [Some("foo"), Some("bar"), Some("baz")] .into_iter() .collect(); let run_ends: Int32Array = [Some(1), Some(4), Some(4)].into_iter().collect(); - let actual = RunEndEncodedArray::::try_new(&run_ends, &values); + let actual = RunArray::::try_new(&run_ends, &values); let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly increasing. 
Found value 4 at index 2 with previous value 4 that does not match the criteria.".to_string()); assert_eq!( format!("{}", expected), @@ -495,10 +493,10 @@ mod tests { #[test] #[should_panic( - expected = "Data type mismatch for run_ends array, expected Int64 got Int32" + expected = "PrimitiveArray expected ArrayData with type Int64 got Int32" )] - fn test_ree_array_run_ends_data_type_mismatch() { - let a = RunEndEncodedArray::::from_iter(["32"]); - let _ = RunEndEncodedArray::::from(a.into_data()); + fn test_run_array_run_ends_data_type_mismatch() { + let a = RunArray::::from_iter(["32"]); + let _ = RunArray::::from(a.into_data()); } } diff --git a/arrow-array/src/builder/generic_byte_ree_array_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs similarity index 56% rename from arrow-array/src/builder/generic_byte_ree_array_builder.rs rename to arrow-array/src/builder/generic_byte_run_builder.rs index 4adcad243ee1..c1b6c69de026 100644 --- a/arrow-array/src/builder/generic_byte_ree_array_builder.rs +++ b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -15,32 +15,35 @@ // specific language governing permissions and limitations // under the License. +use crate::types::bytes::ByteArrayNativeType; +use std::{any::Any, sync::Arc}; + use crate::{ types::{ BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, RunEndIndexType, Utf8Type, }, - ArrowPrimitiveType, RunEndEncodedArray, + ArrayRef, ArrowPrimitiveType, RunArray, }; -use super::{GenericByteBuilder, PrimitiveBuilder}; +use super::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; use arrow_buffer::ArrowNativeType; use arrow_schema::ArrowError; -/// Array builder for [`RunEndEncodedArray`] for String and Binary types. +/// Array builder for [`RunArray`] for String and Binary types. 
/// /// # Example: /// /// ``` /// -/// # use arrow_array::builder::GenericByteREEArrayBuilder; +/// # use arrow_array::builder::GenericByteRunBuilder; /// # use arrow_array::{GenericByteArray, BinaryArray}; /// # use arrow_array::types::{BinaryType, Int16Type}; /// # use arrow_array::{Array, Int16Array}; /// /// let mut builder = -/// GenericByteREEArrayBuilder::::new(); +/// GenericByteRunBuilder::::new(); /// builder.append_value(b"abc").unwrap(); /// builder.append_value(b"abc").unwrap(); /// builder.append_null().unwrap(); @@ -65,18 +68,20 @@ use arrow_schema::ArrowError; /// assert_eq!(ava.value(2), b"def"); /// ``` #[derive(Debug)] -pub struct GenericByteREEArrayBuilder +pub struct GenericByteRunBuilder where R: ArrowPrimitiveType, V: ByteArrayType, { run_ends_builder: PrimitiveBuilder, values_builder: GenericByteBuilder, - current_value: Option>, + current_value: Vec, + has_current_value: bool, current_run_end_index: usize, + prev_run_end_index: usize, } -impl Default for GenericByteREEArrayBuilder +impl Default for GenericByteRunBuilder where R: ArrowPrimitiveType, V: ByteArrayType, @@ -86,22 +91,24 @@ where } } -impl GenericByteREEArrayBuilder +impl GenericByteRunBuilder where R: ArrowPrimitiveType, V: ByteArrayType, { - /// Creates a new `GenericByteREEArrayBuilder` + /// Creates a new `GenericByteRunBuilder` pub fn new() -> Self { Self { run_ends_builder: PrimitiveBuilder::new(), values_builder: GenericByteBuilder::::new(), - current_value: None, + current_value: Vec::new(), + has_current_value: false, current_run_end_index: 0, + prev_run_end_index: 0, } } - /// Creates a new `GenericByteREEArrayBuilder` with the provided capacity + /// Creates a new `GenericByteRunBuilder` with the provided capacity /// /// `capacity`: the expected number of run-end encoded values. 
/// `data_capacity`: the expected number of bytes of run end encoded values @@ -112,18 +119,66 @@ where capacity, data_capacity, ), - current_value: None, + current_value: Vec::new(), + has_current_value: false, current_run_end_index: 0, + prev_run_end_index: 0, } } } -impl GenericByteREEArrayBuilder +impl ArrayBuilder for GenericByteRunBuilder +where + R: RunEndIndexType, + V: ByteArrayType, +{ + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + let mut len = self.run_ends_builder.len(); + // If there is an ongoing run yet to be added, include it in the len + if self.prev_run_end_index != self.current_run_end_index { + len += 1; + } + len + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.current_run_end_index == 0 + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +impl GenericByteRunBuilder where R: RunEndIndexType, V: ByteArrayType, { - /// Appends optional value to the logical array encoded by the RunEndEncodedArray. + /// Appends optional value to the logical array encoded by the RunArray. pub fn append_option( &mut self, input_value: Option>, @@ -135,25 +190,20 @@ where Ok(()) } - /// Appends value to the logical array encoded by the RunEndEncodedArray. + /// Appends value to the logical array encoded by the RunArray. 
pub fn append_value( &mut self, input_value: impl AsRef, ) -> Result<(), ArrowError> { let value: &[u8] = input_value.as_ref().as_ref(); - match self.current_value.as_deref() { - None if self.current_run_end_index > 0 => { - self.append_run_end()?; - self.current_value = Some(value.to_owned()); - } - None if self.current_run_end_index == 0 => { - self.current_value = Some(value.to_owned()); - } - Some(current_value) if current_value != value => { - self.append_run_end()?; - self.current_value = Some(value.to_owned()); - } - _ => {} + if !self.has_current_value { + self.append_run_end()?; + self.current_value.extend_from_slice(value); + self.has_current_value = true; + } else if self.current_value.as_slice() != value { + self.append_run_end()?; + self.current_value.clear(); + self.current_value.extend_from_slice(value); } self.current_run_end_index = self .current_run_end_index @@ -162,11 +212,12 @@ where Ok(()) } - /// Appends null to the logical array encoded by the RunEndEncodedArray. + /// Appends null to the logical array encoded by the RunArray. pub fn append_null(&mut self) -> Result<(), ArrowError> { - if self.current_value.is_some() { + if self.has_current_value { self.append_run_end()?; - self.current_value = None; + self.current_value.clear(); + self.has_current_value = false; } self.current_run_end_index = self .current_run_end_index @@ -175,36 +226,51 @@ where Ok(()) } - /// Creates the RunEndEncodedArray and resets the builder. - /// Panics if RunEndEncodedArray cannot be built. - pub fn finish(&mut self) -> RunEndEncodedArray { + /// Creates the RunArray and resets the builder. + /// Panics if RunArray cannot be built. + pub fn finish(&mut self) -> RunArray { // write the last run end to the array. self.append_run_end().unwrap(); // reset the run end index to zero. 
- self.current_value = None; + self.current_value.clear(); + self.has_current_value = false; self.current_run_end_index = 0; + self.prev_run_end_index = 0; // build the run encoded array by adding run_ends and values array as its children. let run_ends_array = self.run_ends_builder.finish(); let values_array = self.values_builder.finish(); - RunEndEncodedArray::::try_new(&run_ends_array, &values_array).unwrap() + RunArray::::try_new(&run_ends_array, &values_array).unwrap() } - /// Creates the RunEndEncodedArray and without resetting the builder. - /// Panics if RunEndEncodedArray cannot be built. - pub fn finish_cloned(&mut self) -> RunEndEncodedArray { - // write the last run end to the array. - self.append_run_end().unwrap(); + /// Creates the RunArray and without resetting the builder. + /// Panics if RunArray cannot be built. + pub fn finish_cloned(&self) -> RunArray { + let mut run_ends_array = self.run_ends_builder.finish_cloned(); + let mut values_array = self.values_builder.finish_cloned(); + + // Add current run if one exists + if self.prev_run_end_index != self.current_run_end_index { + let mut run_end_builder = run_ends_array.into_builder().unwrap(); + let mut values_builder = values_array.into_builder().unwrap(); + self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder) + .unwrap(); + run_ends_array = run_end_builder.finish(); + values_array = values_builder.finish(); + } - // build the run encoded array by adding run_ends and values array as its children. - let run_ends_array = self.run_ends_builder.finish_cloned(); - let values_array = self.values_builder.finish_cloned(); - RunEndEncodedArray::::try_new(&run_ends_array, &values_array).unwrap() + RunArray::::try_new(&run_ends_array, &values_array).unwrap() } - // Appends the current run to the array + // Appends the current run to the array. fn append_run_end(&mut self) -> Result<(), ArrowError> { + // empty array or the function called without appending any value. 
+ if self.current_run_end_index == 0 + || self.prev_run_end_index == self.current_run_end_index + { + return Ok(()); + } let run_end_index = R::Native::from_usize(self.current_run_end_index) .ok_or_else(|| { ArrowError::ParseError(format!( @@ -214,25 +280,54 @@ where )) })?; self.run_ends_builder.append_value(run_end_index); - match self.current_value.as_deref() { - Some(value) => self.values_builder.append_slice(value), - None => self.values_builder.append_null(), + if self.has_current_value { + let slice = self.current_value.as_slice(); + let native = unsafe { V::Native::from_bytes_unchecked(slice) }; + self.values_builder.append_value(native); + } else { + self.values_builder.append_null(); + } + self.prev_run_end_index = self.current_run_end_index; + Ok(()) + } + + // Similar to `append_run_end` but on custom builders. + fn append_run_end_with_builders( + &self, + run_ends_builder: &mut PrimitiveBuilder, + values_builder: &mut GenericByteBuilder, + ) -> Result<(), ArrowError> { + let run_end_index = R::Native::from_usize(self.current_run_end_index) + .ok_or_else(|| { + ArrowError::ParseError(format!( + "Cannot convert the value {} from `usize` to native form of arrow datatype {}", + self.current_run_end_index, + R::DATA_TYPE + )) + })?; + run_ends_builder.append_value(run_end_index); + if self.has_current_value { + let slice = self.current_value.as_slice(); + let native = unsafe { V::Native::from_bytes_unchecked(slice) }; + values_builder.append_value(native); + } else { + values_builder.append_null(); } Ok(()) } } -/// Array builder for [`RunEndEncodedArray`] that encodes strings ([`Utf8Type`]). +/// Array builder for [`RunArray`] that encodes strings ([`Utf8Type`]). /// /// ``` /// // Create a run-end encoded array with run-end indexes data type as `i16`. /// // The encoded values are Strings. 
/// -/// # use arrow_array::builder::StringREEArrayBuilder; +/// # use arrow_array::builder::StringRunBuilder; /// # use arrow_array::{Int16Array, StringArray}; /// # use arrow_array::types::Int16Type; /// -/// let mut builder = StringREEArrayBuilder::::new(); +/// let mut builder = StringRunBuilder::::new(); /// /// // The builder builds the dictionary value by value /// builder.append_value("abc").unwrap(); @@ -257,22 +352,22 @@ where /// assert_eq!(ava.value(3), "abc"); /// /// ``` -pub type StringREEArrayBuilder = GenericByteREEArrayBuilder; +pub type StringRunBuilder = GenericByteRunBuilder; -/// Array builder for [`RunEndEncodedArray`] that encodes large strings ([`LargeUtf8Type`]). See [`StringREEArrayBuilder`] for an example. -pub type LargeStringREEArrayBuilder = GenericByteREEArrayBuilder; +/// Array builder for [`RunArray`] that encodes large strings ([`LargeUtf8Type`]). See [`StringRunBuilder`] for an example. +pub type LargeStringRunBuilder = GenericByteRunBuilder; -/// Array builder for [`RunEndEncodedArray`] that encodes binary values([`BinaryType`]). +/// Array builder for [`RunArray`] that encodes binary values([`BinaryType`]). /// /// ``` /// // Create a run-end encoded array with run-end indexes data type as `i16`. /// // The encoded data is binary values. /// -/// # use arrow_array::builder::BinaryREEArrayBuilder; +/// # use arrow_array::builder::BinaryRunBuilder; /// # use arrow_array::{BinaryArray, Int16Array}; /// # use arrow_array::types::Int16Type; /// -/// let mut builder = BinaryREEArrayBuilder::::new(); +/// let mut builder = BinaryRunBuilder::::new(); /// /// // The builder builds the dictionary value by value /// builder.append_value(b"abc").unwrap(); @@ -297,11 +392,11 @@ pub type LargeStringREEArrayBuilder = GenericByteREEArrayBuilder = GenericByteREEArrayBuilder; +pub type BinaryRunBuilder = GenericByteRunBuilder; -/// Array builder for [`RunEndEncodedArray`] that encodes large binary values([`LargeBinaryType`]). 
-/// See documentation of [`BinaryREEArrayBuilder`] for an example. -pub type LargeBinaryREEArrayBuilder = GenericByteREEArrayBuilder; +/// Array builder for [`RunArray`] that encodes large binary values([`LargeBinaryType`]). +/// See documentation of [`BinaryRunBuilder`] for an example. +pub type LargeBinaryRunBuilder = GenericByteRunBuilder; #[cfg(test)] mod tests { @@ -311,14 +406,15 @@ mod tests { use crate::types::Int16Type; use crate::GenericByteArray; use crate::Int16Array; + use crate::Int16RunArray; - fn test_bytes_ree_array_buider(values: Vec<&T::Native>) + fn test_bytes_run_buider(values: Vec<&T::Native>) where T: ByteArrayType, ::Native: PartialEq, ::Native: AsRef<::Native>, { - let mut builder = GenericByteREEArrayBuilder::::new(); + let mut builder = GenericByteRunBuilder::::new(); builder.append_value(values[0]).unwrap(); builder.append_value(values[0]).unwrap(); builder.append_value(values[0]).unwrap(); @@ -344,29 +440,29 @@ mod tests { } #[test] - fn test_string_ree_array_buider() { - test_bytes_ree_array_buider::(vec!["abc", "def"]); + fn test_string_run_buider() { + test_bytes_run_buider::(vec!["abc", "def"]); } #[test] - fn test_binary_ree_array_buider() { - test_bytes_ree_array_buider::(vec![b"abc", b"def"]); + fn test_binary_run_buider() { + test_bytes_run_buider::(vec![b"abc", b"def"]); } - fn test_bytes_ree_array_buider_finish_cloned(values: Vec<&T::Native>) + fn test_bytes_run_buider_finish_cloned(values: Vec<&T::Native>) where T: ByteArrayType, ::Native: PartialEq, ::Native: AsRef<::Native>, { - let mut builder = GenericByteREEArrayBuilder::::new(); + let mut builder = GenericByteRunBuilder::::new(); builder.append_value(values[0]).unwrap(); builder.append_null().unwrap(); builder.append_value(values[1]).unwrap(); builder.append_value(values[1]).unwrap(); builder.append_value(values[0]).unwrap(); - let mut array = builder.finish_cloned(); + let mut array: Int16RunArray = builder.finish_cloned(); assert_eq!( array.run_ends(), @@ -391,9 
+487,7 @@ mod tests { assert_eq!( array.run_ends(), - &Int16Array::from( - vec![Some(1), Some(2), Some(4), Some(5), Some(7), Some(8),] - ) + &Int16Array::from(vec![Some(1), Some(2), Some(4), Some(7), Some(8),]) ); // Values are polymorphic and so require a downcast. @@ -405,19 +499,16 @@ mod tests { assert!(ava2.is_null(1)); assert_eq!(ava2.value(2), values[1]); assert_eq!(ava2.value(3), values[0]); - assert_eq!(ava2.value(4), values[0]); - assert_eq!(ava2.value(5), values[1]); + assert_eq!(ava2.value(4), values[1]); } #[test] - fn test_string_ree_array_buider_finish_cloned() { - test_bytes_ree_array_buider_finish_cloned::(vec!["abc", "def", "ghi"]); + fn test_string_run_buider_finish_cloned() { + test_bytes_run_buider_finish_cloned::(vec!["abc", "def", "ghi"]); } #[test] - fn test_binary_ree_array_buider_finish_cloned() { - test_bytes_ree_array_buider_finish_cloned::(vec![ - b"abc", b"def", b"ghi", - ]); + fn test_binary_run_buider_finish_cloned() { + test_bytes_run_buider_finish_cloned::(vec![b"abc", b"def", b"ghi"]); } } diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index d4718cf1e443..73600d9e0a38 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -99,15 +99,6 @@ impl GenericByteBuilder { self.offsets_builder.append(self.next_offset()); } - /// Appends a byte array slice into the builder. - #[inline] - pub fn append_slice(&mut self, value: &[u8]) { - self.value_builder.append_slice(value); - self.null_buffer_builder.append(true); - self.offsets_builder - .append(T::Offset::from_usize(self.value_builder.len()).unwrap()); - } - /// Append an `Option` value into the builder. 
#[inline] pub fn append_option(&mut self, value: Option>) { diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index f08676bd0bdd..fc2454635d99 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -39,14 +39,14 @@ mod primitive_builder; pub use primitive_builder::*; mod primitive_dictionary_builder; pub use primitive_dictionary_builder::*; -mod primitive_ree_array_builder; -pub use primitive_ree_array_builder::*; +mod primitive_run_builder; +pub use primitive_run_builder::*; mod struct_builder; pub use struct_builder::*; mod generic_bytes_dictionary_builder; pub use generic_bytes_dictionary_builder::*; -mod generic_byte_ree_array_builder; -pub use generic_byte_ree_array_builder::*; +mod generic_byte_run_builder; +pub use generic_byte_run_builder::*; mod union_builder; pub use union_builder::*; diff --git a/arrow-array/src/builder/primitive_ree_array_builder.rs b/arrow-array/src/builder/primitive_run_builder.rs similarity index 57% rename from arrow-array/src/builder/primitive_ree_array_builder.rs rename to arrow-array/src/builder/primitive_run_builder.rs index 8660b99f97e9..a60f706f5e6f 100644 --- a/arrow-array/src/builder/primitive_ree_array_builder.rs +++ b/arrow-array/src/builder/primitive_run_builder.rs @@ -15,25 +15,27 @@ // specific language governing permissions and limitations // under the License. -use crate::{types::RunEndIndexType, ArrowPrimitiveType, RunEndEncodedArray}; +use std::{any::Any, sync::Arc}; -use super::PrimitiveBuilder; +use crate::{types::RunEndIndexType, ArrayRef, ArrowPrimitiveType, RunArray}; + +use super::{ArrayBuilder, PrimitiveBuilder}; use arrow_buffer::ArrowNativeType; use arrow_schema::ArrowError; -/// Array builder for [`RunEndEncodedArray`] that encodes primitive values. +/// Array builder for [`RunArray`] that encodes primitive values. 
/// /// # Example: /// /// ``` /// -/// # use arrow_array::builder::PrimitiveREEArrayBuilder; +/// # use arrow_array::builder::PrimitiveRunBuilder; /// # use arrow_array::types::{UInt32Type, Int16Type}; /// # use arrow_array::{Array, UInt32Array, Int16Array}; /// /// let mut builder = -/// PrimitiveREEArrayBuilder::::new(); +/// PrimitiveRunBuilder::::new(); /// builder.append_value(1234).unwrap(); /// builder.append_value(1234).unwrap(); /// builder.append_value(1234).unwrap(); @@ -59,7 +61,7 @@ use arrow_schema::ArrowError; /// assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)])); /// ``` #[derive(Debug)] -pub struct PrimitiveREEArrayBuilder +pub struct PrimitiveRunBuilder where R: RunEndIndexType, V: ArrowPrimitiveType, @@ -68,9 +70,10 @@ where values_builder: PrimitiveBuilder, current_value: Option, current_run_end_index: usize, + prev_run_end_index: usize, } -impl Default for PrimitiveREEArrayBuilder +impl Default for PrimitiveRunBuilder where R: RunEndIndexType, V: ArrowPrimitiveType, @@ -80,22 +83,23 @@ where } } -impl PrimitiveREEArrayBuilder +impl PrimitiveRunBuilder where R: RunEndIndexType, V: ArrowPrimitiveType, { - /// Creates a new `PrimitiveREEArrayBuilder` + /// Creates a new `PrimitiveRunBuilder` pub fn new() -> Self { Self { run_ends_builder: PrimitiveBuilder::new(), values_builder: PrimitiveBuilder::new(), current_value: None, current_run_end_index: 0, + prev_run_end_index: 0, } } - /// Creates a new `PrimitiveREEArrayBuilder` with the provided capacity + /// Creates a new `PrimitiveRunBuilder` with the provided capacity /// /// `capacity`: the expected number of run-end encoded values. 
pub fn with_capacity(capacity: usize) -> Self { @@ -104,16 +108,63 @@ where values_builder: PrimitiveBuilder::with_capacity(capacity), current_value: None, current_run_end_index: 0, + prev_run_end_index: 0, } } } -impl PrimitiveREEArrayBuilder +impl ArrayBuilder for PrimitiveRunBuilder where R: RunEndIndexType, V: ArrowPrimitiveType, { - /// Appends optional value to the logical array encoded by the RunEndEncodedArray. + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + let mut len = self.run_ends_builder.len(); + // If there is an ongoing run yet to be added, include it in the len + if self.prev_run_end_index != self.current_run_end_index { + len += 1; + } + len + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.current_run_end_index == 0 + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +impl PrimitiveRunBuilder +where + R: RunEndIndexType, + V: ArrowPrimitiveType, +{ + /// Appends optional value to the logical array encoded by the RunArray. pub fn append_option(&mut self, value: Option) -> Result<(), ArrowError> { if self.current_run_end_index == 0 { self.current_run_end_index = 1; @@ -143,9 +194,9 @@ where self.append_option(None) } - /// Creates the RunEndEncodedArray and resets the builder. - /// Panics if RunEndEncodedArray cannot be built. - pub fn finish(&mut self) -> RunEndEncodedArray { + /// Creates the RunArray and resets the builder. 
+ /// Panics if RunArray cannot be built. + pub fn finish(&mut self) -> RunArray { // write the last run end to the array. self.append_run_end().unwrap(); @@ -156,23 +207,38 @@ where // build the run encoded array by adding run_ends and values array as its children. let run_ends_array = self.run_ends_builder.finish(); let values_array = self.values_builder.finish(); - RunEndEncodedArray::::try_new(&run_ends_array, &values_array).unwrap() + RunArray::::try_new(&run_ends_array, &values_array).unwrap() } - /// Creates the RunEndEncodedArray and without resetting the builder. - /// Panics if RunEndEncodedArray cannot be built. - pub fn finish_cloned(&mut self) -> RunEndEncodedArray { - // write the last run end to the array. - self.append_run_end().unwrap(); + /// Creates the RunArray and without resetting the builder. + /// Panics if RunArray cannot be built. + pub fn finish_cloned(&self) -> RunArray { + let mut run_ends_array = self.run_ends_builder.finish_cloned(); + let mut values_array = self.values_builder.finish_cloned(); - // build the run encoded array by adding run_ends and values array as its children. - let run_ends_array = self.run_ends_builder.finish_cloned(); - let values_array = self.values_builder.finish_cloned(); - RunEndEncodedArray::::try_new(&run_ends_array, &values_array).unwrap() + // Add current run if one exists + if self.prev_run_end_index != self.current_run_end_index { + let mut run_end_builder = run_ends_array.into_builder().unwrap(); + let mut values_builder = values_array.into_builder().unwrap(); + self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder) + .unwrap(); + run_ends_array = run_end_builder.finish(); + values_array = values_builder.finish(); + } + + RunArray::try_new(&run_ends_array, &values_array).unwrap() } - // Appends the current run to the array + // Appends the current run to the array. There are scenarios where this function can be called + // multiple times before getting a new value. e.g. 
appending different value immediately following + // finish_cloned. fn append_run_end(&mut self) -> Result<(), ArrowError> { + // empty array or the function called without appending any value. + if self.current_run_end_index == 0 + || self.prev_run_end_index == self.current_run_end_index + { + return Ok(()); + } let run_end_index = R::Native::from_usize(self.current_run_end_index) .ok_or_else(|| { ArrowError::ParseError(format!( @@ -183,18 +249,38 @@ where })?; self.run_ends_builder.append_value(run_end_index); self.values_builder.append_option(self.current_value); + self.prev_run_end_index = self.current_run_end_index; + Ok(()) + } + + // Similar to `append_run_end` but on custom builders. + fn append_run_end_with_builders( + &self, + run_ends_builder: &mut PrimitiveBuilder, + values_builder: &mut PrimitiveBuilder, + ) -> Result<(), ArrowError> { + let run_end_index = R::Native::from_usize(self.current_run_end_index) + .ok_or_else(|| { + ArrowError::ParseError(format!( + "Cannot convert the value {} from `usize` to native form of arrow datatype {}", + self.current_run_end_index, + R::DATA_TYPE + )) + })?; + run_ends_builder.append_value(run_end_index); + values_builder.append_option(self.current_value); Ok(()) } } #[cfg(test)] mod tests { - use crate::builder::PrimitiveREEArrayBuilder; + use crate::builder::PrimitiveRunBuilder; use crate::types::{Int16Type, UInt32Type}; use crate::{Int16Array, UInt32Array}; #[test] fn test_primitive_ree_array_builder() { - let mut builder = PrimitiveREEArrayBuilder::::new(); + let mut builder = PrimitiveRunBuilder::::new(); builder.append_value(1234).unwrap(); builder.append_value(1234).unwrap(); builder.append_value(1234).unwrap(); diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index e9c1b8305346..43921cf672d3 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -253,7 +253,7 @@ mod run { } /// A subtype of primitive type that is used as run-ends index -/// in RunEndEncodedArray. 
+/// in [`RunArray`]. /// See /// /// Note: The implementation of this trait is sealed to avoid accidental misuse. diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 9a54e9ae9a69..7c37587d749b 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -865,9 +865,9 @@ impl ArrayData { "The nullable should be set to false for the field defining run_ends array.".to_string() )); } - if !DataType::is_ree_run_ends_type(run_ends_type.data_type()) { + if !DataType::is_run_ends_type(run_ends_type.data_type()) { return Err(ArrowError::InvalidArgumentError(format!( - "RunEndEncodedArray run_ends types must be Int16, Int32 or Int64, but was {}", + "RunArray run_ends types must be Int16, Int32 or Int64, but was {}", run_ends_type.data_type() ))); } diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 765f7f8e7874..1e5c1321c952 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -358,9 +358,9 @@ impl DataType { ) } - /// Returns true if this type is valid for run-ends array in RunEndEncodedArray + /// Returns true if this type is valid for run-ends array in RunArray #[inline] - pub fn is_ree_run_ends_type(&self) -> bool { + pub fn is_run_ends_type(&self) -> bool { use DataType::*; matches!(self, Int16 | Int32 | Int64) } From 6ee3c5e1b12c90174b871ad48fbe5086327338da Mon Sep 17 00:00:00 2001 From: ask Date: Mon, 23 Jan 2023 06:33:13 -0500 Subject: [PATCH 10/20] run array name change --- arrow-array/src/array/mod.rs | 4 ++-- .../src/array/{run_end_encoded_array.rs => run_array.rs} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename arrow-array/src/array/{run_end_encoded_array.rs => run_array.rs} (100%) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index df74fcf128b0..69f6ba4d8de1 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -64,8 +64,8 @@ pub use struct_array::*; mod union_array; pub use union_array::*; -mod run_end_encoded_array; 
-pub use run_end_encoded_array::*; +mod run_array; +pub use run_array::*; /// Trait for dealing with different types of array at runtime when the type of the /// array is not known in advance. diff --git a/arrow-array/src/array/run_end_encoded_array.rs b/arrow-array/src/array/run_array.rs similarity index 100% rename from arrow-array/src/array/run_end_encoded_array.rs rename to arrow-array/src/array/run_array.rs From 00805165d73764a829848d9b74541f3f66d2649b Mon Sep 17 00:00:00 2001 From: ask Date: Mon, 23 Jan 2023 07:23:21 -0500 Subject: [PATCH 11/20] fix doc issues --- arrow-array/src/builder/primitive_run_builder.rs | 6 ++++-- arrow-array/src/types.rs | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arrow-array/src/builder/primitive_run_builder.rs b/arrow-array/src/builder/primitive_run_builder.rs index a60f706f5e6f..caf19e2506c1 100644 --- a/arrow-array/src/builder/primitive_run_builder.rs +++ b/arrow-array/src/builder/primitive_run_builder.rs @@ -31,6 +31,7 @@ use arrow_schema::ArrowError; /// ``` /// /// # use arrow_array::builder::PrimitiveRunBuilder; +/// # use arrow_array::cast::as_primitive_array; /// # use arrow_array::types::{UInt32Type, Int16Type}; /// # use arrow_array::{Array, UInt32Array, Int16Array}; /// @@ -56,7 +57,7 @@ use arrow_schema::ArrowError; /// assert!(!av.is_null(2)); /// /// // Values are polymorphic and so require a downcast. -/// let ava: &UInt32Array = av.as_any().downcast_ref::().unwrap(); +/// let ava: &UInt32Array = as_primitive_array::(av.as_ref()); /// /// assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)])); /// ``` @@ -276,6 +277,7 @@ where #[cfg(test)] mod tests { use crate::builder::PrimitiveRunBuilder; + use crate::cast::as_primitive_array; use crate::types::{Int16Type, UInt32Type}; use crate::{Int16Array, UInt32Array}; #[test] @@ -301,7 +303,7 @@ mod tests { assert!(!av.is_null(2)); // Values are polymorphic and so require a downcast. 
- let ava: &UInt32Array = av.as_any().downcast_ref::().unwrap(); + let ava: &UInt32Array = as_primitive_array::(av.as_ref()); assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)])); } diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 43921cf672d3..fc02c0e5a3dc 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -253,7 +253,7 @@ mod run { } /// A subtype of primitive type that is used as run-ends index -/// in [`RunArray`]. +/// in `RunArray`. /// See /// /// Note: The implementation of this trait is sealed to avoid accidental misuse. From ae14c6cced4596b38bb30efc5d45afa3804bdb21 Mon Sep 17 00:00:00 2001 From: ask Date: Mon, 23 Jan 2023 07:39:00 -0500 Subject: [PATCH 12/20] doc change --- arrow-array/src/array/run_array.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 9c7811348771..0ceb9b48ad20 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -96,6 +96,9 @@ impl RunArray { Ok(array_data.into()) } /// Returns a reference to run_ends array + /// + /// Note: any slicing of this array is not applied to the returned array + /// and must be handled separately pub fn run_ends(&self) -> &PrimitiveArray { &self.run_ends } From 3ab0dac4e3924940ab90f45a4ba347f1e7fb0598 Mon Sep 17 00:00:00 2001 From: ask Date: Mon, 23 Jan 2023 08:23:30 -0500 Subject: [PATCH 13/20] lint fix --- arrow-array/src/array/run_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 0ceb9b48ad20..d1b6dbe3dd95 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -96,7 +96,7 @@ impl RunArray { Ok(array_data.into()) } /// Returns a reference to run_ends array - /// + /// /// Note: any slicing of this array is not applied to the returned array /// and must be handled separately pub fn 
run_ends(&self) -> &PrimitiveArray { From 2b6479504a3ff7492818e33aed5f5415865d5954 Mon Sep 17 00:00:00 2001 From: ask Date: Mon, 23 Jan 2023 09:29:49 -0500 Subject: [PATCH 14/20] make append methods infallible --- arrow-array/src/array/run_array.rs | 20 +-- .../src/builder/generic_byte_run_builder.rs | 127 ++++++++---------- .../src/builder/primitive_run_builder.rs | 75 +++++------ 3 files changed, 94 insertions(+), 128 deletions(-) diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index d1b6dbe3dd95..b35363a81716 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -182,13 +182,9 @@ impl<'a, T: RunEndIndexType> FromIterator> for RunArray { let mut builder = StringRunBuilder::with_capacity(lower, 256); it.for_each(|i| { if let Some(i) = i { - builder - .append_value(i) - .expect("Unable to append a value to a run end encoded array."); + builder.append_value(i); } else { - builder - .append_null() - .expect("Unable to append null value to run end encoded array."); + builder.append_null(); } }); @@ -216,9 +212,7 @@ impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunArray { let (lower, _) = it.size_hint(); let mut builder = StringRunBuilder::with_capacity(lower, 256); it.for_each(|i| { - builder - .append_value(i) - .expect("Unable to append a value to a dictionary array."); + builder.append_value(i); }); builder.finish() @@ -318,9 +312,9 @@ mod tests { #[test] fn test_run_array_fmt_debug() { let mut builder = PrimitiveRunBuilder::::with_capacity(3); - builder.append_value(12345678).unwrap(); - builder.append_null().unwrap(); - builder.append_value(22345678).unwrap(); + builder.append_value(12345678); + builder.append_null(); + builder.append_value(22345678); let array = builder.finish(); assert_eq!( "RunArray {run_ends: PrimitiveArray\n[\n 1,\n 2,\n 3,\n], values: PrimitiveArray\n[\n 12345678,\n null,\n 22345678,\n]}\n", @@ -329,7 +323,7 @@ mod tests { let mut builder = 
PrimitiveRunBuilder::::with_capacity(20); for _ in 0..20 { - builder.append_value(1).unwrap(); + builder.append_value(1); } let array = builder.finish(); assert_eq!( diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs index c1b6c69de026..829835d54210 100644 --- a/arrow-array/src/builder/generic_byte_run_builder.rs +++ b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -29,7 +29,6 @@ use crate::{ use super::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; use arrow_buffer::ArrowNativeType; -use arrow_schema::ArrowError; /// Array builder for [`RunArray`] for String and Binary types. /// @@ -44,10 +43,10 @@ use arrow_schema::ArrowError; /// /// let mut builder = /// GenericByteRunBuilder::::new(); -/// builder.append_value(b"abc").unwrap(); -/// builder.append_value(b"abc").unwrap(); -/// builder.append_null().unwrap(); -/// builder.append_value(b"def").unwrap(); +/// builder.append_value(b"abc"); +/// builder.append_value(b"abc"); +/// builder.append_null(); +/// builder.append_value(b"def"); /// let array = builder.finish(); /// /// assert_eq!( @@ -179,58 +178,43 @@ where V: ByteArrayType, { /// Appends optional value to the logical array encoded by the RunArray. - pub fn append_option( - &mut self, - input_value: Option>, - ) -> Result<(), ArrowError> { + pub fn append_option(&mut self, input_value: Option>) { match input_value { - Some(value) => self.append_value(value)?, - None => self.append_null()?, + Some(value) => self.append_value(value), + None => self.append_null(), } - Ok(()) } /// Appends value to the logical array encoded by the RunArray. 
- pub fn append_value( - &mut self, - input_value: impl AsRef, - ) -> Result<(), ArrowError> { + pub fn append_value(&mut self, input_value: impl AsRef) { let value: &[u8] = input_value.as_ref().as_ref(); if !self.has_current_value { - self.append_run_end()?; + self.append_run_end(); self.current_value.extend_from_slice(value); self.has_current_value = true; } else if self.current_value.as_slice() != value { - self.append_run_end()?; + self.append_run_end(); self.current_value.clear(); self.current_value.extend_from_slice(value); } - self.current_run_end_index = self - .current_run_end_index - .checked_add(1) - .ok_or(ArrowError::RunEndIndexOverflowError)?; - Ok(()) + self.current_run_end_index += 1; } /// Appends null to the logical array encoded by the RunArray. - pub fn append_null(&mut self) -> Result<(), ArrowError> { + pub fn append_null(&mut self) { if self.has_current_value { - self.append_run_end()?; + self.append_run_end(); self.current_value.clear(); self.has_current_value = false; } - self.current_run_end_index = self - .current_run_end_index - .checked_add(1) - .ok_or(ArrowError::RunEndIndexOverflowError)?; - Ok(()) + self.current_run_end_index += 1; } /// Creates the RunArray and resets the builder. /// Panics if RunArray cannot be built. pub fn finish(&mut self) -> RunArray { // write the last run end to the array. - self.append_run_end().unwrap(); + self.append_run_end(); // reset the run end index to zero. 
self.current_value.clear(); @@ -254,8 +238,7 @@ where if self.prev_run_end_index != self.current_run_end_index { let mut run_end_builder = run_ends_array.into_builder().unwrap(); let mut values_builder = values_array.into_builder().unwrap(); - self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder) - .unwrap(); + self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder); run_ends_array = run_end_builder.finish(); values_array = values_builder.finish(); } @@ -264,31 +247,32 @@ where } // Appends the current run to the array. - fn append_run_end(&mut self) -> Result<(), ArrowError> { + fn append_run_end(&mut self) { // empty array or the function called without appending any value. if self.current_run_end_index == 0 || self.prev_run_end_index == self.current_run_end_index { - return Ok(()); + return; } let run_end_index = R::Native::from_usize(self.current_run_end_index) - .ok_or_else(|| { - ArrowError::ParseError(format!( + .unwrap_or_else(|| panic!( "Cannot convert the value {} from `usize` to native form of arrow datatype {}", self.current_run_end_index, R::DATA_TYPE - )) - })?; + )); self.run_ends_builder.append_value(run_end_index); if self.has_current_value { let slice = self.current_value.as_slice(); - let native = unsafe { V::Native::from_bytes_unchecked(slice) }; + let native = unsafe { + // As self.current_value is created from V::Native. The value V::Native can be + // built back from the bytes without validations + V::Native::from_bytes_unchecked(slice) + }; self.values_builder.append_value(native); } else { self.values_builder.append_null(); } self.prev_run_end_index = self.current_run_end_index; - Ok(()) } // Similar to `append_run_end` but on custom builders. 
@@ -296,24 +280,25 @@ where &self, run_ends_builder: &mut PrimitiveBuilder, values_builder: &mut GenericByteBuilder, - ) -> Result<(), ArrowError> { + ) { let run_end_index = R::Native::from_usize(self.current_run_end_index) - .ok_or_else(|| { - ArrowError::ParseError(format!( + .unwrap_or_else(|| panic!( "Cannot convert the value {} from `usize` to native form of arrow datatype {}", self.current_run_end_index, R::DATA_TYPE - )) - })?; + )); run_ends_builder.append_value(run_end_index); if self.has_current_value { let slice = self.current_value.as_slice(); - let native = unsafe { V::Native::from_bytes_unchecked(slice) }; + let native = unsafe { + // As self.current_value is created from V::Native. The value V::Native can be + // built back from the bytes without validations + V::Native::from_bytes_unchecked(slice) + }; values_builder.append_value(native); } else { values_builder.append_null(); } - Ok(()) } } @@ -330,11 +315,11 @@ where /// let mut builder = StringRunBuilder::::new(); /// /// // The builder builds the dictionary value by value -/// builder.append_value("abc").unwrap(); +/// builder.append_value("abc"); /// builder.append_null(); -/// builder.append_value("def").unwrap(); -/// builder.append_value("def").unwrap(); -/// builder.append_value("abc").unwrap(); +/// builder.append_value("def"); +/// builder.append_value("def"); +/// builder.append_value("abc"); /// let array = builder.finish(); /// /// assert_eq!( @@ -370,11 +355,11 @@ pub type LargeStringRunBuilder = GenericByteRunBuilder; /// let mut builder = BinaryRunBuilder::::new(); /// /// // The builder builds the dictionary value by value -/// builder.append_value(b"abc").unwrap(); +/// builder.append_value(b"abc"); /// builder.append_null(); -/// builder.append_value(b"def").unwrap(); -/// builder.append_value(b"def").unwrap(); -/// builder.append_value(b"abc").unwrap(); +/// builder.append_value(b"def"); +/// builder.append_value(b"def"); +/// builder.append_value(b"abc"); /// let array = 
builder.finish(); /// /// assert_eq!( @@ -415,13 +400,13 @@ mod tests { ::Native: AsRef<::Native>, { let mut builder = GenericByteRunBuilder::::new(); - builder.append_value(values[0]).unwrap(); - builder.append_value(values[0]).unwrap(); - builder.append_value(values[0]).unwrap(); - builder.append_null().unwrap(); - builder.append_null().unwrap(); - builder.append_value(values[1]).unwrap(); - builder.append_value(values[1]).unwrap(); + builder.append_value(values[0]); + builder.append_value(values[0]); + builder.append_value(values[0]); + builder.append_null(); + builder.append_null(); + builder.append_value(values[1]); + builder.append_value(values[1]); let array = builder.finish(); assert_eq!( @@ -457,11 +442,11 @@ mod tests { { let mut builder = GenericByteRunBuilder::::new(); - builder.append_value(values[0]).unwrap(); - builder.append_null().unwrap(); - builder.append_value(values[1]).unwrap(); - builder.append_value(values[1]).unwrap(); - builder.append_value(values[0]).unwrap(); + builder.append_value(values[0]); + builder.append_null(); + builder.append_value(values[1]); + builder.append_value(values[1]); + builder.append_value(values[0]); let mut array: Int16RunArray = builder.finish_cloned(); assert_eq!( @@ -479,9 +464,9 @@ mod tests { assert_eq!(ava.value(2), values[1]); assert_eq!(ava.value(3), values[0]); - builder.append_value(values[0]).unwrap(); - builder.append_value(values[0]).unwrap(); - builder.append_value(values[1]).unwrap(); + builder.append_value(values[0]); + builder.append_value(values[0]); + builder.append_value(values[1]); array = builder.finish(); diff --git a/arrow-array/src/builder/primitive_run_builder.rs b/arrow-array/src/builder/primitive_run_builder.rs index caf19e2506c1..5ca061c3cb87 100644 --- a/arrow-array/src/builder/primitive_run_builder.rs +++ b/arrow-array/src/builder/primitive_run_builder.rs @@ -22,7 +22,6 @@ use crate::{types::RunEndIndexType, ArrayRef, ArrowPrimitiveType, RunArray}; use super::{ArrayBuilder, 
PrimitiveBuilder}; use arrow_buffer::ArrowNativeType; -use arrow_schema::ArrowError; /// Array builder for [`RunArray`] that encodes primitive values. /// @@ -37,12 +36,12 @@ use arrow_schema::ArrowError; /// /// let mut builder = /// PrimitiveRunBuilder::::new(); -/// builder.append_value(1234).unwrap(); -/// builder.append_value(1234).unwrap(); -/// builder.append_value(1234).unwrap(); -/// builder.append_null().unwrap(); -/// builder.append_value(5678).unwrap(); -/// builder.append_value(5678).unwrap(); +/// builder.append_value(1234); +/// builder.append_value(1234); +/// builder.append_value(1234); +/// builder.append_null(); +/// builder.append_value(5678); +/// builder.append_value(5678); /// let array = builder.finish(); /// /// assert_eq!( @@ -166,32 +165,27 @@ where V: ArrowPrimitiveType, { /// Appends optional value to the logical array encoded by the RunArray. - pub fn append_option(&mut self, value: Option) -> Result<(), ArrowError> { + pub fn append_option(&mut self, value: Option) { if self.current_run_end_index == 0 { self.current_run_end_index = 1; self.current_value = value; - return Ok(()); + return; } if self.current_value != value { - self.append_run_end()?; + self.append_run_end(); self.current_value = value; } - self.current_run_end_index = self - .current_run_end_index - .checked_add(1) - .ok_or(ArrowError::RunEndIndexOverflowError)?; - - Ok(()) + self.current_run_end_index += 1; } /// Appends value to the logical array encoded by the run-ends array. - pub fn append_value(&mut self, value: V::Native) -> Result<(), ArrowError> { + pub fn append_value(&mut self, value: V::Native) { self.append_option(Some(value)) } /// Appends null to the logical array encoded by the run-ends array. - pub fn append_null(&mut self) -> Result<(), ArrowError> { + pub fn append_null(&mut self) { self.append_option(None) } @@ -199,7 +193,7 @@ where /// Panics if RunArray cannot be built. pub fn finish(&mut self) -> RunArray { // write the last run end to the array. 
- self.append_run_end().unwrap(); + self.append_run_end(); // reset the run index to zero. self.current_value = None; @@ -221,8 +215,7 @@ where if self.prev_run_end_index != self.current_run_end_index { let mut run_end_builder = run_ends_array.into_builder().unwrap(); let mut values_builder = values_array.into_builder().unwrap(); - self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder) - .unwrap(); + self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder); run_ends_array = run_end_builder.finish(); values_array = values_builder.finish(); } @@ -233,25 +226,22 @@ where // Appends the current run to the array. There are scenarios where this function can be called // multiple times before getting a new value. e.g. appending different value immediately following // finish_cloned. - fn append_run_end(&mut self) -> Result<(), ArrowError> { + fn append_run_end(&mut self) { // empty array or the function called without appending any value. if self.current_run_end_index == 0 || self.prev_run_end_index == self.current_run_end_index { - return Ok(()); + return; } - let run_end_index = R::Native::from_usize(self.current_run_end_index) - .ok_or_else(|| { - ArrowError::ParseError(format!( - "Cannot convert the value {} from `usize` to native form of arrow datatype {}", - self.current_run_end_index, - R::DATA_TYPE - )) - })?; + let run_end_index = R::Native::from_usize(self.current_run_end_index) + .unwrap_or_else(|| panic!( + "Cannot convert the value {} from `usize` to native form of arrow datatype {}", + self.current_run_end_index, + R::DATA_TYPE + )); self.run_ends_builder.append_value(run_end_index); self.values_builder.append_option(self.current_value); self.prev_run_end_index = self.current_run_end_index; - Ok(()) } // Similar to `append_run_end` but on custom builders. 
@@ -259,18 +249,15 @@ where &self, run_ends_builder: &mut PrimitiveBuilder, values_builder: &mut PrimitiveBuilder, - ) -> Result<(), ArrowError> { + ) { let run_end_index = R::Native::from_usize(self.current_run_end_index) - .ok_or_else(|| { - ArrowError::ParseError(format!( + .unwrap_or_else(|| panic!( "Cannot convert the value {} from `usize` to native form of arrow datatype {}", self.current_run_end_index, R::DATA_TYPE - )) - })?; + )); run_ends_builder.append_value(run_end_index); values_builder.append_option(self.current_value); - Ok(()) } } @@ -283,12 +270,12 @@ mod tests { #[test] fn test_primitive_ree_array_builder() { let mut builder = PrimitiveRunBuilder::::new(); - builder.append_value(1234).unwrap(); - builder.append_value(1234).unwrap(); - builder.append_value(1234).unwrap(); - builder.append_null().unwrap(); - builder.append_value(5678).unwrap(); - builder.append_value(5678).unwrap(); + builder.append_value(1234); + builder.append_value(1234); + builder.append_value(1234); + builder.append_null(); + builder.append_value(5678); + builder.append_value(5678); let array = builder.finish(); assert_eq!( From 467b569b9b2bcb8ec56658b549192f43d467b2e5 Mon Sep 17 00:00:00 2001 From: ask Date: Mon, 23 Jan 2023 14:29:53 -0500 Subject: [PATCH 15/20] fix array.len and other minor changes --- arrow-array/src/array/run_array.rs | 84 ++++++++++--------- .../src/builder/generic_byte_run_builder.rs | 65 +++++++++++--- .../src/builder/primitive_run_builder.rs | 60 ++++++++----- 3 files changed, 135 insertions(+), 74 deletions(-) diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index b35363a81716..e87c50d4c37a 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -17,6 +17,7 @@ use std::any::Any; +use arrow_buffer::ArrowNativeType; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; @@ -65,6 +66,17 @@ pub struct RunArray { } impl RunArray { + + // 
calculates the logical length of the array encoded + // by the given run_ends array. + fn logical_len(run_ends: &PrimitiveArray) -> usize { + let len = run_ends.len(); + if len == 0 { + return 0; + } + run_ends.value(len - 1).as_usize() + } + /// Attempts to create RunArray using given run_ends (index where a run ends) /// and the values (value of the run). Returns an error if the given data is not compatible /// with RunEndEncoded specification. @@ -78,7 +90,9 @@ impl RunArray { Box::new(Field::new("run_ends", run_ends_type, false)), Box::new(Field::new("values", values_type, true)), ); + let len = RunArray::logical_len(run_ends); let builder = ArrayDataBuilder::new(ree_array_type) + .len(len) .add_child_data(run_ends.data().clone()) .add_child_data(values.data().clone()); @@ -181,11 +195,7 @@ impl<'a, T: RunEndIndexType> FromIterator> for RunArray { let (lower, _) = it.size_hint(); let mut builder = StringRunBuilder::with_capacity(lower, 256); it.for_each(|i| { - if let Some(i) = i { - builder.append_value(i); - } else { - builder.append_null(); - } + builder.append_option(i); }); builder.finish() @@ -272,40 +282,32 @@ mod tests { use crate::builder::PrimitiveRunBuilder; use crate::types::{Int16Type, Int32Type, Int8Type, UInt32Type}; use crate::{Array, Int16Array, Int32Array, StringArray}; - use arrow_schema::Field; #[test] fn test_run_array() { // Construct a value array let value_data = PrimitiveArray::::from_iter_values([ 10_i8, 11, 12, 13, 14, 15, 16, 17, - ]) - .into_data(); + ]); // Construct a run_ends array: let run_ends_data = PrimitiveArray::::from_iter_values([ 4_i16, 6, 7, 9, 13, 18, 20, 22, - ]) - .into_data(); + ]); // Construct a run ends encoded array from the above two - let run_ends_type = Field::new("run_ends", DataType::Int16, false); - let value_type = Field::new("values", DataType::Int8, true); - let ree_array_type = - DataType::RunEndEncoded(Box::new(run_ends_type), Box::new(value_type)); - let dict_data = ArrayData::builder(ree_array_type) 
- .add_child_data(run_ends_data.clone()) - .add_child_data(value_data.clone()) - .build() - .unwrap(); - let ree_array = Int16RunArray::from(dict_data); + let ree_array = + RunArray::::try_new(&run_ends_data, &value_data).unwrap(); + + assert_eq!(ree_array.len(), 22); + assert_eq!(ree_array.null_count(), 0); let values = ree_array.values(); - assert_eq!(&value_data, values.data()); + assert_eq!(&value_data.into_data(), values.data()); assert_eq!(&DataType::Int8, values.data_type()); let run_ends = ree_array.run_ends(); - assert_eq!(&run_ends_data, run_ends.data()); + assert_eq!(&run_ends_data.into_data(), run_ends.data()); assert_eq!(&DataType::Int16, run_ends.data_type()); } @@ -326,6 +328,10 @@ mod tests { builder.append_value(1); } let array = builder.finish(); + + assert_eq!(array.len(), 20); + assert_eq!(array.null_count(), 0); + assert_eq!( "RunArray {run_ends: PrimitiveArray\n[\n 20,\n], values: PrimitiveArray\n[\n 1,\n]}\n", format!("{:?}", array) @@ -344,6 +350,9 @@ mod tests { format!("{:?}", array) ); + assert_eq!(array.len(), 4); + assert_eq!(array.null_count(), 0); + let array: RunArray = test.into_iter().collect(); assert_eq!( "RunArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", @@ -356,6 +365,9 @@ mod tests { let test = vec!["a", "b", "c", "a"]; let array: RunArray = test.into_iter().collect(); + assert_eq!(array.len(), 4); + assert_eq!(array.null_count(), 0); + let run_ends = array.run_ends(); assert_eq!(&DataType::Int16, run_ends.data_type()); assert_eq!(0, run_ends.null_count()); @@ -367,6 +379,9 @@ mod tests { let test = vec![Some("a"), None, Some("b"), None, None, Some("a")]; let array: RunArray = test.into_iter().collect(); + assert_eq!(array.len(), 6); + assert_eq!(array.null_count(), 0); + let run_ends = array.run_ends(); assert_eq!(&DataType::Int32, run_ends.data_type()); assert_eq!(0, run_ends.null_count()); @@ -383,6 +398,9 @@ mod tests { let test = vec![None, None, None]; let 
array: RunArray = test.into_iter().collect(); + assert_eq!(array.len(), 3); + assert_eq!(array.null_count(), 0); + let run_ends = array.run_ends(); assert_eq!(1, run_ends.len()); assert_eq!(&[3], run_ends.values()); @@ -403,6 +421,8 @@ mod tests { assert_eq!(array.run_ends().data_type(), &DataType::Int32); assert_eq!(array.values().data_type(), &DataType::Utf8); + assert_eq!(array.null_count(), 0); + assert_eq!(array.len(), 4); assert_eq!(array.run_ends.null_count(), 0); assert_eq!(array.values().null_count(), 1); @@ -437,10 +457,7 @@ mod tests { let actual = RunArray::::try_new(&run_ends, &values); let expected = ArrowError::InvalidArgumentError("The run_ends array length should be the same as values array length. Run_ends array length is 3, values array length is 4".to_string()); - assert_eq!( - format!("{}", expected), - format!("{}", actual.err().unwrap()) - ); + assert_eq!(expected.to_string(), actual.err().unwrap().to_string()); } #[test] @@ -452,10 +469,7 @@ mod tests { let actual = RunArray::::try_new(&run_ends, &values); let expected = ArrowError::InvalidArgumentError("Found null values in run_ends array. The run_ends array should not have null values.".to_string()); - assert_eq!( - format!("{}", expected), - format!("{}", actual.err().unwrap()) - ); + assert_eq!(expected.to_string(), actual.err().unwrap().to_string()); } #[test] @@ -467,10 +481,7 @@ mod tests { let actual = RunArray::::try_new(&run_ends, &values); let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly positive. Found value 0 at index 0 that does not match the criteria.".to_string()); - assert_eq!( - format!("{}", expected), - format!("{}", actual.err().unwrap()) - ); + assert_eq!(expected.to_string(), actual.err().unwrap().to_string()); } #[test] @@ -482,10 +493,7 @@ mod tests { let actual = RunArray::::try_new(&run_ends, &values); let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly increasing. 
Found value 4 at index 2 with previous value 4 that does not match the criteria.".to_string()); - assert_eq!( - format!("{}", expected), - format!("{}", actual.err().unwrap()) - ); + assert_eq!(expected.to_string(), actual.err().unwrap().to_string()); } #[test] diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs index 829835d54210..5cb5dfd0207f 100644 --- a/arrow-array/src/builder/generic_byte_run_builder.rs +++ b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -124,6 +124,17 @@ where prev_run_end_index: 0, } } + + /// Returns the physical length of the encoded array + pub fn physical_len(&self) -> usize { + let mut len = self.run_ends_builder.len(); + + // If there is an ongoing run yet to be added, include it in the len + if self.prev_run_end_index != self.current_run_end_index { + len += 1; + } + len + } } impl ArrayBuilder for GenericByteRunBuilder @@ -146,14 +157,10 @@ where self } - /// Returns the number of array slots in the builder + /// Returns the length of logical array encoded by + /// the eventual runs array. fn len(&self) -> usize { - let mut len = self.run_ends_builder.len(); - // If there is an ongoing run yet to be added, include it in the len - if self.prev_run_end_index != self.current_run_end_index { - len += 1; - } - len + self.current_run_end_index } /// Returns whether the number of array slots is zero @@ -249,9 +256,7 @@ where // Appends the current run to the array. fn append_run_end(&mut self) { // empty array or the function called without appending any value. 
- if self.current_run_end_index == 0 - || self.prev_run_end_index == self.current_run_end_index - { + if self.prev_run_end_index == self.current_run_end_index { return; } let run_end_index = R::Native::from_usize(self.current_run_end_index) @@ -264,6 +269,7 @@ where if self.has_current_value { let slice = self.current_value.as_slice(); let native = unsafe { + // Safety: // As self.current_value is created from V::Native. The value V::Native can be // built back from the bytes without validations V::Native::from_bytes_unchecked(slice) @@ -291,6 +297,7 @@ where if self.has_current_value { let slice = self.current_value.as_slice(); let native = unsafe { + // Safety: // As self.current_value is created from V::Native. The value V::Native can be // built back from the bytes without validations V::Native::from_bytes_unchecked(slice) @@ -407,11 +414,18 @@ mod tests { builder.append_null(); builder.append_value(values[1]); builder.append_value(values[1]); + builder.append_value(values[2]); + builder.append_value(values[2]); + builder.append_value(values[2]); + builder.append_value(values[2]); let array = builder.finish(); + assert_eq!(array.len(), 11); + assert_eq!(array.null_count(), 0); + assert_eq!( array.run_ends(), - &Int16Array::from(vec![Some(3), Some(5), Some(7)]) + &Int16Array::from(vec![Some(3), Some(5), Some(7), Some(11)]) ); // Values are polymorphic and so require a downcast. 
@@ -422,16 +436,22 @@ mod tests { assert_eq!(*ava.value(0), *values[0]); assert!(ava.is_null(1)); assert_eq!(*ava.value(2), *values[1]); + assert_eq!(*ava.value(3), *values[2]); } #[test] fn test_string_run_buider() { - test_bytes_run_buider::(vec!["abc", "def"]); + test_bytes_run_buider::(vec!["abc", "def", "ghi"]); + } + + #[test] + fn test_string_run_buider_with_empty_strings() { + test_bytes_run_buider::(vec!["abc", "", "ghi"]); } #[test] fn test_binary_run_buider() { - test_bytes_run_buider::(vec![b"abc", b"def"]); + test_bytes_run_buider::(vec![b"abc", b"def", b"ghi"]); } fn test_bytes_run_buider_finish_cloned(values: Vec<&T::Native>) @@ -444,11 +464,20 @@ mod tests { builder.append_value(values[0]); builder.append_null(); + + assert_eq!(builder.physical_len(), 2); + builder.append_value(values[1]); + + assert_eq!(builder.physical_len(), 3); + builder.append_value(values[1]); builder.append_value(values[0]); let mut array: Int16RunArray = builder.finish_cloned(); + assert_eq!(array.len(), 5); + assert_eq!(array.null_count(), 0); + assert_eq!( array.run_ends(), &Int16Array::from(vec![Some(1), Some(2), Some(4), Some(5)]) @@ -464,12 +493,21 @@ mod tests { assert_eq!(ava.value(2), values[1]); assert_eq!(ava.value(3), values[0]); + // Append last value before `finish_cloned` (`value[0]`) again and ensure it has only + // one entry in final output. builder.append_value(values[0]); + + assert_eq!(builder.physical_len(), 4); + builder.append_value(values[0]); builder.append_value(values[1]); + assert_eq!(builder.physical_len(), 5); array = builder.finish(); + assert_eq!(array.len(), 8); + assert_eq!(array.null_count(), 0); + assert_eq!( array.run_ends(), &Int16Array::from(vec![Some(1), Some(2), Some(4), Some(7), Some(8),]) @@ -483,6 +521,7 @@ mod tests { assert_eq!(ava2.value(0), values[0]); assert!(ava2.is_null(1)); assert_eq!(ava2.value(2), values[1]); + // The value appended before and after `finish_cloned` has only one entry. 
assert_eq!(ava2.value(3), values[0]); assert_eq!(ava2.value(4), values[1]); } diff --git a/arrow-array/src/builder/primitive_run_builder.rs b/arrow-array/src/builder/primitive_run_builder.rs index 5ca061c3cb87..11ce238f4e7f 100644 --- a/arrow-array/src/builder/primitive_run_builder.rs +++ b/arrow-array/src/builder/primitive_run_builder.rs @@ -111,6 +111,17 @@ where prev_run_end_index: 0, } } + + /// Returns the physical length of the encoded array + pub fn physical_len(&self) -> usize { + let mut len = self.run_ends_builder.len(); + + // If there is an ongoing run yet to be added, include it in the len + if self.prev_run_end_index != self.current_run_end_index { + len += 1; + } + len + } } impl ArrayBuilder for PrimitiveRunBuilder @@ -133,14 +144,10 @@ where self } - /// Returns the number of array slots in the builder + /// Returns the length of logical array encoded by + /// the eventual runs array. fn len(&self) -> usize { - let mut len = self.run_ends_builder.len(); - // If there is an ongoing run yet to be added, include it in the len - if self.prev_run_end_index != self.current_run_end_index { - len += 1; - } - len + self.current_run_end_index } /// Returns whether the number of array slots is zero @@ -228,17 +235,10 @@ where // finish_cloned. fn append_run_end(&mut self) { // empty array or the function called without appending any value. 
- if self.current_run_end_index == 0 - || self.prev_run_end_index == self.current_run_end_index - { + if self.prev_run_end_index == self.current_run_end_index { return; } - let run_end_index = R::Native::from_usize(self.current_run_end_index) - .unwrap_or_else(|| panic!( - "Cannot convert the value {} from `usize` to native form of arrow datatype {}", - self.current_run_end_index, - R::DATA_TYPE - )); + let run_end_index = self.run_end_index_as_native(); self.run_ends_builder.append_value(run_end_index); self.values_builder.append_option(self.current_value); self.prev_run_end_index = self.current_run_end_index; @@ -250,15 +250,19 @@ where run_ends_builder: &mut PrimitiveBuilder, values_builder: &mut PrimitiveBuilder, ) { - let run_end_index = R::Native::from_usize(self.current_run_end_index) - .unwrap_or_else(|| panic!( - "Cannot convert the value {} from `usize` to native form of arrow datatype {}", - self.current_run_end_index, - R::DATA_TYPE - )); + let run_end_index = self.run_end_index_as_native(); run_ends_builder.append_value(run_end_index); values_builder.append_option(self.current_value); } + + fn run_end_index_as_native(&self) -> R::Native { + R::Native::from_usize(self.current_run_end_index) + .unwrap_or_else(|| panic!( + "Cannot convert the value {} from `usize` to native form of arrow datatype {}", + self.current_run_end_index, + R::DATA_TYPE + )) + } } #[cfg(test)] @@ -266,18 +270,28 @@ mod tests { use crate::builder::PrimitiveRunBuilder; use crate::cast::as_primitive_array; use crate::types::{Int16Type, UInt32Type}; - use crate::{Int16Array, UInt32Array}; + use crate::{Array, Int16Array, UInt32Array}; #[test] fn test_primitive_ree_array_builder() { let mut builder = PrimitiveRunBuilder::::new(); builder.append_value(1234); builder.append_value(1234); builder.append_value(1234); + + assert_eq!(builder.physical_len(), 1); + builder.append_null(); + assert_eq!(builder.physical_len(), 2); + builder.append_value(5678); builder.append_value(5678); + 
assert_eq!(builder.physical_len(), 3); + let array = builder.finish(); + assert_eq!(array.null_count(), 0); + assert_eq!(array.len(), 6); + assert_eq!( array.run_ends(), &Int16Array::from(vec![Some(3), Some(4), Some(6)]) From 9ee4f88ac2dddc70b1e9cd2a4a16b25127edf7a6 Mon Sep 17 00:00:00 2001 From: ask Date: Mon, 23 Jan 2023 14:33:57 -0500 Subject: [PATCH 16/20] formatting fix --- arrow-array/src/array/run_array.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index e87c50d4c37a..148c6ca728a8 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -66,7 +66,6 @@ pub struct RunArray { } impl RunArray { - // calculates the logical length of the array encoded // by the given run_ends array. fn logical_len(run_ends: &PrimitiveArray) -> usize { From e555a913e7e372f99082ab0fbad9290bb05882f8 Mon Sep 17 00:00:00 2001 From: ask Date: Mon, 23 Jan 2023 14:48:50 -0500 Subject: [PATCH 17/20] add validation of array len --- arrow-data/src/data.rs | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 7c37587d749b..4994b234d663 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -1327,9 +1327,9 @@ impl ArrayData { DataType::RunEndEncoded(run_ends, _values) => { let run_ends_data = self.child_data()[0].clone(); match run_ends.data_type() { - DataType::Int16 => run_ends_data.check_run_ends::(), - DataType::Int32 => run_ends_data.check_run_ends::(), - DataType::Int64 => run_ends_data.check_run_ends::(), + DataType::Int16 => run_ends_data.check_run_ends::(self.len()), + DataType::Int32 => run_ends_data.check_run_ends::(self.len()), + DataType::Int64 => run_ends_data.check_run_ends::(self.len()), _ => unreachable!(), } } @@ -1494,7 +1494,7 @@ impl ArrayData { } /// Validates that each value in run_ends array is positive and strictly increasing. 
- fn check_run_ends(&self) -> Result<(), ArrowError> + fn check_run_ends(&self, array_len: usize) -> Result<(), ArrowError> where T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, { @@ -1525,8 +1525,18 @@ impl ArrayData { prev_value = value; Ok(()) - }) + })?; + + if prev_value.as_usize() != array_len { + return Err(ArrowError::InvalidArgumentError(format!( + "The length of array does not match the last value in the run_ends array. The last value of run_ends array is {} and length of array is {}.", + prev_value, + array_len + ))); + } + Ok(()) } + /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may /// return false when the arrays are logically equal From 0b4784230cadb0be97a426293da587f45fcbbca5 Mon Sep 17 00:00:00 2001 From: ask Date: Mon, 23 Jan 2023 14:51:15 -0500 Subject: [PATCH 18/20] fmt fix --- arrow-data/src/data.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 4994b234d663..07bbc664234a 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -1536,7 +1536,7 @@ impl ArrayData { } Ok(()) } - + /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may /// return false when the arrays are logically equal From a37bcff269dab5b5dc0de443211fe8592c5f5caa Mon Sep 17 00:00:00 2001 From: ask Date: Mon, 23 Jan 2023 11:39:56 -0500 Subject: [PATCH 19/20] Add Extend, ArrayAccesor, TypedRunArray, RunIter and benches for string run builder. 
--- arrow-array/src/array/run_array.rs | 168 ++++++++++++- .../src/builder/generic_byte_run_builder.rs | 53 +++- .../src/builder/primitive_run_builder.rs | 31 +++ arrow-array/src/lib.rs | 1 + arrow-array/src/run_iterator.rs | 238 ++++++++++++++++++ arrow/Cargo.toml | 4 + arrow/benches/string_run_builder.rs | 80 ++++++ 7 files changed, 563 insertions(+), 12 deletions(-) create mode 100644 arrow-array/src/run_iterator.rs create mode 100644 arrow/benches/string_run_builder.rs diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 148c6ca728a8..05b5660ba906 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -24,8 +24,9 @@ use arrow_schema::{ArrowError, DataType, Field}; use crate::{ builder::StringRunBuilder, make_array, + run_iterator::RunArrayIter, types::{Int16Type, Int32Type, Int64Type, RunEndIndexType}, - Array, ArrayRef, PrimitiveArray, + Array, ArrayAccessor, ArrayRef, PrimitiveArray, }; /// @@ -120,6 +121,27 @@ impl RunArray { pub fn values(&self) -> &ArrayRef { &self.values } + + /// Downcast this run array to a [`TypedRunArray`] + /// + /// ``` + /// use arrow_array::{Array, ArrayAccessor, RunArray, StringArray, types::Int32Type}; + /// + /// let orig = [Some("a"), Some("b"), None]; + /// let run_array = RunArray::::from_iter(orig); + /// let typed = run_array.downcast_ref::().unwrap(); + /// assert_eq!(typed.value(0), "a"); + /// assert_eq!(typed.value(1), "b"); + /// assert!(typed.values().is_null(2)); + /// ``` + /// + pub fn downcast_ref(&self) -> Option> { + let values = self.values.as_any().downcast_ref()?; + Some(TypedRunArray { + run_array: self, + values, + }) + } } impl From for RunArray { @@ -273,6 +295,150 @@ pub type Int32RunArray = RunArray; /// ``` pub type Int64RunArray = RunArray; +/// The trait defines functions that help access the run array +/// properties and values +pub trait RunArrayAccessor { + /// Length of the physical array in [`RunArray`] + fn
physical_len(&self) -> usize; + + /// The logical index at which the `physical_index` run ends. + /// i.e. value at the index `physical_index` in run_ends array. + fn run_end_index(&self, physical_index: usize) -> Option; + + /// Returns true if the value is null in the `physical_index` + fn is_value_null(&self, physical_index: usize) -> bool; +} + +/// A strongly-typed wrapper around a [`RunArray`] that implements [`ArrayAccessor`] +/// and [`IntoIterator`] allowing fast access to its elements +/// +/// ``` +/// use arrow_array::{RunArray, StringArray, types::Int32Type}; +/// +/// let orig = ["a", "b", "a", "b"]; +/// let ree_array = RunArray::::from_iter(orig); +/// +/// // `TypedRunArray` allows you to access the values directly +/// let typed = ree_array.downcast_ref::().unwrap(); +/// +/// for (maybe_val, orig) in typed.into_iter().zip(orig) { +/// assert_eq!(maybe_val.unwrap(), orig) +/// } +/// ``` +pub struct TypedRunArray<'a, R: RunEndIndexType, V> { + /// The ree array + run_array: &'a RunArray, + /// The values of the run_array + values: &'a V, +} + +// Manually implement `Clone` to avoid `V: Clone` type constraint +impl<'a, R: RunEndIndexType, V> Clone for TypedRunArray<'a, R, V> { + fn clone(&self) -> Self { + Self { + run_array: self.run_array, + values: self.values, + } + } +} + +impl<'a, R: RunEndIndexType, V> Copy for TypedRunArray<'a, R, V> {} + +impl<'a, R: RunEndIndexType, V> std::fmt::Debug for TypedRunArray<'a, R, V> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + writeln!(f, "TypedRunArray({:?})", self.run_array) + } +} + +impl<'a, R: RunEndIndexType, V> TypedRunArray<'a, R, V> { + /// Returns the run_ends of this [`TypedRunArray`] + pub fn run_ends(&self) -> &'a PrimitiveArray { + self.run_array.run_ends() + } + + /// Returns the values of this [`TypedRunArray`] + pub fn values(&self) -> &'a V { + self.values + } +} + +impl<'a, R: RunEndIndexType, V: Sync> Array for TypedRunArray<'a, R, V> { + fn as_any(&self) -> &dyn 
Any { + self.run_array + } + + fn data(&self) -> &ArrayData { + &self.run_array.data + } + + fn into_data(self) -> ArrayData { + self.run_array.into_data() + } +} + +impl RunArrayAccessor for TypedRunArray<'_, R, V> { + fn physical_len(&self) -> usize { + self.run_ends().len() + } + + fn run_end_index(&self, physical_index: usize) -> Option { + if physical_index >= self.run_ends().len() { + None + } else { + Some(unsafe { + // Safety: + // As the physical_index bounds is checked above + // The array can be accessed without validation + self.run_ends().value_unchecked(physical_index).as_usize() + }) + } + } + + fn is_value_null(&self, physical_index: usize) -> bool { + self.run_array.values().is_null(physical_index) + } +} + +// The array accessor returns value based on physical array index, so `value` is bounds-checked +// against the physical values array. It is the responsibility of the caller of this function to convert from +// logical index to physical index. +impl<'a, R, V> ArrayAccessor for TypedRunArray<'a, R, V> +where + R: RunEndIndexType, + V: Sync + Send, + &'a V: ArrayAccessor, + <&'a V as ArrayAccessor>::Item: Default, +{ + type Item = <&'a V as ArrayAccessor>::Item; + + fn value(&self, index: usize) -> Self::Item { + assert!( + index < self.run_array.values().len(), + "Trying to access an element at index {} from a TypedRunArray of length {}", + index, + self.run_array.values().len() + ); + unsafe { self.value_unchecked(index) } + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + self.values.value_unchecked(index) + } +} + +impl<'a, R, V> IntoIterator for TypedRunArray<'a, R, V> +where + R: RunEndIndexType, + Self: ArrayAccessor, +{ + type Item = Option<::Item>; + type IntoIter = RunArrayIter; + + fn into_iter(self) -> Self::IntoIter { + RunArrayIter::new(self) + } +} + #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs index 5cb5dfd0207f..49e3d8e23add 100644 --- a/arrow-array/src/builder/generic_byte_run_builder.rs +++
b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -43,15 +43,14 @@ use arrow_buffer::ArrowNativeType; /// /// let mut builder = /// GenericByteRunBuilder::::new(); -/// builder.append_value(b"abc"); -/// builder.append_value(b"abc"); -/// builder.append_null(); +/// builder.extend([Some(b"abc"), Some(b"abc"), None, Some(b"def")].into_iter()); /// builder.append_value(b"def"); +/// builder.append_null(); /// let array = builder.finish(); /// /// assert_eq!( /// array.run_ends(), -/// &Int16Array::from(vec![Some(2), Some(3), Some(4)]) +/// &Int16Array::from(vec![Some(2), Some(3), Some(5), Some(6)]) /// ); /// /// let av = array.values(); @@ -59,6 +58,7 @@ use arrow_buffer::ArrowNativeType; /// assert!(!av.is_null(0)); /// assert!(av.is_null(1)); /// assert!(!av.is_null(2)); +/// assert!(av.is_null(3)); /// /// // Values are polymorphic and so require a downcast. /// let ava: &BinaryArray = av.as_any().downcast_ref::().unwrap(); @@ -309,6 +309,19 @@ where } } +impl Extend> for GenericByteRunBuilder +where + R: RunEndIndexType, + V: ByteArrayType, + S: AsRef, +{ + fn extend>>(&mut self, iter: T) { + for elem in iter { + self.append_option(elem); + } + } +} + /// Array builder for [`RunArray`] that encodes strings ([`Utf8Type`]). 
/// /// ``` @@ -324,9 +337,7 @@ where /// // The builder builds the dictionary value by value /// builder.append_value("abc"); /// builder.append_null(); -/// builder.append_value("def"); -/// builder.append_value("def"); -/// builder.append_value("abc"); +/// builder.extend([Some("def"), Some("def"), Some("abc")]); /// let array = builder.finish(); /// /// assert_eq!( @@ -364,9 +375,7 @@ pub type LargeStringRunBuilder = GenericByteRunBuilder; /// // The builder builds the dictionary value by value /// builder.append_value(b"abc"); /// builder.append_null(); -/// builder.append_value(b"def"); -/// builder.append_value(b"def"); -/// builder.append_value(b"abc"); +/// builder.extend([Some(b"def"), Some(b"def"), Some(b"abc")]); /// let array = builder.finish(); /// /// assert_eq!( @@ -395,7 +404,9 @@ mod tests { use super::*; use crate::array::Array; - use crate::types::Int16Type; + use crate::cast::as_primitive_array; + use crate::cast::as_string_array; + use crate::types::{Int16Type, Int32Type}; use crate::GenericByteArray; use crate::Int16Array; use crate::Int16RunArray; @@ -535,4 +546,24 @@ mod tests { fn test_binary_run_buider_finish_cloned() { test_bytes_run_buider_finish_cloned::(vec![b"abc", b"def", b"ghi"]); } + + #[test] + fn test_extend() { + let mut builder = StringRunBuilder::::new(); + builder.extend(["a", "a", "a", "", "", "b", "b"].into_iter().map(Some)); + builder.extend(["b", "cupcakes", "cupcakes"].into_iter().map(Some)); + let array = builder.finish(); + + assert_eq!(array.len(), 10); + assert_eq!( + as_primitive_array::(array.run_ends()).values(), + &[3, 5, 8, 10] + ); + + let str_array = as_string_array(array.values().as_ref()); + assert_eq!(str_array.value(0), "a"); + assert_eq!(str_array.value(1), ""); + assert_eq!(str_array.value(2), "b"); + assert_eq!(str_array.value(3), "cupcakes"); + } } diff --git a/arrow-array/src/builder/primitive_run_builder.rs b/arrow-array/src/builder/primitive_run_builder.rs index 11ce238f4e7f..4d5dcea5bd34 100644 
--- a/arrow-array/src/builder/primitive_run_builder.rs +++ b/arrow-array/src/builder/primitive_run_builder.rs @@ -265,6 +265,18 @@ where } } +impl Extend> for PrimitiveRunBuilder +where + R: RunEndIndexType, + V: ArrowPrimitiveType, +{ + fn extend>>(&mut self, iter: T) { + for elem in iter { + self.append_option(elem); + } + } +} + #[cfg(test)] mod tests { use crate::builder::PrimitiveRunBuilder; @@ -308,4 +320,23 @@ mod tests { assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)])); } + + #[test] + fn test_extend() { + let mut builder = PrimitiveRunBuilder::::new(); + builder.extend([1, 2, 2, 5, 5, 4, 4].into_iter().map(Some)); + builder.extend([4, 4, 6, 2].into_iter().map(Some)); + let array = builder.finish(); + + assert_eq!(array.len(), 11); + assert_eq!(array.null_count(), 0); + assert_eq!( + as_primitive_array::(array.run_ends()).values(), + &[1, 3, 5, 9, 10, 11] + ); + assert_eq!( + as_primitive_array::(array.values().as_ref()).values(), + &[1, 2, 5, 4, 6, 2] + ); + } } diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index d6a9ab30b85b..d8dc6efe25be 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -178,6 +178,7 @@ pub mod cast; mod delta; pub mod iterator; mod raw_pointer; +pub mod run_iterator; pub mod temporal_conversions; pub mod timezone; mod trusted_len; diff --git a/arrow-array/src/run_iterator.rs b/arrow-array/src/run_iterator.rs new file mode 100644 index 000000000000..7ac48343cd8d --- /dev/null +++ b/arrow-array/src/run_iterator.rs @@ -0,0 +1,238 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Idiomatic iterator for [`RunArray`](crate::RunArray) + +use crate::{array::ArrayAccessor, RunArrayAccessor}; + +/// An iterator that returns Some(T) or None, that can be used on any [`ArrayAccessor`] +/// +/// # Performance +/// +/// [`RunArrayIter`] provides an idiomatic way to iterate over an array, however, this +/// comes at the cost of performance. In particular the interleaved handling of +/// the null mask is often sub-optimal. +/// +/// If performing an infallible operation, it is typically faster to perform the operation +/// on every index of the array, and handle the null mask separately. For [`PrimitiveArray`] +/// this functionality is provided by [`compute::unary`] +/// +/// If performing a fallible operation, it isn't possible to perform the operation independently +/// of the null mask, as this might result in a spurious failure on a null index.
However, +/// there are more efficient ways to iterate over just the non-null indices, this functionality +/// is provided by [`compute::try_unary`] +/// +/// [`PrimitiveArray`]: crate::PrimitiveArray +/// [`compute::unary`]: https://docs.rs/arrow/latest/arrow/compute/fn.unary.html +/// [`compute::try_unary`]: https://docs.rs/arrow/latest/arrow/compute/fn.try_unary.html +#[derive(Debug)] +pub struct RunArrayIter { + array: T, + current_logical: usize, + current_physical: usize, + current_end_logical: usize, + current_end_physical: usize, +} + +impl RunArrayIter { + /// Create a new iterator + pub fn new(array: T) -> Self { + let logical_len = array.len(); + let physical_len: usize = array.physical_len(); + RunArrayIter { + array, + current_logical: 0, + current_physical: 0, + current_end_logical: logical_len, + current_end_physical: physical_len, + } + } +} + +impl Iterator for RunArrayIter { + type Item = Option; + + #[inline] + fn next(&mut self) -> Option { + if self.current_logical == self.current_end_logical { + return None; + } + // If the current logical index has reached the current run end index then increment + // the physical index. + match self.array.run_end_index(self.current_physical) { + None => { + // The self.current_physical should not go out of bounds as it is + // kept within the bounds of self.current_logical. + panic!( + "Could not get run end index for physical index {}", + self.current_physical + ); + } + Some(run_end_index) if self.current_logical >= run_end_index => { + // As the run_ends is expected to be strictly increasing, there + // should be at least one logical entry in one physical entry. Because of this + // reason we don't have to increment the physical index multiple times to get to next + // logical index.
+ self.current_physical += 1; + } + _ => {} + } + if self.array.is_value_null(self.current_physical) { + self.current_logical += 1; + Some(None) + } else { + self.current_logical += 1; + // Safety: + // The self.current_physical is kept within bounds of self.current_logical. + // The self.current_logical will not go out of bounds because of the check + // `self.current_logical == self.current_end_logical` above. + unsafe { Some(Some(self.array.value_unchecked(self.current_physical))) } + } + } + + fn size_hint(&self) -> (usize, Option) { + ( + self.current_end_logical - self.current_logical, + Some(self.current_end_logical - self.current_logical), + ) + } +} + +impl DoubleEndedIterator for RunArrayIter { + fn next_back(&mut self) -> Option { + if self.current_end_logical == self.current_logical { + None + } else { + self.current_end_logical -= 1; + if self.current_end_physical > 0 + && self.current_end_logical + < self + .array + .run_end_index(self.current_end_physical - 1) + .unwrap() + { + self.current_end_physical -= 1; + } + Some(if self.array.is_value_null(self.current_end_physical) { + None + } else { + // Safety: + // The check `self.current_end_physical > 0` ensures we don't underflow + // the variable. Also self.current_end_physical starts with array.physical_len() + // and decrements based on the bounds of self.current_end_logical. + unsafe { Some(self.array.value_unchecked(self.current_end_physical)) } + }) + } + } +} + +/// All arrays have a known size.
+impl ExactSizeIterator for RunArrayIter {} + +#[cfg(test)] +mod tests { + use crate::{ + array::{Int32Array, StringArray}, + builder::PrimitiveRunBuilder, + types::Int32Type, + Int64RunArray, + }; + + #[test] + fn test_primitive_array_iter_round_trip() { + let mut input_vec = vec![ + Some(32), + Some(32), + None, + Some(64), + Some(64), + Some(64), + Some(72), + ]; + let mut builder = PrimitiveRunBuilder::::new(); + builder.extend(input_vec.clone().into_iter()); + let ree_array = builder.finish(); + let ree_array = ree_array.downcast_ref::().unwrap(); + + let output_vec: Vec> = ree_array.into_iter().collect(); + assert_eq!(input_vec, output_vec); + + let rev_output_vec: Vec> = ree_array.into_iter().rev().collect(); + input_vec.reverse(); + assert_eq!(input_vec, rev_output_vec); + } + + #[test] + fn test_double_ended() { + let input_vec = vec![ + Some(32), + Some(32), + None, + Some(64), + Some(64), + Some(64), + Some(72), + ]; + let mut builder = PrimitiveRunBuilder::::new(); + builder.extend(input_vec.clone().into_iter()); + let ree_array = builder.finish(); + let ree_array = ree_array.downcast_ref::().unwrap(); + + let mut iter = ree_array.into_iter(); + assert_eq!(Some(Some(32)), iter.next()); + assert_eq!(Some(Some(72)), iter.next_back()); + assert_eq!(Some(Some(32)), iter.next()); + assert_eq!(Some(Some(64)), iter.next_back()); + assert_eq!(Some(None), iter.next()); + assert_eq!(Some(Some(64)), iter.next_back()); + assert_eq!(Some(Some(64)), iter.next()); + assert_eq!(None, iter.next_back()); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_string_array_iter_round_trip() { + let input_vec = vec!["ab", "ab", "ba", "cc", "cc"]; + let input_ree_array: Int64RunArray = input_vec.into_iter().collect(); + let string_ree_array = input_ree_array.downcast_ref::().unwrap(); + + // to and from iter, with a +1 + let result: Vec> = string_ree_array + .into_iter() + .map(|e| { + e.map(|e| { + let mut a = e.to_string(); + a.push('b'); + a + }) + }) + .collect(); + 
+ let result_asref: Vec> = + result.iter().map(|f| f.as_deref()).collect(); + + let expected_vec = vec![ + Some("abb"), + Some("abb"), + Some("bab"), + Some("ccb"), + Some("ccb"), + ]; + + assert_eq!(expected_vec, result_asref); + } +} diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index ee926ee52868..decfeb949a08 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -237,6 +237,10 @@ required-features = ["test_utils"] name = "string_dictionary_builder" harness = false +[[bench]] +name = "string_run_builder" +harness = false + [[bench]] name = "substring_kernels" harness = false diff --git a/arrow/benches/string_run_builder.rs b/arrow/benches/string_run_builder.rs new file mode 100644 index 000000000000..608de4397d92 --- /dev/null +++ b/arrow/benches/string_run_builder.rs @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow::array::StringRunBuilder; +use arrow::datatypes::Int32Type; +use criterion::{criterion_group, criterion_main, Criterion}; +use rand::{thread_rng, Rng}; + +fn build_strings( + physical_array_len: usize, + logical_array_len: usize, + string_len: usize, +) -> Vec { + let mut rng = thread_rng(); + let run_len = logical_array_len / physical_array_len; + let mut values: Vec = (0..physical_array_len) + .map(|_| (0..string_len).map(|_| rng.gen::()).collect()) + .flat_map(|s| std::iter::repeat(s).take(run_len)) + .collect(); + while values.len() < logical_array_len { + let last_val = values[values.len() - 1].clone(); + values.push(last_val); + } + values +} + +fn criterion_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("string_run_builder"); + + let mut do_bench = + |physical_array_len: usize, logical_array_len: usize, string_len: usize| { + group.bench_function( + format!( + "(run_array_len:{}, physical_array_len:{}, string_len: {})", + logical_array_len, physical_array_len, string_len + ), + |b| { + let strings = + build_strings(physical_array_len, logical_array_len, string_len); + b.iter(|| { + let mut builder = StringRunBuilder::::with_capacity( + physical_array_len, + (string_len + 1) * physical_array_len, + ); + + for val in &strings { + builder.append_value(val); + } + + builder.finish(); + }) + }, + ); + }; + + do_bench(20, 1000, 5); + do_bench(100, 1000, 5); + do_bench(100, 1000, 10); + do_bench(100, 10000, 10); + do_bench(100, 10000, 100); + + group.finish(); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From 5e61e219397abe34c9c9e77ec8cdaec5da52278f Mon Sep 17 00:00:00 2001 From: ask Date: Mon, 23 Jan 2023 22:48:46 -0500 Subject: [PATCH 20/20] fix clippy issues --- arrow-array/src/run_iterator.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/run_iterator.rs b/arrow-array/src/run_iterator.rs index 7ac48343cd8d..aa5997ead080 100644 --- 
a/arrow-array/src/run_iterator.rs +++ b/arrow-array/src/run_iterator.rs @@ -188,7 +188,7 @@ mod tests { Some(72), ]; let mut builder = PrimitiveRunBuilder::::new(); - builder.extend(input_vec.clone().into_iter()); + builder.extend(input_vec.into_iter()); let ree_array = builder.finish(); let ree_array = ree_array.downcast_ref::().unwrap();