From c150c7c3cb9b25e5db12d79076647275e8561f52 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Wed, 7 Dec 2022 22:46:40 +0000 Subject: [PATCH] Improved perf of take --- src/array/binary/mod.rs | 2 +- src/array/specification.rs | 50 ++++++++++++++++++++---------- src/array/utf8/mod.rs | 2 +- src/compute/take/generic_binary.rs | 33 +++++++------------- src/offset.rs | 6 ++++ tests/it/io/ipc/read/file.rs | 6 ++++ 6 files changed, 59 insertions(+), 40 deletions(-) diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index a11469b220d..1a5abdcc330 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -40,7 +40,7 @@ pub use mutable::*; /// assert_eq!(array.values_iter().collect::>(), vec![[1, 2].as_ref(), &[], &[3]]); /// // the underlying representation: /// assert_eq!(array.values(), &Buffer::from(vec![1, 2, 3])); -/// assert_eq!(array.offsets(), &Buffer::from(vec![0, 2, 2, 3])); +/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 3])); /// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true]))); /// ``` /// diff --git a/src/array/specification.rs b/src/array/specification.rs index 966df9f49f2..021cbd5c80c 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -4,7 +4,7 @@ use crate::offset::{Offset, Offsets, OffsetsBuffer}; /// Helper trait to support `Offset` and `OffsetBuffer` pub(crate) trait OffsetsContainer { fn last(&self) -> usize; - fn starts(&self) -> &[O]; + fn as_slice(&self) -> &[O]; } impl OffsetsContainer for OffsetsBuffer { @@ -14,9 +14,8 @@ impl OffsetsContainer for OffsetsBuffer { } #[inline] - fn starts(&self) -> &[O] { - let last = self.buffer().len() - 1; - unsafe { self.buffer().get_unchecked(0..last) } + fn as_slice(&self) -> &[O] { + self.buffer() } } @@ -27,10 +26,8 @@ impl OffsetsContainer for Offsets { } #[inline] - fn starts(&self) -> &[O] { - let offsets = self.as_slice(); - let last = offsets.len() - 1; - unsafe { offsets.get_unchecked(0..last) } + fn as_slice(&self) -> &[O] { + self.as_slice() } } @@ -52,6 +49,10 @@ pub(crate) fn try_check_utf8>( offsets: &C, values: &[u8], ) -> Result<()> { + if offsets.as_slice().len() == 1 { + return Ok(()); + } + try_check_offsets_bounds(offsets, values.len())?; if values.is_ascii() { @@ -60,17 +61,34 @@ pub(crate) fn try_check_utf8>( simdutf8::basic::from_utf8(values)?; // offsets can be == values.len() - // let's find first offset from the end that is different - let starts = offsets.starts(); - let last = starts + // find first offset from the end that is smaller + // Example: + // values.len() = 10 + // offsets = [0, 5, 10, 10] + let offsets = offsets.as_slice(); + let last = offsets .iter() - .rev() .enumerate() - .find_map(|(i, offset)| (offset.to_usize() != values.len()).then(|| i + 1)) - .unwrap_or(starts.len() - 1); + .skip(1) + .rev() + .find_map(|(i, offset)| (offset.to_usize() < values.len()).then(|| i)); + + let last = if let Some(last) = last { + // following the example: last = 1 (offset = 5) + last + } else { + // given `l = values.len()`, this branch is hit iff either: + // * `offsets = [0, l, l, ...]`, which was covered by `from_utf8(values)` above + // * `offsets = [0]`, which never happens because offsets.as_slice().len() == 1 is short-circuited above + return Ok(()); + }; + + // trucate to relevant offsets. Note: `=last` because last was computed skipping the first item + // following the example: starts = [0, 5] + let starts = unsafe { offsets.get_unchecked(..=last) }; let mut any_invalid = false; - for start in &starts[..=last] { + for start in starts { let start = start.to_usize(); // Safety: `try_check_offsets_bounds` just checked for bounds @@ -117,7 +135,7 @@ mod tests { proptest! { // a bit expensive, feel free to run it when changing the code above - //#![proptest_config(ProptestConfig::with_cases(100000))] + // #![proptest_config(ProptestConfig::with_cases(100000))] #[test] #[cfg_attr(miri, ignore)] // miri and proptest do not work well fn check_utf8_validation(values in binary_strategy()) { diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 36ce27b28bf..f8b8b86a8b8 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -51,7 +51,7 @@ impl> AsRef<[u8]> for StrAsBytes { /// // the underlying representation /// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true]))); /// assert_eq!(array.values(), &Buffer::from(b"hithere".to_vec())); -/// assert_eq!(array.offsets(), &Buffer::from(vec![0, 2, 2, 2 + 5])); +/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 2 + 5])); /// # } /// ``` /// diff --git a/src/compute/take/generic_binary.rs b/src/compute/take/generic_binary.rs index 4fc4d01138d..a9cf9c199c2 100644 --- a/src/compute/take/generic_binary.rs +++ b/src/compute/take/generic_binary.rs @@ -17,10 +17,10 @@ pub fn take_values( let mut buffer = Vec::with_capacity(new_len); starts .iter() - .zip(offsets.buffer().windows(2)) - .for_each(|(start_, window)| { - let start = start_.to_usize(); - let end = (*start_ + (window[1] - window[0])).to_usize(); + .map(|start| start.to_usize()) + .zip(offsets.lengths()) + .for_each(|(start, length)| { + let end = start + length; buffer.extend_from_slice(&values[start..end]); }); buffer.into() @@ -32,27 +32,16 @@ pub fn take_no_validity( values: &[u8], indices: &[I], ) -> (OffsetsBuffer, Buffer, Option) { - let mut length = O::zero(); let mut buffer = Vec::::new(); - let offsets = offsets.buffer(); - let offsets = indices.iter().map(|index| { - let index = index.to_usize(); - let start = offsets[index]; - let length_h = offsets[index + 1] - start; - length += length_h; - - let _start = start.to_usize(); - let end = (start + length_h).to_usize(); - buffer.extend_from_slice(&values[_start..end]); - length + let lengths = indices.iter().map(|index| index.to_usize()).map(|index| { + let (start, end) = offsets.start_end(index); + // todo: remove this bound check + buffer.extend_from_slice(&values[start..end]); + end - start }); - let offsets = std::iter::once(O::zero()) - .chain(offsets) - .collect::>(); - // Safety: offsets _are_ monotonically increasing - let offsets = unsafe { Offsets::new_unchecked(offsets) }.into(); + let offsets = Offsets::try_from_lengths(lengths).expect(""); - (offsets, buffer.into(), None) + (offsets.into(), buffer.into(), None) } // take implementation when only values contain nulls diff --git a/src/offset.rs b/src/offset.rs index 6d0878fa2b5..2337f082218 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -420,6 +420,12 @@ impl OffsetsBuffer { Self(self.0.slice_unchecked(offset, length)) } + /// Returns an iterator with the lengths of the offsets + #[inline] + pub fn lengths(&self) -> impl Iterator + '_ { + self.0.windows(2).map(|w| (w[1] - w[0]).to_usize()) + } + /// Returns the inner [`Buffer`]. #[inline] pub fn into_inner(self) -> Buffer { diff --git a/tests/it/io/ipc/read/file.rs b/tests/it/io/ipc/read/file.rs index 9d21d051f2a..515a6ede92f 100644 --- a/tests/it/io/ipc/read/file.rs +++ b/tests/it/io/ipc/read/file.rs @@ -106,6 +106,12 @@ fn read_generated_100_decimal() -> Result<()> { test_file("1.0.0-bigendian", "generated_decimal") } +#[test] +fn read_generated_duplicate_fieldnames() -> Result<()> { + test_file("1.0.0-littleendian", "generated_duplicate_fieldnames")?; + test_file("1.0.0-bigendian", "generated_duplicate_fieldnames") +} + #[test] fn read_generated_100_interval() -> Result<()> { test_file("1.0.0-littleendian", "generated_interval")?;