Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Improved perf of take
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao committed Dec 8, 2022
1 parent 3c9c081 commit c150c7c
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 40 deletions.
2 changes: 1 addition & 1 deletion src/array/binary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ pub use mutable::*;
/// assert_eq!(array.values_iter().collect::<Vec<_>>(), vec![[1, 2].as_ref(), &[], &[3]]);
/// // the underlying representation:
/// assert_eq!(array.values(), &Buffer::from(vec![1, 2, 3]));
/// assert_eq!(array.offsets(), &Buffer::from(vec![0, 2, 2, 3]));
/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 3]));
/// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true])));
/// ```
///
Expand Down
50 changes: 34 additions & 16 deletions src/array/specification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use crate::offset::{Offset, Offsets, OffsetsBuffer};
/// Helper trait to support `Offset` and `OffsetBuffer`
pub(crate) trait OffsetsContainer<O> {
fn last(&self) -> usize;
fn starts(&self) -> &[O];
fn as_slice(&self) -> &[O];
}

impl<O: Offset> OffsetsContainer<O> for OffsetsBuffer<O> {
Expand All @@ -14,9 +14,8 @@ impl<O: Offset> OffsetsContainer<O> for OffsetsBuffer<O> {
}

#[inline]
fn starts(&self) -> &[O] {
let last = self.buffer().len() - 1;
unsafe { self.buffer().get_unchecked(0..last) }
fn as_slice(&self) -> &[O] {
self.buffer()
}
}

Expand All @@ -27,10 +26,8 @@ impl<O: Offset> OffsetsContainer<O> for Offsets<O> {
}

#[inline]
fn starts(&self) -> &[O] {
let offsets = self.as_slice();
let last = offsets.len() - 1;
unsafe { offsets.get_unchecked(0..last) }
fn as_slice(&self) -> &[O] {
self.as_slice()
}
}

Expand All @@ -52,6 +49,10 @@ pub(crate) fn try_check_utf8<O: Offset, C: OffsetsContainer<O>>(
offsets: &C,
values: &[u8],
) -> Result<()> {
if offsets.as_slice().len() == 1 {
return Ok(());
}

try_check_offsets_bounds(offsets, values.len())?;

if values.is_ascii() {
Expand All @@ -60,17 +61,34 @@ pub(crate) fn try_check_utf8<O: Offset, C: OffsetsContainer<O>>(
simdutf8::basic::from_utf8(values)?;

// offsets can be == values.len()
// let's find first offset from the end that is different
let starts = offsets.starts();
let last = starts
// find first offset from the end that is smaller
// Example:
// values.len() = 10
// offsets = [0, 5, 10, 10]
let offsets = offsets.as_slice();
let last = offsets
.iter()
.rev()
.enumerate()
.find_map(|(i, offset)| (offset.to_usize() != values.len()).then(|| i + 1))
.unwrap_or(starts.len() - 1);
.skip(1)
.rev()
.find_map(|(i, offset)| (offset.to_usize() < values.len()).then(|| i));

let last = if let Some(last) = last {
// following the example: last = 1 (offset = 5)
last
} else {
// given `l = values.len()`, this branch is hit iff either:
// * `offsets = [0, l, l, ...]`, which was covered by `from_utf8(values)` above
// * `offsets = [0]`, which never happens because offsets.as_slice().len() == 1 is short-circuited above
return Ok(());
};

// trucate to relevant offsets. Note: `=last` because last was computed skipping the first item
// following the example: starts = [0, 5]
let starts = unsafe { offsets.get_unchecked(..=last) };

let mut any_invalid = false;
for start in &starts[..=last] {
for start in starts {
let start = start.to_usize();

// Safety: `try_check_offsets_bounds` just checked for bounds
Expand Down Expand Up @@ -117,7 +135,7 @@ mod tests {

proptest! {
// a bit expensive, feel free to run it when changing the code above
//#![proptest_config(ProptestConfig::with_cases(100000))]
// #![proptest_config(ProptestConfig::with_cases(100000))]
#[test]
#[cfg_attr(miri, ignore)] // miri and proptest do not work well
fn check_utf8_validation(values in binary_strategy()) {
Expand Down
2 changes: 1 addition & 1 deletion src/array/utf8/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ impl<T: AsRef<str>> AsRef<[u8]> for StrAsBytes<T> {
/// // the underlying representation
/// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true])));
/// assert_eq!(array.values(), &Buffer::from(b"hithere".to_vec()));
/// assert_eq!(array.offsets(), &Buffer::from(vec![0, 2, 2, 2 + 5]));
/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 2 + 5]));
/// # }
/// ```
///
Expand Down
33 changes: 11 additions & 22 deletions src/compute/take/generic_binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ pub fn take_values<O: Offset>(
let mut buffer = Vec::with_capacity(new_len);
starts
.iter()
.zip(offsets.buffer().windows(2))
.for_each(|(start_, window)| {
let start = start_.to_usize();
let end = (*start_ + (window[1] - window[0])).to_usize();
.map(|start| start.to_usize())
.zip(offsets.lengths())
.for_each(|(start, length)| {
let end = start + length;
buffer.extend_from_slice(&values[start..end]);
});
buffer.into()
Expand All @@ -32,27 +32,16 @@ pub fn take_no_validity<O: Offset, I: Index>(
values: &[u8],
indices: &[I],
) -> (OffsetsBuffer<O>, Buffer<u8>, Option<Bitmap>) {
let mut length = O::zero();
let mut buffer = Vec::<u8>::new();
let offsets = offsets.buffer();
let offsets = indices.iter().map(|index| {
let index = index.to_usize();
let start = offsets[index];
let length_h = offsets[index + 1] - start;
length += length_h;

let _start = start.to_usize();
let end = (start + length_h).to_usize();
buffer.extend_from_slice(&values[_start..end]);
length
let lengths = indices.iter().map(|index| index.to_usize()).map(|index| {
let (start, end) = offsets.start_end(index);
// todo: remove this bound check
buffer.extend_from_slice(&values[start..end]);
end - start
});
let offsets = std::iter::once(O::zero())
.chain(offsets)
.collect::<Vec<_>>();
// Safety: offsets _are_ monotonically increasing
let offsets = unsafe { Offsets::new_unchecked(offsets) }.into();
let offsets = Offsets::try_from_lengths(lengths).expect("");

(offsets, buffer.into(), None)
(offsets.into(), buffer.into(), None)
}

// take implementation when only values contain nulls
Expand Down
6 changes: 6 additions & 0 deletions src/offset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,12 @@ impl<O: Offset> OffsetsBuffer<O> {
Self(self.0.slice_unchecked(offset, length))
}

/// Returns an iterator with the lengths of the offsets
#[inline]
pub fn lengths(&self) -> impl Iterator<Item = usize> + '_ {
self.0.windows(2).map(|w| (w[1] - w[0]).to_usize())
}

/// Returns the inner [`Buffer`].
#[inline]
pub fn into_inner(self) -> Buffer<O> {
Expand Down
6 changes: 6 additions & 0 deletions tests/it/io/ipc/read/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,12 @@ fn read_generated_100_decimal() -> Result<()> {
test_file("1.0.0-bigendian", "generated_decimal")
}

#[test]
fn read_generated_duplicate_fieldnames() -> Result<()> {
test_file("1.0.0-littleendian", "generated_duplicate_fieldnames")?;
test_file("1.0.0-bigendian", "generated_duplicate_fieldnames")
}

#[test]
fn read_generated_100_interval() -> Result<()> {
test_file("1.0.0-littleendian", "generated_interval")?;
Expand Down

0 comments on commit c150c7c

Please sign in to comment.