From e9bf8aa6bf67ec192fce1a6f3e7ab604c9689fef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20Heres?=
Date: Thu, 7 Nov 2024 23:51:41 +0100
Subject: [PATCH] Speed up `filter_bytes` (#6699)

* Use vec

* Use extend, fix capacity
---
 arrow-data/src/transform/variable_size.rs |  2 +-
 arrow-select/src/filter.rs                | 21 +++++++++++----------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/arrow-data/src/transform/variable_size.rs b/arrow-data/src/transform/variable_size.rs
index fa1592d973ed..ec0174bf8cb2 100644
--- a/arrow-data/src/transform/variable_size.rs
+++ b/arrow-data/src/transform/variable_size.rs
@@ -34,7 +34,7 @@ fn extend_offset_values<T: ArrowNativeType + AsPrimitive<usize>>(
     len: usize,
 ) {
     let start_values = offsets[start].as_();
-    let end_values = offsets[start + len].as_();
+    let end_values: usize = offsets[start + len].as_();
     let new_values = &values[start_values..end_values];
     buffer.extend_from_slice(new_values);
 }
diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs
index d96dad2f1154..451b044859c4 100644
--- a/arrow-select/src/filter.rs
+++ b/arrow-select/src/filter.rs
@@ -582,7 +582,6 @@ fn filter_native<T: ArrowNativeType>(values: &[T], predicate: &FilterPredicate)
         }
         IterationStrategy::Indices(indices) => {
             let iter = indices.iter().map(|x| values[*x]);
-
             // SAFETY: `Vec::iter` is trusted length
             unsafe { MutableBuffer::from_trusted_len_iter(iter) }
         }
@@ -618,8 +617,8 @@ where
 struct FilterBytes<'a, OffsetSize> {
     src_offsets: &'a [OffsetSize],
     src_values: &'a [u8],
-    dst_offsets: MutableBuffer,
-    dst_values: MutableBuffer,
+    dst_offsets: Vec<OffsetSize>,
+    dst_values: Vec<u8>,
     cur_offset: OffsetSize,
 }

@@ -631,10 +630,10 @@ where
     where
         T: ByteArrayType<Offset = OffsetSize>,
     {
-        let num_offsets_bytes = (capacity + 1) * std::mem::size_of::<OffsetSize>();
-        let mut dst_offsets = MutableBuffer::new(num_offsets_bytes);
-        let dst_values = MutableBuffer::new(0);
+        let dst_values = Vec::new();
+        let mut dst_offsets: Vec<OffsetSize> = Vec::with_capacity(capacity + 1);
         let cur_offset = OffsetSize::from_usize(0).unwrap();
+
         dst_offsets.push(cur_offset);

         Self {
@@ -664,13 +663,15 @@ where

     /// Extends the in-progress array by the indexes in the provided iterator
     fn extend_idx(&mut self, iter: impl Iterator<Item = usize>) {
-        for idx in iter {
-            let (start, end, len) = self.get_value_range(idx);
+        self.dst_offsets.extend(iter.map(|idx| {
+            let start = self.src_offsets[idx].as_usize();
+            let end = self.src_offsets[idx + 1].as_usize();
+            let len = OffsetSize::from_usize(end - start).expect("illegal offset range");
             self.cur_offset += len;
-            self.dst_offsets.push(self.cur_offset);
             self.dst_values
                 .extend_from_slice(&self.src_values[start..end]);
-        }
+            self.cur_offset
+        }));
     }

     /// Extends the in-progress array by the ranges in the provided iterator