Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: lazy cache binview bytes len #13830

Merged
merged 1 commit into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion crates/polars-arrow/src/array/binview/ffi.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

use polars_error::PolarsResult;
Expand Down Expand Up @@ -48,7 +49,7 @@ unsafe impl<T: ViewType + ?Sized> ToFfi for BinaryViewArrayGeneric<T> {
buffers: self.buffers.clone(),
raw_buffers: self.raw_buffers.clone(),
phantom: Default::default(),
total_bytes_len: self.total_bytes_len,
total_bytes_len: AtomicU64::new(self.total_bytes_len.load(Ordering::Relaxed)),
total_buffer_len: self.total_buffer_len,
}
}
Expand Down
26 changes: 18 additions & 8 deletions crates/polars-arrow/src/array/binview/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ mod view;
use std::any::Any;
use std::fmt::Debug;
use std::marker::PhantomData;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

use polars_error::*;
Expand Down Expand Up @@ -112,7 +113,7 @@ pub struct BinaryViewArrayGeneric<T: ViewType + ?Sized> {
validity: Option<Bitmap>,
phantom: PhantomData<T>,
/// Total bytes length if we would concatenate them all.
total_bytes_len: usize,
total_bytes_len: AtomicU64,
/// Total bytes in the buffer (excluding remaining capacity)
total_buffer_len: usize,
}
Expand All @@ -132,7 +133,7 @@ impl<T: ViewType + ?Sized> Clone for BinaryViewArrayGeneric<T> {
raw_buffers: self.raw_buffers.clone(),
validity: self.validity.clone(),
phantom: Default::default(),
total_bytes_len: self.total_bytes_len,
total_bytes_len: AtomicU64::new(self.total_bytes_len.load(Ordering::Relaxed)),
total_buffer_len: self.total_buffer_len,
}
}
Expand All @@ -147,6 +148,7 @@ fn buffers_into_raw<T>(buffers: &[Buffer<T>]) -> Arc<[(*const T, usize)]> {
.map(|buf| (buf.storage_ptr(), buf.len()))
.collect()
}
/// Sentinel stored in the `total_bytes_len` atomic meaning "not computed yet".
///
/// `total_bytes_len()` compares the cached value against this constant and, on a match,
/// recomputes the total and writes it back. The sentinel is stored again after the views
/// are sliced, which invalidates the cached total.
// NOTE(review): call sites that `.load()` the field directly instead of going through
// `total_bytes_len()` can observe this sentinel — confirm each of them handles it.
const UNKNOWN_LEN: u64 = u64::MAX;

impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
/// # Safety
Expand All @@ -169,7 +171,7 @@ impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
raw_buffers,
validity,
phantom: Default::default(),
total_bytes_len,
total_bytes_len: AtomicU64::new(total_bytes_len as u64),
total_buffer_len,
}
}
Expand Down Expand Up @@ -336,7 +338,14 @@ impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {

/// Get the total length of bytes that it would take to concatenate all binary/str values in this array.
///
/// The total is cached in an atomic field. When the cache holds `UNKNOWN_LEN`, the value is
/// recomputed by summing the lengths yielded by `len_iter()` and written back to the cache;
/// otherwise the cached value is returned as-is.
pub fn total_bytes_len(&self) -> usize {
// NOTE(review): this text is a rendered diff (hunk header reads `-336,7 +338,14`); the bare
// field read on the next line appears to be the removed pre-change body — confirm.
self.total_bytes_len
// Both the load and the store below use `Ordering::Relaxed`.
let total = self.total_bytes_len.load(Ordering::Relaxed);
if total == UNKNOWN_LEN {
// Cache miss: sum the per-value lengths.
let total = self.len_iter().map(|v| v as usize).sum::<usize>();
// Write the result back so the next call takes the `else` branch.
self.total_bytes_len.store(total as u64, Ordering::Relaxed);
total
} else {
// Cache hit: the stored value is the total.
total as usize
}
}

/// Get the length of bytes that are stored in the variadic buffers.
Expand Down Expand Up @@ -367,8 +376,9 @@ impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
if self.total_buffer_len == 0 {
return self;
}
let total_bytes_len = self.total_bytes_len.load(Ordering::Relaxed) as usize;
// Subtract the maximum amount of inlined strings.
let min_in_buffer = self.total_bytes_len.saturating_sub(self.len() * 12);
let min_in_buffer = total_bytes_len.saturating_sub(self.len() * 12);
let frac = (min_in_buffer as f64) / ((self.total_buffer_len() + 1) as f64);

if frac < 0.25 {
Expand Down Expand Up @@ -400,7 +410,7 @@ impl BinaryViewArray {
self.views.clone(),
self.buffers.clone(),
self.validity.clone(),
self.total_bytes_len,
self.total_bytes_len.load(Ordering::Relaxed) as usize,
self.total_buffer_len,
)
}
Expand All @@ -415,7 +425,7 @@ impl Utf8ViewArray {
self.views.clone(),
self.buffers.clone(),
self.validity.clone(),
self.total_bytes_len,
self.total_bytes_len.load(Ordering::Relaxed) as usize,
self.total_buffer_len,
)
}
Expand Down Expand Up @@ -460,7 +470,7 @@ impl<T: ViewType + ?Sized> Array for BinaryViewArrayGeneric<T> {
.map(|bitmap| bitmap.sliced_unchecked(offset, length))
.filter(|bitmap| bitmap.unset_bits() > 0);
self.views.slice_unchecked(offset, length);
self.total_bytes_len = self.len_iter().map(|v| v as usize).sum::<usize>();
self.total_bytes_len.store(UNKNOWN_LEN, Ordering::Relaxed)
}

fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
Expand Down
Loading