feat: patches uses a map in some cases (#1626)
See [this sheet for the data from
take_patches.rs](https://docs.google.com/spreadsheets/d/1D9vBZ1QJ6mwcIvV5wIL0hjGgVchcEnAyhvitqWu2ugU).
I measured on an M3 Max with 96 GiB of RAM running macOS 14.4. The chosen
threshold likely depends on the ISA.

Intuitively, repeated searching is `O(N_INDICES * lg N_PATCHES)` while
repeated map lookups are `O(N_INDICES + N_PATCHES)`. It seems to me that
the compiler & CPU would have trouble parallelizing search (via SIMD or
ILP) because of its branching, whereas map lookups are more obviously
parallelizable (e.g. SIMD hash computation). I'm not entirely sure why the
crossover point seems to be around N_PATCHES / N_INDICES = 5. I believe
the M3 Max has 128-bit vector registers, so if the indices are 32 bits wide,
index arithmetic could be 4-way parallel.
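
As a rough sanity check on that intuition (a sketch of mine, not part of the commit): equating the two cost expressions predicts that the map should win whenever N_PATCHES / N_INDICES < lg(N_PATCHES) - 1, which for the patch counts benchmarked here (tens to thousands) lands in the same ballpark as the measured threshold of 5.

fn prefer_map(n_patches: f64, n_indices: f64) -> bool {
    // Unit costs are assumed equal for a binary-search probe and a hash-map
    // operation; caches, SIMD, and branch prediction will perturb this.
    let search_cost = n_indices * n_patches.log2(); // one binary search per take index
    let map_cost = n_indices + n_patches; // build the map once, then O(1) lookups
    map_cost < search_cost // equivalently: n_patches / n_indices < lg(n_patches) - 1
}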
danking authored Dec 9, 2024
1 parent e8cd434 commit 0b93fe0
Showing 5 changed files with 236 additions and 11 deletions.
4 changes: 4 additions & 0 deletions vortex-array/Cargo.toml
@@ -83,3 +83,7 @@ harness = false
[[bench]]
name = "take_strings"
harness = false

[[bench]]
name = "take_patches"
harness = false
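
(Note: `harness = false` opts this target out of libtest's default benchmark harness, which is required so that Criterion's criterion_main! can supply the binary's main, exactly as the existing take_strings entry does.)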
71 changes: 71 additions & 0 deletions vortex-array/benches/take_patches.rs
@@ -0,0 +1,71 @@
#![allow(clippy::unwrap_used)]

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng as _};
use vortex_array::patches::Patches;
use vortex_array::ArrayData;

fn fixture(len: usize, sparsity: f64, rng: &mut StdRng) -> Patches {
    // NB: indices are always ordered
    let indices = (0..len)
        .filter(|_| rng.gen_bool(sparsity))
        .map(|x| x as u64)
        .collect::<Vec<u64>>();
    let sparse_len = indices.len();
    let values = ArrayData::from((0..sparse_len).map(|x| x as u64).collect::<Vec<_>>());
    Patches::new(len, ArrayData::from(indices), values)
}

fn indices(array_len: usize, n_indices: usize, rng: &mut StdRng) -> ArrayData {
    ArrayData::from(
        (0..n_indices)
            .map(|_| rng.gen_range(0..(array_len as u64)))
            .collect::<Vec<u64>>(),
    )
}

#[allow(clippy::cast_possible_truncation)]
fn bench_take(c: &mut Criterion) {
    let mut group = c.benchmark_group("bench_take");
    let mut rng = StdRng::seed_from_u64(0);

    for patches_sparsity in [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001] {
        let patches = fixture(65_535, patches_sparsity, &mut rng);
        for index_multiple in [1.0, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001] {
            let indices = indices(
                patches.array_len(),
                (patches.array_len() as f64 * index_multiple) as usize,
                &mut rng,
            );
            group.bench_with_input(
                BenchmarkId::from_parameter(format!(
                    "take_search: array_len={}, n_patches={} (~{}%), n_indices={} ({}%)",
                    patches.array_len(),
                    patches.num_patches(),
                    patches_sparsity * 100.0,
                    indices.len(),
                    index_multiple * 100.0
                )),
                &(&patches, &indices),
                |b, (patches, indices)| b.iter(|| patches.take_search(indices)),
            );
            group.bench_with_input(
                BenchmarkId::from_parameter(format!(
                    "take_map: array_len={}, n_patches={} (~{}%), n_indices={} ({}%)",
                    patches.array_len(),
                    patches.num_patches(),
                    patches_sparsity * 100.0,
                    indices.len(),
                    index_multiple * 100.0
                )),
                &(&patches, &indices),
                |b, (patches, indices)| b.iter(|| patches.take_map(indices)),
            );
        }
    }
    group.finish()
}

criterion_group!(benches, bench_take);
criterion_main!(benches);
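
Assuming a standard Criterion setup, the suite can then be run with `cargo bench --bench take_patches`. The parameter grid above is 7 sparsities × 9 index multiples = 63 configurations, each measured for both take_search and take_map.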
4 changes: 2 additions & 2 deletions vortex-array/benches/take_strings.rs
@@ -45,5 +45,5 @@ fn bench_varbinview(c: &mut Criterion) {
    });
}

criterion_group!(bench_take, bench_varbin, bench_varbinview);
criterion_main!(bench_take);
criterion_group!(bench_take_strings, bench_varbin, bench_varbinview);
criterion_main!(bench_take_strings);
86 changes: 81 additions & 5 deletions vortex-array/src/patches.rs
@@ -1,11 +1,15 @@
use std::fmt::Debug;
use std::hash::Hash;

use serde::{Deserialize, Serialize};
use vortex_dtype::Nullability::NonNullable;
use vortex_dtype::{match_each_integer_ptype, DType, PType};
use vortex_dtype::{
    match_each_integer_ptype, match_each_unsigned_integer_ptype, DType, NativeIndexPType, PType,
};
use vortex_error::{vortex_bail, VortexExpect, VortexResult};
use vortex_scalar::Scalar;

use crate::aliases::hash_map::HashMap;
use crate::array::PrimitiveArray;
use crate::compute::{
    scalar_at, search_sorted, search_sorted_usize, search_sorted_usize_many, slice,
@@ -220,15 +224,33 @@ impl Patches {
        Ok(Some(Self::new(stop - start, indices, values)))
    }

    // https://docs.google.com/spreadsheets/d/1D9vBZ1QJ6mwcIvV5wIL0hjGgVchcEnAyhvitqWu2ugU
    const PREFER_MAP_WHEN_PATCHES_OVER_INDICES_LESS_THAN: f64 = 5.0;

    fn is_map_faster_than_search(&self, take_indices: &ArrayData) -> bool {
        (self.num_patches() as f64 / take_indices.len() as f64)
            < Self::PREFER_MAP_WHEN_PATCHES_OVER_INDICES_LESS_THAN
    }

    /// Take the indices from the patches.
    pub fn take(&self, indices: &ArrayData) -> VortexResult<Option<Self>> {
        if indices.is_empty() {
    pub fn take(&self, take_indices: &ArrayData) -> VortexResult<Option<Self>> {
        if self.is_map_faster_than_search(take_indices) {
            self.take_map(take_indices)
        } else {
            self.take_search(take_indices)
        }
    }

    pub fn take_search(&self, take_indices: &ArrayData) -> VortexResult<Option<Self>> {
        let new_length = take_indices.len();

        if take_indices.is_empty() {
            return Ok(None);
        }

        // TODO(ngates): plenty of optimisations to be made here
        let take_indices =
            try_cast(indices, &DType::Primitive(PType::U64, NonNullable))?.into_primitive()?;
            try_cast(take_indices, &DType::Primitive(PType::U64, NonNullable))?.into_primitive()?;

        let (values_indices, new_indices): (Vec<u64>, Vec<u64>) = search_sorted_usize_many(
            self.indices(),
@@ -258,7 +280,61 @@ impl Patches {
            PrimitiveArray::from_vec(values_indices, Validity::NonNullable).into_array();
        let new_values = take(self.values(), values_indices)?;

        Ok(Some(Self::new(indices.len(), new_indices, new_values)))
        Ok(Some(Self::new(new_length, new_indices, new_values)))
    }

    pub fn take_map(&self, take_indices: &ArrayData) -> VortexResult<Option<Self>> {
        let take_indices = take_indices.clone().into_primitive()?;
        match_each_unsigned_integer_ptype!(self.indices_ptype(), |$INDICES| {
            let indices = self.indices.clone().into_primitive()?;
            let indices = indices.maybe_null_slice::<$INDICES>();
            match_each_unsigned_integer_ptype!(take_indices.ptype(), |$TAKE_INDICES| {
                let take_indices = take_indices.into_primitive()?;
                let take_indices = take_indices.maybe_null_slice::<$TAKE_INDICES>();
                return self.take_map_helper(indices, take_indices);
            });
        });
    }

    fn take_map_helper<I: NativeIndexPType + Hash + Eq, TI: NativeIndexPType>(
        &self,
        indices: &[I],
        take_indices: &[TI],
    ) -> VortexResult<Option<Self>> {
        let new_length = take_indices.len();
        let sparse_index_to_value_index: HashMap<I, usize> = indices
            .iter()
            .enumerate()
            .map(|(value_index, sparse_index)| (*sparse_index, value_index))
            .collect();
        let min_index = self.min_index()?;
        let max_index = self.max_index()?;
        let (new_sparse_indices, value_indices): (Vec<u64>, Vec<u64>) = take_indices
            .iter()
            .enumerate()
            // Take indices outside [min_index, max_index] cannot hit a patch; skip them.
            .filter(|(_, ti)| ti.as_usize() >= min_index && ti.as_usize() <= max_index)
            .filter_map(|(new_sparse_index, take_sparse_index)| {
                sparse_index_to_value_index
                    .get(&I::usize_as(take_sparse_index.as_usize()))
                    // FIXME(DK): should we choose a small index width or should the compressor do that?
                    .map(|value_index| (new_sparse_index as u64, *value_index as u64))
            })
            .unzip();

        if new_sparse_indices.is_empty() {
            return Ok(None);
        }

        Ok(Some(Patches::new(
            new_length,
            ArrayData::from(new_sparse_indices),
            take(self.values(), ArrayData::from(value_indices))?,
        )))
    }
}
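
To make the new strategy concrete (a standalone sketch of the same idea, with hypothetical names and u64 indices only, not code from this commit): take_search answers each take index with a binary search over the sorted patch indices, while the map path builds the reverse mapping once and then does O(1) lookups.

use std::collections::HashMap;

/// Sketch of the map-based take: build patch-index -> value-position once,
/// then answer each take index with a hash lookup instead of a binary search.
fn take_map_sketch(patch_indices: &[u64], take_indices: &[u64]) -> (Vec<u64>, Vec<u64>) {
    let position_of: HashMap<u64, u64> = patch_indices
        .iter()
        .enumerate()
        .map(|(pos, &idx)| (idx, pos as u64))
        .collect();
    take_indices
        .iter()
        .enumerate()
        .filter_map(|(new_idx, ti)| position_of.get(ti).map(|&pos| (new_idx as u64, pos)))
        .unzip()
}

fn main() {
    // Patches at positions 3 and 7; taking [1, 3, 7] keeps results 1 and 2.
    let (new_sparse, value_positions) = take_map_sketch(&[3, 7], &[1, 3, 7]);
    assert_eq!(new_sparse, vec![1, 2]);
    assert_eq!(value_positions, vec![0, 1]);
}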

82 changes: 78 additions & 4 deletions vortex-dtype/src/ptype.rs
@@ -83,6 +83,40 @@ pub trait NativePType:
    fn is_eq(self, other: Self) -> bool;
}

/// A trait for native Rust types that correspond 1:1 to a PType used for indexing.
///
/// Vortex assumes that every value used as an index is representable as both a usize and a u64.
pub trait NativeIndexPType: NativePType {
    /// Convert this value to a usize.
    ///
    /// # Safety
    ///
    /// 1. Assumes that the value is a valid index into an array on this machine.
    fn as_usize(self) -> usize;

    /// Convert this value to a u64.
    ///
    /// # Safety
    ///
    /// 1. Assumes that the value is a valid index into an array on this machine.
    /// 2. Assumes this machine's pointers are not larger than u64.
    fn as_u64(self) -> u64;

    /// Convert a usize to this type.
    ///
    /// # Safety
    ///
    /// 1. Assumes that the value is small enough to fit in this type.
    fn usize_as(value: usize) -> Self;

    /// Convert a u64 to this type.
    ///
    /// # Safety
    ///
    /// 1. Assumes that the value is small enough to fit in this type.
    fn u64_as(value: u64) -> Self;
}
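
To see why the trait earns its keep (my illustration, not part of the diff): it lets generic code accept any unsigned index width and normalize through usize without per-call-site casts. A self-contained mini version:

// Mini version of the NativeIndexPType idea: one generic gather routine
// that works for any unsigned index width.
trait AsIndex: Copy {
    fn as_usize(self) -> usize;
}

impl AsIndex for u16 {
    fn as_usize(self) -> usize {
        self as usize
    }
}

impl AsIndex for u64 {
    fn as_usize(self) -> usize {
        self as usize // assumes the value is a valid in-memory index
    }
}

fn gather<T: Copy, I: AsIndex>(values: &[T], indices: &[I]) -> Vec<T> {
    indices.iter().map(|i| values[i.as_usize()]).collect()
}

fn main() {
    let values = [10, 20, 30];
    assert_eq!(gather(&values, &[2u16, 0u16]), vec![30, 10]);
    assert_eq!(gather(&values, &[1u64]), vec![20]);
}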

macro_rules! native_ptype {
    ($T:ty, $ptype:tt, $arrow:tt) => {
        impl NativePType for $T {
@@ -104,6 +138,46 @@ macro_rules! native_ptype {
    };
}

macro_rules! native_index_ptype {
    ($T:ty, $ptype:tt, $arrow:tt) => {
        impl NativePType for $T {
            const PTYPE: PType = PType::$ptype;
            type ArrowPrimitiveType = $arrow;

            fn is_nan(self) -> bool {
                false
            }

            fn total_compare(self, other: Self) -> Ordering {
                self.cmp(&other)
            }

            fn is_eq(self, other: Self) -> bool {
                self == other
            }
        }

        #[allow(clippy::cast_possible_truncation)]
        impl NativeIndexPType for $T {
            fn as_usize(self) -> usize {
                self as usize
            }

            fn as_u64(self) -> u64 {
                self as u64
            }

            fn usize_as(value: usize) -> Self {
                value as Self
            }

            fn u64_as(value: u64) -> Self {
                value as Self
            }
        }
    };
}

macro_rules! native_float_ptype {
    ($T:ty, $ptype:tt, $arrow:tt) => {
        impl NativePType for $T {
@@ -125,10 +199,10 @@ macro_rules! native_float_ptype {
    };
}

native_ptype!(u8, U8, UInt8Type);
native_ptype!(u16, U16, UInt16Type);
native_ptype!(u32, U32, UInt32Type);
native_ptype!(u64, U64, UInt64Type);
native_index_ptype!(u8, U8, UInt8Type);
native_index_ptype!(u16, U16, UInt16Type);
native_index_ptype!(u32, U32, UInt32Type);
native_index_ptype!(u64, U64, UInt64Type);
native_ptype!(i8, I8, Int8Type);
native_ptype!(i16, I16, Int16Type);
native_ptype!(i32, I32, Int32Type);
