feat: patches uses a map in some cases (#1626)
See [this sheet for the data from
take_patches.rs](https://docs.google.com/spreadsheets/d/1D9vBZ1QJ6mwcIvV5wIL0hjGgVchcEnAyhvitqWu2ugU).
I measured on an M3 Max with 96 GiB of RAM running macOS 14.4. The chosen
threshold likely depends on the ISA.

Intuitively, repeated searching is `O(N_INDICES * lg N_PATCHES)` while
repeated map lookups are `O(N_INDICES + N_PATCHES)`. It seems to me that
the compiler & CPU would have trouble parallelizing search (via SIMD or
ILP) because of its branching, whereas map lookups are more obviously
parallelizable (e.g. SIMD hash computation). I'm not entirely sure why the
crossover point seems to be around N_PATCHES / N_INDICES = 5. I believe
the M3 Max has 128-bit vector registers, so if the indices are 32 bits wide,
index arithmetic could be 4-way parallel.
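
As a rough sanity check on that intuition (a sketch of mine, not part of the commit): equating the two cost expressions predicts that the map should win whenever N_PATCHES / N_INDICES < lg(N_PATCHES) - 1, which for the patch counts benchmarked here (tens to thousands) lands in the same ballpark as the measured threshold of 5.

fn prefer_map(n_patches: f64, n_indices: f64) -> bool {
    // Unit costs are assumed equal for a binary-search probe and a hash-map
    // operation; caches, SIMD, and branch prediction will perturb this.
    let search_cost = n_indices * n_patches.log2(); // one binary search per take index
    let map_cost = n_indices + n_patches; // build the map once, then O(1) lookups
    map_cost < search_cost // equivalently: n_patches / n_indices < lg(n_patches) - 1
}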
danking authored Dec 9, 2024
1 parent e8cd434 commit 0b93fe0
Showing 5 changed files with 236 additions and 11 deletions.
4 changes: 4 additions & 0 deletions vortex-array/Cargo.toml
@@ -83,3 +83,7 @@ harness = false
[[bench]]
name = "take_strings"
harness = false

[[bench]]
name = "take_patches"
harness = false
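
(Note: `harness = false` opts this target out of libtest's default benchmark harness, which is required so that Criterion's criterion_main! can supply the binary's main, exactly as the existing take_strings entry does.)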
71 changes: 71 additions & 0 deletions vortex-array/benches/take_patches.rs
@@ -0,0 +1,71 @@
#![allow(clippy::unwrap_used)]

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng as _};
use vortex_array::patches::Patches;
use vortex_array::ArrayData;

fn fixture(len: usize, sparsity: f64, rng: &mut StdRng) -> Patches {
    // NB: indices are always ordered
    let indices = (0..len)
        .filter(|_| rng.gen_bool(sparsity))
        .map(|x| x as u64)
        .collect::<Vec<u64>>();
    let sparse_len = indices.len();
    let values = ArrayData::from((0..sparse_len).map(|x| x as u64).collect::<Vec<_>>());
    Patches::new(len, ArrayData::from(indices), values)
}

fn indices(array_len: usize, n_indices: usize, rng: &mut StdRng) -> ArrayData {
    ArrayData::from(
        (0..n_indices)
            .map(|_| rng.gen_range(0..(array_len as u64)))
            .collect::<Vec<u64>>(),
    )
}

#[allow(clippy::cast_possible_truncation)]
fn bench_take(c: &mut Criterion) {
    let mut group = c.benchmark_group("bench_take");
    let mut rng = StdRng::seed_from_u64(0);

    for patches_sparsity in [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001] {
        let patches = fixture(65_535, patches_sparsity, &mut rng);
        for index_multiple in [1.0, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001] {
            let indices = indices(
                patches.array_len(),
                (patches.array_len() as f64 * index_multiple) as usize,
                &mut rng,
            );
            group.bench_with_input(
                BenchmarkId::from_parameter(format!(
                    "take_search: array_len={}, n_patches={} (~{}%), n_indices={} ({}%)",
                    patches.array_len(),
                    patches.num_patches(),
                    patches_sparsity * 100.0,
                    indices.len(),
                    index_multiple * 100.0
                )),
                &(&patches, &indices),
                |b, (patches, indices)| b.iter(|| patches.take_search(indices)),
            );
            group.bench_with_input(
                BenchmarkId::from_parameter(format!(
                    "take_map: array_len={}, n_patches={} (~{}%), n_indices={} ({}%)",
                    patches.array_len(),
                    patches.num_patches(),
                    patches_sparsity * 100.0,
                    indices.len(),
                    index_multiple * 100.0
                )),
                &(&patches, &indices),
                |b, (patches, indices)| b.iter(|| patches.take_map(indices)),
            );
        }
    }
    group.finish()
}

criterion_group!(benches, bench_take);
criterion_main!(benches);
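
Assuming a standard Criterion setup, the suite can then be run with `cargo bench --bench take_patches`. The parameter grid above is 7 sparsities × 9 index multiples = 63 configurations, each measured for both take_search and take_map.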
4 changes: 2 additions & 2 deletions vortex-array/benches/take_strings.rs
@@ -45,5 +45,5 @@ fn bench_varbinview(c: &mut Criterion) {
    });
}

criterion_group!(bench_take, bench_varbin, bench_varbinview);
criterion_main!(bench_take);
criterion_group!(bench_take_strings, bench_varbin, bench_varbinview);
criterion_main!(bench_take_strings);
86 changes: 81 additions & 5 deletions vortex-array/src/patches.rs
@@ -1,11 +1,15 @@
use std::fmt::Debug;
use std::hash::Hash;

use serde::{Deserialize, Serialize};
use vortex_dtype::Nullability::NonNullable;
use vortex_dtype::{match_each_integer_ptype, DType, PType};
use vortex_dtype::{
    match_each_integer_ptype, match_each_unsigned_integer_ptype, DType, NativeIndexPType, PType,
};
use vortex_error::{vortex_bail, VortexExpect, VortexResult};
use vortex_scalar::Scalar;

use crate::aliases::hash_map::HashMap;
use crate::array::PrimitiveArray;
use crate::compute::{
    scalar_at, search_sorted, search_sorted_usize, search_sorted_usize_many, slice,
@@ -220,15 +224,33 @@ impl Patches {
        Ok(Some(Self::new(stop - start, indices, values)))
    }

    // https://docs.google.com/spreadsheets/d/1D9vBZ1QJ6mwcIvV5wIL0hjGgVchcEnAyhvitqWu2ugU
    const PREFER_MAP_WHEN_PATCHES_OVER_INDICES_LESS_THAN: f64 = 5.0;

    fn is_map_faster_than_search(&self, take_indices: &ArrayData) -> bool {
        (self.num_patches() as f64 / take_indices.len() as f64)
            < Self::PREFER_MAP_WHEN_PATCHES_OVER_INDICES_LESS_THAN
    }

    /// Take the indices from the patches.
    pub fn take(&self, indices: &ArrayData) -> VortexResult<Option<Self>> {
        if indices.is_empty() {
    pub fn take(&self, take_indices: &ArrayData) -> VortexResult<Option<Self>> {
        if self.is_map_faster_than_search(take_indices) {
            self.take_map(take_indices)
        } else {
            self.take_search(take_indices)
        }
    }

    pub fn take_search(&self, take_indices: &ArrayData) -> VortexResult<Option<Self>> {
        let new_length = take_indices.len();

        if take_indices.is_empty() {
            return Ok(None);
        }

        // TODO(ngates): plenty of optimisations to be made here
        let take_indices =
            try_cast(indices, &DType::Primitive(PType::U64, NonNullable))?.into_primitive()?;
            try_cast(take_indices, &DType::Primitive(PType::U64, NonNullable))?.into_primitive()?;

        let (values_indices, new_indices): (Vec<u64>, Vec<u64>) = search_sorted_usize_many(
            self.indices(),
@@ -258,7 +280,61 @@ impl Patches {
            PrimitiveArray::from_vec(values_indices, Validity::NonNullable).into_array();
        let new_values = take(self.values(), values_indices)?;

        Ok(Some(Self::new(indices.len(), new_indices, new_values)))
        Ok(Some(Self::new(new_length, new_indices, new_values)))
    }

    pub fn take_map(&self, take_indices: &ArrayData) -> VortexResult<Option<Self>> {
        let take_indices = take_indices.clone().into_primitive()?;
        match_each_unsigned_integer_ptype!(self.indices_ptype(), |$INDICES| {
            let indices = self.indices.clone().into_primitive()?;
            let indices = indices.maybe_null_slice::<$INDICES>();
            match_each_unsigned_integer_ptype!(take_indices.ptype(), |$TAKE_INDICES| {
                let take_indices = take_indices.into_primitive()?;
                let take_indices = take_indices.maybe_null_slice::<$TAKE_INDICES>();
                return self.take_map_helper(indices, take_indices);
            });
        });
    }

    fn take_map_helper<I: NativeIndexPType + Hash + Eq, TI: NativeIndexPType>(
        &self,
        indices: &[I],
        take_indices: &[TI],
    ) -> VortexResult<Option<Self>> {
        let new_length = take_indices.len();
        let sparse_index_to_value_index: HashMap<I, usize> = indices
            .iter()
            .enumerate()
            .map(|(value_index, sparse_index)| (*sparse_index, value_index))
            .collect();
        let min_index = self.min_index()?;
        let max_index = self.max_index()?;
        let (new_sparse_indices, value_indices): (Vec<u64>, Vec<u64>) = take_indices
            .iter()
            .enumerate()
            // Take indices outside [min_index, max_index] cannot hit a patch; skip them.
            .filter(|(_, ti)| ti.as_usize() >= min_index && ti.as_usize() <= max_index)
            .filter_map(|(new_sparse_index, take_sparse_index)| {
                sparse_index_to_value_index
                    .get(&I::usize_as(take_sparse_index.as_usize()))
                    // FIXME(DK): should we choose a small index width or should the compressor do that?
                    .map(|value_index| (new_sparse_index as u64, *value_index as u64))
            })
            .unzip();

        if new_sparse_indices.is_empty() {
            return Ok(None);
        }

        Ok(Some(Patches::new(
            new_length,
            ArrayData::from(new_sparse_indices),
            take(self.values(), ArrayData::from(value_indices))?,
        )))
    }
}
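
To make the new strategy concrete (a standalone sketch of the same idea, with hypothetical names and u64 indices only, not code from this commit): take_search answers each take index with a binary search over the sorted patch indices, while the map path builds the reverse mapping once and then does O(1) lookups.

use std::collections::HashMap;

/// Sketch of the map-based take: build patch-index -> value-position once,
/// then answer each take index with a hash lookup instead of a binary search.
fn take_map_sketch(patch_indices: &[u64], take_indices: &[u64]) -> (Vec<u64>, Vec<u64>) {
    let position_of: HashMap<u64, u64> = patch_indices
        .iter()
        .enumerate()
        .map(|(pos, &idx)| (idx, pos as u64))
        .collect();
    take_indices
        .iter()
        .enumerate()
        .filter_map(|(new_idx, ti)| position_of.get(ti).map(|&pos| (new_idx as u64, pos)))
        .unzip()
}

fn main() {
    // Patches at positions 3 and 7; taking [1, 3, 7] keeps results 1 and 2.
    let (new_sparse, value_positions) = take_map_sketch(&[3, 7], &[1, 3, 7]);
    assert_eq!(new_sparse, vec![1, 2]);
    assert_eq!(value_positions, vec![0, 1]);
}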

82 changes: 78 additions & 4 deletions vortex-dtype/src/ptype.rs
@@ -83,6 +83,40 @@ pub trait NativePType:
    fn is_eq(self, other: Self) -> bool;
}

/// A trait for native Rust types that correspond 1:1 to a PType used for indexing.
///
/// Vortex assumes that every value used as an index is representable as both a usize and a u64.
pub trait NativeIndexPType: NativePType {
    /// Convert this value to a usize.
    ///
    /// # Safety
    ///
    /// 1. Assumes that the value is a valid index into an array on this machine.
    fn as_usize(self) -> usize;

    /// Convert this value to a u64.
    ///
    /// # Safety
    ///
    /// 1. Assumes that the value is a valid index into an array on this machine.
    /// 2. Assumes this machine's pointers are not larger than u64.
    fn as_u64(self) -> u64;

    /// Convert a usize to this type.
    ///
    /// # Safety
    ///
    /// 1. Assumes that the value is small enough to fit in this type.
    fn usize_as(value: usize) -> Self;

    /// Convert a u64 to this type.
    ///
    /// # Safety
    ///
    /// 1. Assumes that the value is small enough to fit in this type.
    fn u64_as(value: u64) -> Self;
}
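
To see why the trait earns its keep (my illustration, not part of the diff): it lets generic code accept any unsigned index width and normalize through usize without per-call-site casts. A self-contained mini version:

// Mini version of the NativeIndexPType idea: one generic gather routine
// that works for any unsigned index width.
trait AsIndex: Copy {
    fn as_usize(self) -> usize;
}

impl AsIndex for u16 {
    fn as_usize(self) -> usize {
        self as usize
    }
}

impl AsIndex for u64 {
    fn as_usize(self) -> usize {
        self as usize // assumes the value is a valid in-memory index
    }
}

fn gather<T: Copy, I: AsIndex>(values: &[T], indices: &[I]) -> Vec<T> {
    indices.iter().map(|i| values[i.as_usize()]).collect()
}

fn main() {
    let values = [10, 20, 30];
    assert_eq!(gather(&values, &[2u16, 0u16]), vec![30, 10]);
    assert_eq!(gather(&values, &[1u64]), vec![20]);
}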

macro_rules! native_ptype {
    ($T:ty, $ptype:tt, $arrow:tt) => {
        impl NativePType for $T {
@@ -104,6 +138,46 @@ macro_rules! native_ptype {
    };
}

macro_rules! native_index_ptype {
    ($T:ty, $ptype:tt, $arrow:tt) => {
        impl NativePType for $T {
            const PTYPE: PType = PType::$ptype;
            type ArrowPrimitiveType = $arrow;

            fn is_nan(self) -> bool {
                false
            }

            fn total_compare(self, other: Self) -> Ordering {
                self.cmp(&other)
            }

            fn is_eq(self, other: Self) -> bool {
                self == other
            }
        }

        #[allow(clippy::cast_possible_truncation)]
        impl NativeIndexPType for $T {
            fn as_usize(self) -> usize {
                self as usize
            }

            fn as_u64(self) -> u64 {
                self as u64
            }

            fn usize_as(value: usize) -> Self {
                value as Self
            }

            fn u64_as(value: u64) -> Self {
                value as Self
            }
        }
    };
}

macro_rules! native_float_ptype {
    ($T:ty, $ptype:tt, $arrow:tt) => {
        impl NativePType for $T {
@@ -125,10 +199,10 @@ macro_rules! native_float_ptype {
    };
}

native_ptype!(u8, U8, UInt8Type);
native_ptype!(u16, U16, UInt16Type);
native_ptype!(u32, U32, UInt32Type);
native_ptype!(u64, U64, UInt64Type);
native_index_ptype!(u8, U8, UInt8Type);
native_index_ptype!(u16, U16, UInt16Type);
native_index_ptype!(u32, U32, UInt32Type);
native_index_ptype!(u64, U64, UInt64Type);
native_ptype!(i8, I8, Int8Type);
native_ptype!(i16, I16, Int16Type);
native_ptype!(i32, I32, Int32Type);
