Skip to content

Commit

Permalink
update termmap benchmark (#2040)
Browse files Browse the repository at this point in the history
  • Loading branch information
PSeitz authored May 12, 2023
1 parent fedd955 commit 00c5df6
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 8 deletions.
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,11 @@ opt-level = 3
debug = false
debug-assertions = false

[profile.bench]
opt-level = 3
debug = true
debug-assertions = false

[profile.test]
debug-assertions = true
overflow-checks = true
Expand Down
3 changes: 1 addition & 2 deletions stacker/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ path = "example/hashmap.rs"
rand = "0.8.5"
zipf = "7.0.0"
criterion = "0.4.0"

rustc-hash = "1.1.0"

[features]
unstable = [] # useful for benches.

92 changes: 86 additions & 6 deletions stacker/benches/crit_bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ extern crate criterion;

use criterion::*;
use rand::SeedableRng;
use tantivy_stacker::ArenaHashMap;
use rustc_hash::FxHashMap;
use tantivy_stacker::{ArenaHashMap, ExpUnrolledLinkedList, MemoryArena};

const ALICE: &str = include_str!("../../benches/alice.txt");

Expand All @@ -13,15 +14,37 @@ fn bench_hashmap_throughput(c: &mut Criterion) {
let mut group = c.benchmark_group("CreateHashMap");
group.plot_config(plot_config);

let input_name = "alice";
let input_bytes = ALICE.len() as u64;
let alice_terms_as_bytes: Vec<&[u8]> = ALICE
.split_ascii_whitespace()
.map(|el| el.as_bytes())
.collect();

group.throughput(Throughput::Bytes(input_bytes));

group.bench_with_input(
BenchmarkId::new(input_name.to_string(), input_bytes),
&ALICE,
|b, i| b.iter(|| create_hash_map(i.split_whitespace().map(|el| el.as_bytes()))),
BenchmarkId::new("alice".to_string(), input_bytes),
&alice_terms_as_bytes,
|b, i| b.iter(|| create_hash_map(i.iter())),
);
group.bench_with_input(
BenchmarkId::new("alice_expull".to_string(), input_bytes),
&alice_terms_as_bytes,
|b, i| b.iter(|| create_hash_map_with_expull(i.iter())),
);

group.bench_with_input(
BenchmarkId::new("alice_fx_hashmap_ref_expull".to_string(), input_bytes),
&alice_terms_as_bytes,
|b, i| b.iter(|| create_fx_hash_ref_map_with_expull(i.iter().cloned())),
);

group.bench_with_input(
BenchmarkId::new("alice_fx_hashmap_owned_expull".to_string(), input_bytes),
&alice_terms_as_bytes,
|b, i| b.iter(|| create_fx_hash_owned_map_with_expull(i.iter().cloned())),
);

// numbers
let input_bytes = 1_000_000 * 8 as u64;
group.throughput(Throughput::Bytes(input_bytes));
Expand Down Expand Up @@ -50,8 +73,21 @@ fn bench_hashmap_throughput(c: &mut Criterion) {
group.finish();
}

const HASHMAP_SIZE: usize = 1 << 15;

/// Minimal recorder used by the benchmark: it keeps nothing but doc ids,
/// appended to an `ExpUnrolledLinkedList`.
#[derive(Default, Copy, Clone)]
pub struct DocIdRecorder {
    stack: ExpUnrolledLinkedList,
}

impl DocIdRecorder {
    /// Appends `doc` to this recorder's list by obtaining a writer over
    /// `arena` and calling `write_u32_vint` on it.
    fn new_doc(&mut self, doc: u32, arena: &mut MemoryArena) {
        let mut writer = self.stack.writer(arena);
        writer.write_u32_vint(doc);
    }
}

fn create_hash_map<'a, T: AsRef<[u8]>>(terms: impl Iterator<Item = T>) -> ArenaHashMap {
let mut map = ArenaHashMap::with_capacity(4);
let mut map = ArenaHashMap::with_capacity(HASHMAP_SIZE);
for term in terms {
map.mutate_or_create(term.as_ref(), |val| {
if let Some(mut val) = val {
Expand All @@ -66,5 +102,49 @@ fn create_hash_map<'a, T: AsRef<[u8]>>(terms: impl Iterator<Item = T>) -> ArenaH
map
}

/// Builds an `ArenaHashMap` that maps every term to a [`DocIdRecorder`].
///
/// The terms are enumerated and, for each one, its enumeration index is
/// recorded (cast to `u32`) into the term's recorder. The recorders write
/// into a `MemoryArena` that is local to this function and dropped on return;
/// only the map itself is handed back to the caller.
///
/// Every occurrence of a term is recorded, including the first one, so that
/// this benchmark performs the same amount of work per term as the
/// `FxHashMap` based variants, which push an entry on every occurrence.
fn create_hash_map_with_expull<'a, T: AsRef<[u8]>>(terms: impl Iterator<Item = T>) -> ArenaHashMap {
    let mut memory_arena = MemoryArena::default();
    let mut map = ArenaHashMap::with_capacity(HASHMAP_SIZE);
    for (i, term) in terms.enumerate() {
        map.mutate_or_create(term.as_ref(), |val: Option<DocIdRecorder>| {
            // Start from the existing recorder, or a fresh one for a term seen
            // for the first time, and record the position in both cases.
            // Previously a new term got an empty recorder and its first
            // position was silently dropped.
            let mut rec = val.unwrap_or_default();
            rec.new_doc(i as u32, &mut memory_arena);
            rec
        });
    }

    map
}

/// Baseline for the benchmark: groups term positions in an `FxHashMap` whose
/// keys are the borrowed `&'static [u8]` terms themselves.
///
/// Each term's enumeration index is appended (cast to `u32`) to the `Vec`
/// stored under that term; the vector is created on first use.
fn create_fx_hash_ref_map_with_expull<'a>(
    terms: impl Iterator<Item = &'static [u8]>,
) -> FxHashMap<&'static [u8], Vec<u32>> {
    let mut postings: FxHashMap<&'static [u8], Vec<u32>> =
        FxHashMap::with_capacity_and_hasher(HASHMAP_SIZE, Default::default());
    for (position, key) in terms.enumerate() {
        postings.entry(key).or_default().push(position as u32);
    }
    postings
}

/// Baseline for the benchmark: groups term positions in an `FxHashMap` whose
/// keys are owned copies (`Vec<u8>`) of the terms.
///
/// The key is copied with `to_vec()` for every incoming term, and the term's
/// enumeration index is appended (cast to `u32`) to the `Vec` stored under
/// that key; the vector is created on first use.
fn create_fx_hash_owned_map_with_expull<'a>(
    terms: impl Iterator<Item = &'static [u8]>,
) -> FxHashMap<Vec<u8>, Vec<u32>> {
    let mut postings: FxHashMap<Vec<u8>, Vec<u32>> =
        FxHashMap::with_capacity_and_hasher(HASHMAP_SIZE, Default::default());
    for (position, key) in terms.enumerate() {
        let owned_key = key.to_vec();
        postings.entry(owned_key).or_default().push(position as u32);
    }
    postings
}

criterion_group!(block_benches, bench_hashmap_throughput,);
criterion_main!(block_benches);
11 changes: 11 additions & 0 deletions stacker/src/arena_hashmap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ impl ArenaHashMap {
}

#[inline]
#[cfg(not(feature = "compare_hash_only"))]
fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
let (stored_key, value_addr) = self.get_key_value(addr);
if stored_key == target_key {
Expand All @@ -175,6 +176,16 @@ impl ArenaHashMap {
None
}
}
/// `compare_hash_only` variant: the target key is ignored and the entry at
/// `addr` is always treated as a match.
///
/// Reads the first two bytes at `addr` as a little-endian `u16` key length
/// and returns the address located `2 + key length` bytes after `addr`.
#[inline]
#[cfg(feature = "compare_hash_only")]
fn get_value_addr_if_key_match(&self, _target_key: &[u8], addr: Addr) -> Option<Addr> {
    let entry_bytes = self.memory_arena.slice_from(addr);
    // Two-byte length prefix, decoded as little-endian.
    let len_prefix: [u8; 2] = entry_bytes[..2].try_into().unwrap();
    let key_len = u16::from_le_bytes(len_prefix);

    Some(addr.offset(2 + key_len as u32))
}

#[inline]
fn set_bucket(&mut self, hash: HashType, key_value_addr: Addr, bucket: usize) -> UnorderedId {
Expand Down

0 comments on commit 00c5df6

Please sign in to comment.