From 430202394b57245eee1de3870feadacbb4da877d Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Thu, 3 Oct 2024 11:32:14 -0500 Subject: [PATCH] feat: 35% faster decompression with less boundary check (#41) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hi SpiralDB devs! First of all, thank you for implementing the great FSST library! While profiling a decompression workload, I noticed we have some unintended boundary checks that could be removed to improve decompression throughput by 35%, as shown in the `decompress` benchmark below. Removing those boundary checks requires more unsafe code; I've manually annotated their correctness, but please double check! cc @alamb who might be interested ``` cargo bench --bench micro decompress Compiling fsst-rs v0.4.2 (/home/hao/coding/fsst) Finished `bench` profile [optimized] target(s) in 0.86s Running benches/micro.rs (target/release/deps/micro-bf2c70cedbbe0467) Gnuplot not found, using plotters backend cf=8/decompress time: [47.687 µs 47.725 µs 47.768 µs] thrpt: [20.444 GiB/s 20.462 GiB/s 20.478 GiB/s] change: time: [-26.409% -26.311% -26.208%] (p = 0.00 < 0.05) thrpt: [+35.517% +35.705% +35.887%] Performance has improved. 
Found 11 outliers among 100 measurements (11.00%) 6 (6.00%) high mild 5 (5.00%) high severe ``` --------- Co-authored-by: Andrew Duffy --- src/lib.rs | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index fc8b145..0b77848 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -259,24 +259,30 @@ impl<'a> Decompressor<'a> { let mut in_pos = 0; let mut out_pos = 0; - while in_pos < compressed.len() && out_pos < (decoded.capacity() - size_of::()) { - let code = compressed[in_pos]; + while in_pos < compressed.len() { + // out_pos can grow at most 8 bytes per iteration, and we start at 0 + debug_assert!(out_pos <= decoded.capacity() - size_of::()); + // SAFETY: in_pos is always in range 0..compressed.len() + let code = unsafe { *compressed.get_unchecked(in_pos) }; if code == ESCAPE_CODE { // Advance by one, do raw write. in_pos += 1; // SAFETY: out_pos is always 8 bytes or more from the end of decoded buffer + // SAFETY: ESCAPE_CODE can not be the last byte of the compressed stream unsafe { - let write_addr = ptr.byte_offset(out_pos as isize); - std::ptr::write(write_addr, compressed[in_pos]); + let write_addr = ptr.byte_add(out_pos); + std::ptr::write(write_addr, *compressed.get_unchecked(in_pos)); } out_pos += 1; in_pos += 1; } else { - let symbol = self.symbols[code as usize]; - let length = self.lengths[code as usize]; + // SAFETY: code is in range 0..255 + // The symbol and length tables are both of length 256, so this is safe. + let symbol = unsafe { *self.symbols.get_unchecked(code as usize) }; + let length = unsafe { *self.lengths.get_unchecked(code as usize) }; // SAFETY: out_pos is always 8 bytes or more from the end of decoded buffer unsafe { - let write_addr = ptr.byte_offset(out_pos as isize) as *mut u64; + let write_addr = ptr.byte_add(out_pos) as *mut u64; // Perform 8 byte unaligned write. write_addr.write_unaligned(symbol.as_u64()); }