Skip to content

Commit

Permalink
fix: hash_table_sizing, inline hints, lint rule (#29)
Browse files Browse the repository at this point in the history
* Fix HASH_TABLE_SIZE to match comment again
* Remove `inline(never)` hints that were added to enable profiling
* Remove lint override that should not have been committed
  • Loading branch information
a10y authored Sep 3, 2024
1 parent bca81cb commit c6392d2
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 26 deletions.
7 changes: 3 additions & 4 deletions src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ impl CompressorBuilder {
/// with an existing symbol.
pub fn insert(&mut self, symbol: Symbol, len: usize) -> bool {
assert!(self.n_symbols < 255, "cannot insert into full symbol table");
debug_assert!(len == symbol.len(), "provided len != symbol.len()");
assert_eq!(len, symbol.len(), "provided len must equal symbol.len()");

if len == 2 {
// shortCodes
Expand Down Expand Up @@ -387,7 +387,6 @@ impl CompressorBuilder {
///
/// Also returns the lengths vector, which is of length `n_symbols` and contains the
/// length for each of the values.
#[inline(never)]
fn finalize(&mut self) -> (u8, Vec<u8>) {
// Create a cumulative sum of each of the elements of the input line numbers.
// Do a map that includes the previously seen value as well.
Expand Down Expand Up @@ -534,7 +533,7 @@ const FSST_SAMPLELINE: usize = 512;
/// SAFETY: sample_buf must have a capacity of at least FSST_SAMPLEMAX bytes. Providing something less may cause unexpected failures.
#[allow(clippy::ptr_arg)]
fn make_sample<'a, 'b: 'a>(sample_buf: &'a mut Vec<u8>, str_in: &Vec<&'b [u8]>) -> Vec<&'a [u8]> {
debug_assert!(
assert!(
sample_buf.capacity() >= FSST_SAMPLEMAX,
"sample_buf.len() < FSST_SAMPLEMAX"
);
Expand Down Expand Up @@ -700,7 +699,7 @@ impl CompressorBuilder {
}

let remaining_bytes = unsafe { in_end.byte_offset_from(in_ptr) };
debug_assert!(
assert!(
remaining_bytes.is_positive(),
"in_ptr exceeded in_end, should not be possible"
);
Expand Down
21 changes: 5 additions & 16 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,7 @@ impl Symbol {

/// Constructor for a `Symbol` from an 8-element byte slice.
pub fn from_slice(slice: &[u8; 8]) -> Self {
let num: u64 = slice[0] as u64
| (slice[1] as u64) << 8
| (slice[2] as u64) << 16
| (slice[3] as u64) << 24
| (slice[4] as u64) << 32
| (slice[5] as u64) << 40
| (slice[6] as u64) << 48
| (slice[7] as u64) << 56;
let num: u64 = u64::from_le_bytes(*slice);

Self(num)
}
Expand Down Expand Up @@ -106,7 +99,7 @@ impl Symbol {

/// Return a new `Symbol` by logically concatenating ourselves with another `Symbol`.
pub fn concat(self, other: Self) -> Self {
debug_assert!(
assert!(
self.len() + other.len() <= 8,
"cannot build symbol with length > 8"
);
Expand Down Expand Up @@ -171,9 +164,6 @@ pub const FSST_CODE_BITS: usize = 9;
/// First bit of the "length" portion of an extended code.
pub const FSST_LEN_BITS: usize = 12;

/// A code that never appears in practice, indicating an unused slot.
pub const FSST_CODE_UNUSED: u16 = 1u16 << FSST_CODE_BITS;

/// Maximum code value in the extended code range.
pub const FSST_CODE_MAX: u16 = 1 << FSST_CODE_BITS;

Expand Down Expand Up @@ -253,7 +243,7 @@ impl<'a> Decompressor<'a> {
/// If the provided symbol table has length greater than 255
pub fn new(symbols: &'a [Symbol], lengths: &'a [u8]) -> Self {
assert!(
symbols.len() <= 255,
symbols.len() < FSST_CODE_BASE as usize,
"symbol table cannot have size exceeding 255"
);

Expand Down Expand Up @@ -295,7 +285,7 @@ impl<'a> Decompressor<'a> {
}
}

debug_assert!(
assert!(
in_pos >= compressed.len(),
"decompression should exhaust input before output"
);
Expand Down Expand Up @@ -350,7 +340,7 @@ pub struct Compressor {
/// The core structure of the FSST codec, holding a mapping between `Symbol`s and `Code`s.
///
/// The symbol table is trained on a corpus of data in the form of a single byte array, building up
/// a mapping of 1-byte "codes" to sequences of up to `N` plaintext bytse, or "symbols".
/// a mapping of 1-byte "codes" to sequences of up to 8 plaintext bytes, or "symbols".
impl Compressor {
/// Using the symbol table, runs a single cycle of compression on an input word, writing
/// the output into `out_ptr`.
Expand All @@ -367,7 +357,6 @@ impl Compressor {
/// # Safety
///
/// `out_ptr` must never be NULL or otherwise point to invalid memory.
#[inline(never)]
pub unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) {
// Speculatively write the first byte of `word` at offset 1. This is necessary if it is an escape, and
// if it isn't, it will be overwritten anyway.
Expand Down
8 changes: 2 additions & 6 deletions src/lossy_pht.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,15 @@
// TODO: remove
#![allow(unused)]

use std::fmt::Debug;

use crate::builder::fsst_hash;
use crate::Code;
use crate::Symbol;
use crate::FSST_CODE_MASK;
use crate::{Code, FSST_CODE_UNUSED};

/// Size of the perfect hash table.
///
/// NOTE: this differs from the paper, which recommends a 64KB total
/// table size. The paper does not account for the fact that most
/// vendors split the L1 cache into 32KB of instruction and 32KB of data.
pub const HASH_TABLE_SIZE: usize = 1 << 12;
pub const HASH_TABLE_SIZE: usize = 1 << 11;

/// A single entry in the [Lossy Perfect Hash Table][`LossyPHT`].
///
Expand Down

0 comments on commit c6392d2

Please sign in to comment.