diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml new file mode 100644 index 0000000..4d7cc0b --- /dev/null +++ b/.github/workflows/fuzz.yml @@ -0,0 +1,28 @@ +name: Fuzz + +on: + schedule: + - cron: "0 0 * * *" # daily + workflow_dispatch: + +jobs: + fuzz: + name: "fuzz" + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install cargo fuzz + run: cargo install cargo-fuzz + - name: Run fuzzing target + run: cargo fuzz run fuzz_compress -- -max_total_time=600 + continue-on-error: true + - name: Archive crash artifacts + uses: actions/upload-artifact@v4 + with: + name: fuzzing-crash-artifacts + path: fuzz/artifacts + - name: Archive fuzzing corpus + uses: actions/upload-artifact@v4 + with: + name: fuzzing-corpus + path: fuzz/corpus diff --git a/fuzz/.gitignore b/fuzz/.gitignore new file mode 100644 index 0000000..1a45eee --- /dev/null +++ b/fuzz/.gitignore @@ -0,0 +1,4 @@ +target +corpus +artifacts +coverage diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock new file mode 100644 index 0000000..354130a --- /dev/null +++ b/fuzz/Cargo.lock @@ -0,0 +1,70 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "arbitrary" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" + +[[package]] +name = "cc" +version = "1.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68064e60dbf1f17005c2fde4d07c16d8baa506fd7ffed8ccab702d93617975c7" +dependencies = [ + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "fsst-rs" +version = "0.0.1" + +[[package]] +name = "fsst-rs-fuzz" +version = "0.0.0" +dependencies = [ + "fsst-rs", + "libfuzzer-sys", +] + +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + +[[package]] +name = "libc" +version = "0.2.155" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" + +[[package]] +name = "libfuzzer-sys" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7" +dependencies = [ + "arbitrary", + "cc", + "once_cell", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml new file mode 100644 index 0000000..0a553c0 --- /dev/null +++ b/fuzz/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "fsst-rs-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" + +[dependencies.fsst-rs] +path = ".." + +[[bin]] +name = "fuzz_train" +path = "fuzz_targets/fuzz_train.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "fuzz_compress" +path = "fuzz_targets/fuzz_compress.rs" +test = false +doc = false +bench = false diff --git a/fuzz/fuzz_targets/fuzz_compress.rs b/fuzz/fuzz_targets/fuzz_compress.rs new file mode 100644 index 0000000..23f73bb --- /dev/null +++ b/fuzz/fuzz_targets/fuzz_compress.rs @@ -0,0 +1,10 @@ +#![no_main] + +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|data: &[u8]| { + let table = fsst_rs::train("the quick brown fox jumped over the lazy dog".as_bytes()); + let compress = table.compress(data); + let decompress = table.decompress(&compress); + assert_eq!(&decompress, data); +}); diff --git a/fuzz/fuzz_targets/fuzz_train.rs b/fuzz/fuzz_targets/fuzz_train.rs new file mode 100644 index 0000000..fbb6618 --- /dev/null +++ b/fuzz/fuzz_targets/fuzz_train.rs @@ -0,0 +1,7 @@ +#![no_main] + +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|data: &[u8]| { + let _ = fsst_rs::train(data); +}); diff --git a/src/lib.rs b/src/lib.rs index fecb4ad..1098223 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -386,10 +386,25 @@ impl SymbolTable { remaining_bytes.is_positive(), "in_ptr exceeded in_end, should not be possible" ); - - // Shift off the remaining bytes - let mut last_word = unsafe { (in_ptr as *const u64).read_unaligned() }; - last_word = mask_prefix(last_word, remaining_bytes as usize); + let remaining_bytes = remaining_bytes as usize; + + // Load the last `remaining_byte`s of data into a final world. We then replicate the loop above, + // but shift data out of this word rather than advancing an input pointer and potentially reading + // unowned memory. + let mut last_word = unsafe { + match remaining_bytes { + 0 => 0, + 1 => extract_u64::<1>(in_ptr), + 2 => extract_u64::<2>(in_ptr), + 3 => extract_u64::<3>(in_ptr), + 4 => extract_u64::<4>(in_ptr), + 5 => extract_u64::<5>(in_ptr), + 6 => extract_u64::<6>(in_ptr), + 7 => extract_u64::<7>(in_ptr), + 8 => extract_u64::<8>(in_ptr), + _ => unreachable!("remaining bytes must be <= 8"), + } + }; while in_ptr < in_end && out_ptr < out_end { unsafe { @@ -466,17 +481,6 @@ impl SymbolTable { } } -/// Mask the word, keeping only the `prefix_bytes` front. -fn mask_prefix(word: u64, prefix_bytes: usize) -> u64 { - let mask = if prefix_bytes == 0 { - 0 - } else { - u64::MAX >> (8 * (8 - prefix_bytes)) - }; - - word & mask -} - fn advance_8byte_word(word: u64, bytes: usize) -> u64 { // shift the word off the right-end, because little endian means the first // char is stored in the LSB. @@ -499,3 +503,37 @@ fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool { (left & mask) == right } + +/// This is a function that will get monomorphized based on the value of `N` to do +/// a load of `N` values from the pointer in a minimum number of instructions into +/// an output `u64`. +unsafe fn extract_u64(ptr: *const u8) -> u64 { + match N { + 1 => ptr.read() as u64, + 2 => (ptr as *const u16).read_unaligned() as u64, + 3 => { + let low = ptr.read() as u64; + let high = (ptr.byte_add(1) as *const u16).read_unaligned() as u64; + high << 8 | low + } + 4 => (ptr as *const u32).read_unaligned() as u64, + 5 => { + let low = (ptr as *const u32).read_unaligned() as u64; + let high = ptr.byte_add(4).read() as u64; + high << 32 | low + } + 6 => { + let low = (ptr as *const u32).read_unaligned() as u64; + let high = (ptr.byte_add(4) as *const u16).read_unaligned() as u64; + high << 32 | low + } + 7 => { + let low = (ptr as *const u32).read_unaligned() as u64; + let mid = (ptr.byte_add(4) as *const u16).read_unaligned() as u64; + let high = ptr.byte_add(6).read() as u64; + (high << 48) | (mid << 32) | low + } + 8 => (ptr as *const u64).read_unaligned(), + _ => unreachable!("N must be <= 8"), + } +} diff --git a/tests/correctness.rs b/tests/correctness.rs index e168dd6..a7c7599 100644 --- a/tests/correctness.rs +++ b/tests/correctness.rs @@ -48,12 +48,12 @@ fn test_one_byte() { #[test] fn test_zeros() { println!("training zeros"); - let training_data: Vec = vec![0, 1, 2, 3, 4]; + let training_data: Vec = vec![0, 1, 2, 3, 4, 0]; let trained = fsst_rs::train(&training_data); println!("compressing with zeros"); - let compressed = trained.compress(&[0, 4]); + let compressed = trained.compress(&[4, 0]); println!("decomperssing with zeros"); - assert_eq!(trained.decompress(&compressed), &[0, 4]); + assert_eq!(trained.decompress(&compressed), &[4, 0]); println!("done"); }