From 417451e0ebfb7de4181bdef7a23d855ee7bb48c4 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 15 Aug 2024 13:23:55 -0400 Subject: [PATCH 1/4] add fuzzer, fix bug --- .github/workflows/fuzz.yml | 28 ++++++++++++ fuzz/.gitignore | 4 ++ fuzz/Cargo.lock | 70 +++++++++++++++++++++++++++++ fuzz/Cargo.toml | 28 ++++++++++++ fuzz/fuzz_targets/fuzz_compress.rs | 8 ++++ fuzz/fuzz_targets/fuzz_train.rs | 7 +++ src/lib.rs | 72 ++++++++++++++++++++++++------ 7 files changed, 204 insertions(+), 13 deletions(-) create mode 100644 .github/workflows/fuzz.yml create mode 100644 fuzz/.gitignore create mode 100644 fuzz/Cargo.lock create mode 100644 fuzz/Cargo.toml create mode 100644 fuzz/fuzz_targets/fuzz_compress.rs create mode 100644 fuzz/fuzz_targets/fuzz_train.rs diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml new file mode 100644 index 0000000..4d7cc0b --- /dev/null +++ b/.github/workflows/fuzz.yml @@ -0,0 +1,28 @@ +name: Fuzz + +on: + schedule: + - cron: "0 0 * * *" # daily + workflow_dispatch: + +jobs: + fuzz: + name: "fuzz" + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install cargo fuzz + run: cargo install cargo-fuzz + - name: Run fuzzing target + run: cargo fuzz run fuzz_compress -- -max_total_time=600 + continue-on-error: true + - name: Archive crash artifacts + uses: actions/upload-artifact@v4 + with: + name: fuzzing-crash-artifacts + path: fuzz/artifacts + - name: Archive fuzzing corpus + uses: actions/upload-artifact@v4 + with: + name: fuzzing-corpus + path: fuzz/corpus diff --git a/fuzz/.gitignore b/fuzz/.gitignore new file mode 100644 index 0000000..1a45eee --- /dev/null +++ b/fuzz/.gitignore @@ -0,0 +1,4 @@ +target +corpus +artifacts +coverage diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock new file mode 100644 index 0000000..354130a --- /dev/null +++ b/fuzz/Cargo.lock @@ -0,0 +1,70 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "arbitrary" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" + +[[package]] +name = "cc" +version = "1.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68064e60dbf1f17005c2fde4d07c16d8baa506fd7ffed8ccab702d93617975c7" +dependencies = [ + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "fsst-rs" +version = "0.0.1" + +[[package]] +name = "fsst-rs-fuzz" +version = "0.0.0" +dependencies = [ + "fsst-rs", + "libfuzzer-sys", +] + +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + +[[package]] +name = "libc" +version = "0.2.155" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" + +[[package]] +name = "libfuzzer-sys" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7" +dependencies = [ + "arbitrary", + "cc", + "once_cell", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml new file mode 100644 index 0000000..0a553c0 --- /dev/null +++ b/fuzz/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "fsst-rs-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" + +[dependencies.fsst-rs] +path = ".." + +[[bin]] +name = "fuzz_train" +path = "fuzz_targets/fuzz_train.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "fuzz_compress" +path = "fuzz_targets/fuzz_compress.rs" +test = false +doc = false +bench = false diff --git a/fuzz/fuzz_targets/fuzz_compress.rs b/fuzz/fuzz_targets/fuzz_compress.rs new file mode 100644 index 0000000..81f4aaf --- /dev/null +++ b/fuzz/fuzz_targets/fuzz_compress.rs @@ -0,0 +1,8 @@ +#![no_main] + +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|data: &[u8]| { + let table = fsst_rs::train("the quick brown fox jumped over the lazy dog".as_bytes()); + let _ = table.compress(data); +}); diff --git a/fuzz/fuzz_targets/fuzz_train.rs b/fuzz/fuzz_targets/fuzz_train.rs new file mode 100644 index 0000000..fbb6618 --- /dev/null +++ b/fuzz/fuzz_targets/fuzz_train.rs @@ -0,0 +1,7 @@ +#![no_main] + +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|data: &[u8]| { + let _ = fsst_rs::train(data); +}); diff --git a/src/lib.rs b/src/lib.rs index fecb4ad..bdd2667 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -386,10 +386,34 @@ impl SymbolTable { remaining_bytes.is_positive(), "in_ptr exceeded in_end, should not be possible" ); + let remaining_bytes = remaining_bytes as usize; // Shift off the remaining bytes - let mut last_word = unsafe { (in_ptr as *const u64).read_unaligned() }; - last_word = mask_prefix(last_word, remaining_bytes as usize); + // Read the remaining bytes + // Unroll and test multiple values being written here. + // let mut last_word = [0u8; 8]; + // for i in 0..remaining_bytes { + // last_word[i as usize] = unsafe { in_ptr.byte_add(i as usize).read() }; + // } + + // Shift on the words from the remaining bytes. + // let mut last_word = unsafe { (in_ptr as *const u64).read_unaligned() }; + // last_word = mask_prefix(last_word, remaining_bytes as usize); + // let mut last_word = u64::from_le_bytes(last_word); + let mut last_word = unsafe { + match remaining_bytes { + 0 => 0, + 1 => extract_u64::<1>(in_ptr), + 2 => extract_u64::<2>(in_ptr), + 3 => extract_u64::<3>(in_ptr), + 4 => extract_u64::<4>(in_ptr), + 5 => extract_u64::<5>(in_ptr), + 6 => extract_u64::<6>(in_ptr), + 7 => extract_u64::<7>(in_ptr), + 8 => extract_u64::<8>(in_ptr), + _ => unreachable!("remaining bytes must be <= 8"), + } + }; while in_ptr < in_end && out_ptr < out_end { unsafe { @@ -466,17 +490,6 @@ impl SymbolTable { } } -/// Mask the word, keeping only the `prefix_bytes` front. -fn mask_prefix(word: u64, prefix_bytes: usize) -> u64 { - let mask = if prefix_bytes == 0 { - 0 - } else { - u64::MAX >> (8 * (8 - prefix_bytes)) - }; - - word & mask -} - fn advance_8byte_word(word: u64, bytes: usize) -> u64 { // shift the word off the right-end, because little endian means the first // char is stored in the LSB. @@ -499,3 +512,36 @@ fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool { (left & mask) == right } + +unsafe fn extract_u64(ptr: *const u8) -> u64 { + match N { + 1 => ptr.read() as u64, + 2 => (ptr as *const u16).read_unaligned() as u64, + 3 => { + let low = ptr.read() as u64; + let high = (ptr.byte_add(1) as *const u16).read_unaligned() as u64; + high << 8 | low + } + 4 => { + return (ptr as *const u32).read_unaligned() as u64; + } + 5 => { + let low = (ptr as *const u32).read_unaligned() as u64; + let high = ptr.byte_add(4).read() as u64; + high << 32 | low + } + 6 => { + let low = (ptr as *const u32).read_unaligned() as u64; + let high = (ptr.byte_add(4) as *const u16).read_unaligned() as u64; + high << 32 | low + } + 7 => { + let low = (ptr as *const u32).read_unaligned() as u64; + let mid = (ptr.byte_add(4) as *const u16).read_unaligned() as u64; + let high = ptr.byte_add(6).read() as u64; + (high << 48) | (mid << 32) | low + } + 8 => (ptr as *const u64).read_unaligned() as u64, + _ => unreachable!("N must be <= 8"), + } +} From e11edb1f9b52ee1e0016970c7b6dbc98d52277c6 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 15 Aug 2024 13:30:43 -0400 Subject: [PATCH 2/4] fix docs + clippy --- src/lib.rs | 24 ++++++++---------------- tests/correctness.rs | 2 +- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index bdd2667..1098223 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -388,18 +388,9 @@ impl SymbolTable { ); let remaining_bytes = remaining_bytes as usize; - // Shift off the remaining bytes - // Read the remaining bytes - // Unroll and test multiple values being written here. - // let mut last_word = [0u8; 8]; - // for i in 0..remaining_bytes { - // last_word[i as usize] = unsafe { in_ptr.byte_add(i as usize).read() }; - // } - - // Shift on the words from the remaining bytes. - // let mut last_word = unsafe { (in_ptr as *const u64).read_unaligned() }; - // last_word = mask_prefix(last_word, remaining_bytes as usize); - // let mut last_word = u64::from_le_bytes(last_word); + // Load the last `remaining_byte`s of data into a final world. We then replicate the loop above, + // but shift data out of this word rather than advancing an input pointer and potentially reading + // unowned memory. let mut last_word = unsafe { match remaining_bytes { 0 => 0, @@ -513,6 +504,9 @@ fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool { (left & mask) == right } +/// This is a function that will get monomorphized based on the value of `N` to do +/// a load of `N` values from the pointer in a minimum number of instructions into +/// an output `u64`. unsafe fn extract_u64(ptr: *const u8) -> u64 { match N { 1 => ptr.read() as u64, @@ -522,9 +516,7 @@ unsafe fn extract_u64(ptr: *const u8) -> u64 { let high = (ptr.byte_add(1) as *const u16).read_unaligned() as u64; high << 8 | low } - 4 => { - return (ptr as *const u32).read_unaligned() as u64; - } + 4 => (ptr as *const u32).read_unaligned() as u64, 5 => { let low = (ptr as *const u32).read_unaligned() as u64; let high = ptr.byte_add(4).read() as u64; @@ -541,7 +533,7 @@ unsafe fn extract_u64(ptr: *const u8) -> u64 { let high = ptr.byte_add(6).read() as u64; (high << 48) | (mid << 32) | low } - 8 => (ptr as *const u64).read_unaligned() as u64, + 8 => (ptr as *const u64).read_unaligned(), _ => unreachable!("N must be <= 8"), } } diff --git a/tests/correctness.rs b/tests/correctness.rs index e168dd6..2f2abf7 100644 --- a/tests/correctness.rs +++ b/tests/correctness.rs @@ -48,7 +48,7 @@ fn test_one_byte() { #[test] fn test_zeros() { println!("training zeros"); - let training_data: Vec = vec![0, 1, 2, 3, 4]; + let training_data: Vec = vec![0, 1, 2, 3, 4, 0]; let trained = fsst_rs::train(&training_data); println!("compressing with zeros"); let compressed = trained.compress(&[0, 4]); From 4208f221e2d23213f482740f83d228377d36c549 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 15 Aug 2024 13:33:58 -0400 Subject: [PATCH 3/4] check decompression in fuzzer --- fuzz/fuzz_targets/fuzz_compress.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fuzz/fuzz_targets/fuzz_compress.rs b/fuzz/fuzz_targets/fuzz_compress.rs index 81f4aaf..23f73bb 100644 --- a/fuzz/fuzz_targets/fuzz_compress.rs +++ b/fuzz/fuzz_targets/fuzz_compress.rs @@ -4,5 +4,7 @@ use libfuzzer_sys::fuzz_target; fuzz_target!(|data: &[u8]| { let table = fsst_rs::train("the quick brown fox jumped over the lazy dog".as_bytes()); - let _ = table.compress(data); + let compress = table.compress(data); + let decompress = table.decompress(&compress); + assert_eq!(&decompress, data); }); From af4046bfdbf6ff53b777b7beec0759d0fe91b15f Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 15 Aug 2024 13:40:01 -0400 Subject: [PATCH 4/4] ending zero --- tests/correctness.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/correctness.rs b/tests/correctness.rs index 2f2abf7..a7c7599 100644 --- a/tests/correctness.rs +++ b/tests/correctness.rs @@ -51,9 +51,9 @@ fn test_zeros() { let training_data: Vec = vec![0, 1, 2, 3, 4, 0]; let trained = fsst_rs::train(&training_data); println!("compressing with zeros"); - let compressed = trained.compress(&[0, 4]); + let compressed = trained.compress(&[4, 0]); println!("decomperssing with zeros"); - assert_eq!(trained.decompress(&compressed), &[0, 4]); + assert_eq!(trained.decompress(&compressed), &[4, 0]); println!("done"); }