Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add fuzzer, fix bug #7

Merged
merged 4 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/workflows/fuzz.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Scheduled fuzzing workflow: runs the `fuzz_compress` cargo-fuzz target for a bounded
# amount of time, then uploads any crash artifacts and the resulting corpus.
name: Fuzz

# Triggers: a cron schedule (minute 0, hour 0, every day) and manual dispatch.
on:
schedule:
- cron: "0 0 * * *" # daily
workflow_dispatch:

jobs:
fuzz:
name: "fuzz"
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
# cargo-fuzz provides the `cargo fuzz` subcommand used below.
- name: Install cargo fuzz
run: cargo install cargo-fuzz
# Only the `fuzz_compress` target is run here. `-max_total_time=600` is passed through
# to the fuzzer after `--` and bounds the run (600 seconds per libFuzzer's option docs).
- name: Run fuzzing target
run: cargo fuzz run fuzz_compress -- -max_total_time=600
# Keep the job going when the fuzzer step fails so the upload steps below still run.
continue-on-error: true
# Upload whatever the fuzzer wrote under fuzz/artifacts.
- name: Archive crash artifacts
uses: actions/upload-artifact@v4
with:
name: fuzzing-crash-artifacts
path: fuzz/artifacts
# Upload the corpus directory produced by the run.
- name: Archive fuzzing corpus
uses: actions/upload-artifact@v4
with:
name: fuzzing-corpus
path: fuzz/corpus
4 changes: 4 additions & 0 deletions fuzz/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
target
corpus
artifacts
coverage
70 changes: 70 additions & 0 deletions fuzz/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 28 additions & 0 deletions fuzz/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Manifest for the fuzzing crate. It is never published and exists only to host the
# cargo-fuzz targets declared below.
[package]
name = "fsst-rs-fuzz"
version = "0.0.0"
publish = false
edition = "2021"

# Marks this package as a cargo-fuzz project.
[package.metadata]
cargo-fuzz = true

[dependencies]
libfuzzer-sys = "0.4"

# The crate under test, taken from the parent directory.
[dependencies.fsst-rs]
path = ".."

# Fuzz target exercising `fsst_rs::train` on arbitrary input.
# Each target is a plain binary with test, doc and bench builds switched off.
[[bin]]
name = "fuzz_train"
path = "fuzz_targets/fuzz_train.rs"
test = false
doc = false
bench = false

# Fuzz target exercising `compress` on arbitrary input with a fixed, pre-trained table.
[[bin]]
name = "fuzz_compress"
path = "fuzz_targets/fuzz_compress.rs"
test = false
doc = false
bench = false
8 changes: 8 additions & 0 deletions fuzz/fuzz_targets/fuzz_compress.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#![no_main]

use libfuzzer_sys::fuzz_target;

// Fuzz entry point: trains a symbol table on a fixed sentence, then compresses the
// fuzzer-provided bytes with it.
fuzz_target!(|data: &[u8]| {
    // Fixed training text, so every run compresses against the same kind of table.
    const TRAINING_SAMPLE: &str = "the quick brown fox jumped over the lazy dog";

    let symbols = fsst_rs::train(TRAINING_SAMPLE.as_bytes());

    // The compressed output is discarded; the call itself is what is being exercised.
    let _ = symbols.compress(data);
});
7 changes: 7 additions & 0 deletions fuzz/fuzz_targets/fuzz_train.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#![no_main]

use libfuzzer_sys::fuzz_target;

fuzz_target!(|data: &[u8]| {
let _ = fsst_rs::train(data);
Copy link
Contributor Author

@a10y a10y Aug 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This target is slow because the train function is still relatively slow, so I don't run it in CI. Once we improve train's performance to match compress, we can turn it back on.

});
68 changes: 53 additions & 15 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -386,10 +386,25 @@ impl SymbolTable {
remaining_bytes.is_positive(),
"in_ptr exceeded in_end, should not be possible"
);

// Shift off the remaining bytes
let mut last_word = unsafe { (in_ptr as *const u64).read_unaligned() };
last_word = mask_prefix(last_word, remaining_bytes as usize);
let remaining_bytes = remaining_bytes as usize;

// Load the last `remaining_bytes` bytes of data into a final word. We then replicate the loop above,
// but shift data out of this word rather than advancing an input pointer and potentially reading
// unowned memory.
let mut last_word = unsafe {
match remaining_bytes {
0 => 0,
1 => extract_u64::<1>(in_ptr),
2 => extract_u64::<2>(in_ptr),
3 => extract_u64::<3>(in_ptr),
4 => extract_u64::<4>(in_ptr),
5 => extract_u64::<5>(in_ptr),
6 => extract_u64::<6>(in_ptr),
7 => extract_u64::<7>(in_ptr),
8 => extract_u64::<8>(in_ptr),
_ => unreachable!("remaining bytes must be <= 8"),
}
};

while in_ptr < in_end && out_ptr < out_end {
unsafe {
Expand Down Expand Up @@ -466,17 +481,6 @@ impl SymbolTable {
}
}

/// Keep only the `prefix_bytes` low-order bytes of `word`, clearing every byte above them.
///
/// A prefix of zero bytes yields `0`; a prefix of eight bytes returns `word` unchanged.
fn mask_prefix(word: u64, prefix_bytes: usize) -> u64 {
    match prefix_bytes {
        // Handled separately: the general formula would need a shift by 64 bits,
        // which is not a valid shift amount for a `u64`.
        0 => 0,
        kept => {
            let discarded_bits = 8 * (8 - kept);
            word & (u64::MAX >> discarded_bits)
        }
    }
}

fn advance_8byte_word(word: u64, bytes: usize) -> u64 {
// shift the word off the right-end, because little endian means the first
// char is stored in the LSB.
Expand All @@ -499,3 +503,37 @@ fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool {

(left & mask) == right
}

/// Load exactly `N` bytes starting at `ptr` into the low-order bytes of a `u64`.
///
/// The byte at `ptr` becomes the least-significant byte of the result, the byte at
/// `ptr + 1` the next one up, and so on (little-endian order). The upper `8 - N` bytes
/// of the result are zero, and `N == 0` yields `0`.
///
/// `N` is a const generic, so every call site is monomorphized into a fixed-size copy.
/// Only the `N` requested bytes are read, which makes this suitable for loading the tail
/// of a buffer where a full 8-byte read would run past the end.
///
/// # Safety
///
/// `ptr` must be valid for reads of `N` bytes. No alignment is required.
///
/// # Panics
///
/// Panics if `N > 8`.
unsafe fn extract_u64<const N: usize>(ptr: *const u8) -> u64 {
    // `N` is a compile-time constant, so this check folds away for every valid instantiation.
    assert!(N <= 8, "N must be <= 8");

    let mut bytes = [0u8; 8];
    // SAFETY: the caller guarantees `ptr` is readable for `N` bytes; `bytes` is a local
    // 8-byte buffer and `N <= 8` was checked above, so the destination is large enough and
    // cannot overlap the source.
    unsafe { ptr.copy_to_nonoverlapping(bytes.as_mut_ptr(), N) };

    // Decode explicitly as little-endian so the first input byte always lands in the LSB,
    // independent of the host byte order.
    u64::from_le_bytes(bytes)
}
2 changes: 1 addition & 1 deletion tests/correctness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ fn test_one_byte() {
#[test]
fn test_zeros() {
println!("training zeros");
let training_data: Vec<u8> = vec![0, 1, 2, 3, 4];
let training_data: Vec<u8> = vec![0, 1, 2, 3, 4, 0];
let trained = fsst_rs::train(&training_data);
println!("compressing with zeros");
let compressed = trained.compress(&[0, 4]);
Expand Down
Loading