From 417451e0ebfb7de4181bdef7a23d855ee7bb48c4 Mon Sep 17 00:00:00 2001
From: Andrew Duffy <andreweduffy@gmail.com>
Date: Thu, 15 Aug 2024 13:23:55 -0400
Subject: [PATCH 1/4] add fuzzer, fix bug

---
 .github/workflows/fuzz.yml         | 28 ++++++++++++
 fuzz/.gitignore                    |  4 ++
 fuzz/Cargo.lock                    | 70 +++++++++++++++++++++++++++++
 fuzz/Cargo.toml                    | 28 ++++++++++++
 fuzz/fuzz_targets/fuzz_compress.rs |  8 ++++
 fuzz/fuzz_targets/fuzz_train.rs    |  7 +++
 src/lib.rs                         | 72 ++++++++++++++++++++++++------
 7 files changed, 204 insertions(+), 13 deletions(-)
 create mode 100644 .github/workflows/fuzz.yml
 create mode 100644 fuzz/.gitignore
 create mode 100644 fuzz/Cargo.lock
 create mode 100644 fuzz/Cargo.toml
 create mode 100644 fuzz/fuzz_targets/fuzz_compress.rs
 create mode 100644 fuzz/fuzz_targets/fuzz_train.rs

diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml
new file mode 100644
index 0000000..4d7cc0b
--- /dev/null
+++ b/.github/workflows/fuzz.yml
@@ -0,0 +1,28 @@
+name: Fuzz
+
+on:
+  schedule:
+    - cron: "0 0 * * *" # daily
+  workflow_dispatch:
+
+jobs:
+  fuzz:
+    name: "fuzz"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install cargo fuzz
+        run: cargo install cargo-fuzz
+      - name: Run fuzzing target
+        run: cargo fuzz run fuzz_compress -- -max_total_time=600
+        continue-on-error: true
+      - name: Archive crash artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: fuzzing-crash-artifacts
+          path: fuzz/artifacts
+      - name: Archive fuzzing corpus
+        uses: actions/upload-artifact@v4
+        with:
+          name: fuzzing-corpus
+          path: fuzz/corpus
diff --git a/fuzz/.gitignore b/fuzz/.gitignore
new file mode 100644
index 0000000..1a45eee
--- /dev/null
+++ b/fuzz/.gitignore
@@ -0,0 +1,4 @@
+target
+corpus
+artifacts
+coverage
diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock
new file mode 100644
index 0000000..354130a
--- /dev/null
+++ b/fuzz/Cargo.lock
@@ -0,0 +1,70 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "arbitrary"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110"
+
+[[package]]
+name = "cc"
+version = "1.1.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68064e60dbf1f17005c2fde4d07c16d8baa506fd7ffed8ccab702d93617975c7"
+dependencies = [
+ "jobserver",
+ "libc",
+ "shlex",
+]
+
+[[package]]
+name = "fsst-rs"
+version = "0.0.1"
+
+[[package]]
+name = "fsst-rs-fuzz"
+version = "0.0.0"
+dependencies = [
+ "fsst-rs",
+ "libfuzzer-sys",
+]
+
+[[package]]
+name = "jobserver"
+version = "0.1.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.155"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
+
+[[package]]
+name = "libfuzzer-sys"
+version = "0.4.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7"
+dependencies = [
+ "arbitrary",
+ "cc",
+ "once_cell",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
+
+[[package]]
+name = "shlex"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
new file mode 100644
index 0000000..0a553c0
--- /dev/null
+++ b/fuzz/Cargo.toml
@@ -0,0 +1,28 @@
+[package]
+name = "fsst-rs-fuzz"
+version = "0.0.0"
+publish = false
+edition = "2021"
+
+[package.metadata]
+cargo-fuzz = true
+
+[dependencies]
+libfuzzer-sys = "0.4"
+
+[dependencies.fsst-rs]
+path = ".."
+
+[[bin]]
+name = "fuzz_train"
+path = "fuzz_targets/fuzz_train.rs"
+test = false
+doc = false
+bench = false
+
+[[bin]]
+name = "fuzz_compress"
+path = "fuzz_targets/fuzz_compress.rs"
+test = false
+doc = false
+bench = false
diff --git a/fuzz/fuzz_targets/fuzz_compress.rs b/fuzz/fuzz_targets/fuzz_compress.rs
new file mode 100644
index 0000000..81f4aaf
--- /dev/null
+++ b/fuzz/fuzz_targets/fuzz_compress.rs
@@ -0,0 +1,8 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+    let table = fsst_rs::train("the quick brown fox jumped over the lazy dog".as_bytes());
+    let _ = table.compress(data);
+});
diff --git a/fuzz/fuzz_targets/fuzz_train.rs b/fuzz/fuzz_targets/fuzz_train.rs
new file mode 100644
index 0000000..fbb6618
--- /dev/null
+++ b/fuzz/fuzz_targets/fuzz_train.rs
@@ -0,0 +1,7 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+    let _ = fsst_rs::train(data);
+});
diff --git a/src/lib.rs b/src/lib.rs
index fecb4ad..bdd2667 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -386,10 +386,34 @@ impl SymbolTable {
             remaining_bytes.is_positive(),
             "in_ptr exceeded in_end, should not be possible"
         );
+        let remaining_bytes = remaining_bytes as usize;
 
         // Shift off the remaining bytes
-        let mut last_word = unsafe { (in_ptr as *const u64).read_unaligned() };
-        last_word = mask_prefix(last_word, remaining_bytes as usize);
+        // Read the remaining bytes
+        // Unroll and test multiple values being written here.
+        // let mut last_word = [0u8; 8];
+        // for i in 0..remaining_bytes {
+        //     last_word[i as usize] = unsafe { in_ptr.byte_add(i as usize).read() };
+        // }
+
+        // Shift on the words from the remaining bytes.
+        // let mut last_word = unsafe { (in_ptr as *const u64).read_unaligned() };
+        // last_word = mask_prefix(last_word, remaining_bytes as usize);
+        // let mut last_word = u64::from_le_bytes(last_word);
+        let mut last_word = unsafe {
+            match remaining_bytes {
+                0 => 0,
+                1 => extract_u64::<1>(in_ptr),
+                2 => extract_u64::<2>(in_ptr),
+                3 => extract_u64::<3>(in_ptr),
+                4 => extract_u64::<4>(in_ptr),
+                5 => extract_u64::<5>(in_ptr),
+                6 => extract_u64::<6>(in_ptr),
+                7 => extract_u64::<7>(in_ptr),
+                8 => extract_u64::<8>(in_ptr),
+                _ => unreachable!("remaining bytes must be <= 8"),
+            }
+        };
 
         while in_ptr < in_end && out_ptr < out_end {
             unsafe {
@@ -466,17 +490,6 @@ impl SymbolTable {
     }
 }
 
-/// Mask the word, keeping only the `prefix_bytes` front.
-fn mask_prefix(word: u64, prefix_bytes: usize) -> u64 {
-    let mask = if prefix_bytes == 0 {
-        0
-    } else {
-        u64::MAX >> (8 * (8 - prefix_bytes))
-    };
-
-    word & mask
-}
-
 fn advance_8byte_word(word: u64, bytes: usize) -> u64 {
     // shift the word off the right-end, because little endian means the first
     // char is stored in the LSB.
@@ -499,3 +512,36 @@ fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool {
 
     (left & mask) == right
 }
+
+unsafe fn extract_u64<const N: usize>(ptr: *const u8) -> u64 {
+    match N {
+        1 => ptr.read() as u64,
+        2 => (ptr as *const u16).read_unaligned() as u64,
+        3 => {
+            let low = ptr.read() as u64;
+            let high = (ptr.byte_add(1) as *const u16).read_unaligned() as u64;
+            high << 8 | low
+        }
+        4 => {
+            return (ptr as *const u32).read_unaligned() as u64;
+        }
+        5 => {
+            let low = (ptr as *const u32).read_unaligned() as u64;
+            let high = ptr.byte_add(4).read() as u64;
+            high << 32 | low
+        }
+        6 => {
+            let low = (ptr as *const u32).read_unaligned() as u64;
+            let high = (ptr.byte_add(4) as *const u16).read_unaligned() as u64;
+            high << 32 | low
+        }
+        7 => {
+            let low = (ptr as *const u32).read_unaligned() as u64;
+            let mid = (ptr.byte_add(4) as *const u16).read_unaligned() as u64;
+            let high = ptr.byte_add(6).read() as u64;
+            (high << 48) | (mid << 32) | low
+        }
+        8 => (ptr as *const u64).read_unaligned() as u64,
+        _ => unreachable!("N must be <= 8"),
+    }
+}

From e11edb1f9b52ee1e0016970c7b6dbc98d52277c6 Mon Sep 17 00:00:00 2001
From: Andrew Duffy <andreweduffy@gmail.com>
Date: Thu, 15 Aug 2024 13:30:43 -0400
Subject: [PATCH 2/4] fix docs + clippy

---
 src/lib.rs           | 24 ++++++++----------------
 tests/correctness.rs |  2 +-
 2 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index bdd2667..1098223 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -388,18 +388,9 @@ impl SymbolTable {
         );
         let remaining_bytes = remaining_bytes as usize;
 
-        // Shift off the remaining bytes
-        // Read the remaining bytes
-        // Unroll and test multiple values being written here.
-        // let mut last_word = [0u8; 8];
-        // for i in 0..remaining_bytes {
-        //     last_word[i as usize] = unsafe { in_ptr.byte_add(i as usize).read() };
-        // }
-
-        // Shift on the words from the remaining bytes.
-        // let mut last_word = unsafe { (in_ptr as *const u64).read_unaligned() };
-        // last_word = mask_prefix(last_word, remaining_bytes as usize);
-        // let mut last_word = u64::from_le_bytes(last_word);
+        // Load the last `remaining_byte`s of data into a final world. We then replicate the loop above,
+        // but shift data out of this word rather than advancing an input pointer and potentially reading
+        // unowned memory.
         let mut last_word = unsafe {
             match remaining_bytes {
                 0 => 0,
@@ -513,6 +504,9 @@ fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool {
     (left & mask) == right
 }
 
+/// This is a function that will get monomorphized based on the value of `N` to do
+/// a load of `N` values from the pointer in a minimum number of instructions into
+/// an output `u64`.
 unsafe fn extract_u64<const N: usize>(ptr: *const u8) -> u64 {
     match N {
         1 => ptr.read() as u64,
@@ -522,9 +516,7 @@ unsafe fn extract_u64<const N: usize>(ptr: *const u8) -> u64 {
             let high = (ptr.byte_add(1) as *const u16).read_unaligned() as u64;
             high << 8 | low
         }
-        4 => {
-            return (ptr as *const u32).read_unaligned() as u64;
-        }
+        4 => (ptr as *const u32).read_unaligned() as u64,
         5 => {
             let low = (ptr as *const u32).read_unaligned() as u64;
             let high = ptr.byte_add(4).read() as u64;
@@ -541,7 +533,7 @@ unsafe fn extract_u64<const N: usize>(ptr: *const u8) -> u64 {
             let high = ptr.byte_add(6).read() as u64;
             (high << 48) | (mid << 32) | low
         }
-        8 => (ptr as *const u64).read_unaligned() as u64,
+        8 => (ptr as *const u64).read_unaligned(),
         _ => unreachable!("N must be <= 8"),
     }
 }
diff --git a/tests/correctness.rs b/tests/correctness.rs
index e168dd6..2f2abf7 100644
--- a/tests/correctness.rs
+++ b/tests/correctness.rs
@@ -48,7 +48,7 @@ fn test_one_byte() {
 #[test]
 fn test_zeros() {
     println!("training zeros");
-    let training_data: Vec<u8> = vec![0, 1, 2, 3, 4];
+    let training_data: Vec<u8> = vec![0, 1, 2, 3, 4, 0];
     let trained = fsst_rs::train(&training_data);
     println!("compressing with zeros");
     let compressed = trained.compress(&[0, 4]);

From 4208f221e2d23213f482740f83d228377d36c549 Mon Sep 17 00:00:00 2001
From: Andrew Duffy <andreweduffy@gmail.com>
Date: Thu, 15 Aug 2024 13:33:58 -0400
Subject: [PATCH 3/4] check decompression in fuzzer

---
 fuzz/fuzz_targets/fuzz_compress.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fuzz/fuzz_targets/fuzz_compress.rs b/fuzz/fuzz_targets/fuzz_compress.rs
index 81f4aaf..23f73bb 100644
--- a/fuzz/fuzz_targets/fuzz_compress.rs
+++ b/fuzz/fuzz_targets/fuzz_compress.rs
@@ -4,5 +4,7 @@ use libfuzzer_sys::fuzz_target;
 
 fuzz_target!(|data: &[u8]| {
     let table = fsst_rs::train("the quick brown fox jumped over the lazy dog".as_bytes());
-    let _ = table.compress(data);
+    let compress = table.compress(data);
+    let decompress = table.decompress(&compress);
+    assert_eq!(&decompress, data);
 });

From af4046bfdbf6ff53b777b7beec0759d0fe91b15f Mon Sep 17 00:00:00 2001
From: Andrew Duffy <andreweduffy@gmail.com>
Date: Thu, 15 Aug 2024 13:40:01 -0400
Subject: [PATCH 4/4] ending zero

---
 tests/correctness.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/correctness.rs b/tests/correctness.rs
index 2f2abf7..a7c7599 100644
--- a/tests/correctness.rs
+++ b/tests/correctness.rs
@@ -51,9 +51,9 @@ fn test_zeros() {
     let training_data: Vec<u8> = vec![0, 1, 2, 3, 4, 0];
     let trained = fsst_rs::train(&training_data);
     println!("compressing with zeros");
-    let compressed = trained.compress(&[0, 4]);
+    let compressed = trained.compress(&[4, 0]);
     println!("decomperssing with zeros");
-    assert_eq!(trained.decompress(&compressed), &[0, 4]);
+    assert_eq!(trained.decompress(&compressed), &[4, 0]);
     println!("done");
 }