Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: separate Compressor and Decompressor #11

Merged
merged 1 commit into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ license = "Apache-2.0"
repository = "https://github.com/spiraldb/fsst"
edition = "2021"

[lib]
name = "fsst"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nicer than typing use fsst_rs::...


[lints.rust]
warnings = "deny"
missing_docs = "deny"
Expand Down
15 changes: 8 additions & 7 deletions benches/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use core::str;

use criterion::{black_box, criterion_group, criterion_main, Criterion};

use fsst_rs::{train, ESCAPE_CODE};
use fsst::{Compressor, ESCAPE_CODE};

const CORPUS: &str = include_str!("dracula.txt");
const TEST: &str = "I found my smattering of German very useful here";
Expand All @@ -17,31 +17,32 @@ fn bench_fsst(c: &mut Criterion) {
let mut group = c.benchmark_group("fsst");
group.bench_function("train", |b| {
let corpus = CORPUS.as_bytes();
b.iter(|| black_box(train(black_box(corpus))));
b.iter(|| black_box(Compressor::train(black_box(corpus))));
});

let table = train(CORPUS);
let compressor = Compressor::train(CORPUS);
let plaintext = TEST.as_bytes();

let compressed = table.compress(plaintext);
let compressed = compressor.compress(plaintext);
let escape_count = compressed.iter().filter(|b| **b == ESCAPE_CODE).count();
let ratio = (plaintext.len() as f64) / (compressed.len() as f64);
println!(
"Escapes = {escape_count}/{}, compression_ratio = {ratio}",
compressed.len()
);

let decompressed = table.decompress(&compressed);
let decompressor = compressor.decompressor();
let decompressed = decompressor.decompress(&compressed);
let decompressed = str::from_utf8(&decompressed).unwrap();
println!("DECODED: {}", decompressed);
assert_eq!(decompressed, TEST);

group.bench_function("compress-single", |b| {
b.iter(|| black_box(table.compress(black_box(plaintext))));
b.iter(|| black_box(compressor.compress(black_box(plaintext))));
});

group.bench_function("decompress-single", |b| {
b.iter(|| black_box(table.decompress(black_box(&compressed))));
b.iter(|| black_box(decompressor.decompress(black_box(&compressed))));
});
}

Expand Down
4 changes: 3 additions & 1 deletion examples/file_compressor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ use std::{
path::Path,
};

use fsst::Compressor;

fn main() {
let args: Vec<_> = std::env::args().skip(1).collect();
assert!(args.len() >= 2, "args TRAINING and FILE must be provided");
Expand All @@ -33,7 +35,7 @@ fn main() {
}

println!("building the compressor from {train_path:?}...");
let compressor = fsst_rs::train(&train_bytes);
let compressor = Compressor::train(&train_bytes);

println!("compressing blocks of {input_path:?} with compressor...");

Expand Down
6 changes: 4 additions & 2 deletions examples/round_trip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,16 @@

use core::str;

use fsst::Compressor;

fn main() {
// Train on a sample.
let sample = "the quick brown fox jumped over the lazy dog";
let trained = fsst_rs::train(sample.as_bytes());
let trained = Compressor::train(sample.as_bytes());
let compressed = trained.compress(sample.as_bytes());
println!("compressed: {} => {}", sample.len(), compressed.len());
// decompress now
let decode = trained.decompress(&compressed);
let decode = trained.decompressor().decompress(&compressed);
let output = str::from_utf8(&decode).unwrap();
println!(
"decoded to the original: len={} text='{}'",
Expand Down
7 changes: 4 additions & 3 deletions fuzz/fuzz_targets/fuzz_compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
use libfuzzer_sys::fuzz_target;

fuzz_target!(|data: &[u8]| {
let table = fsst_rs::train("the quick brown fox jumped over the lazy dog".as_bytes());
let compress = table.compress(data);
let decompress = table.decompress(&compress);
let compressor =
fsst::Compressor::train("the quick brown fox jumped over the lazy dog".as_bytes());
let compress = compressor.compress(data);
let decompress = compressor.decompressor().decompress(&compress);
assert_eq!(&decompress, data);
});
2 changes: 1 addition & 1 deletion fuzz/fuzz_targets/fuzz_train.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
use libfuzzer_sys::fuzz_target;

fuzz_target!(|data: &[u8]| {
let _ = fsst_rs::train(data);
let _ = fsst::Compressor::train(data);
});
58 changes: 30 additions & 28 deletions src/builder.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//! Functions and types used for building a [`SymbolTable`] from a corpus of text.
//! Functions and types used for building a [`Compressor`] from a corpus of text.
//!
//! This module implements the logic from Algorithm 3 of the [FSST Paper].
//!
Expand All @@ -8,7 +8,7 @@ use std::cmp::Ordering;
use std::collections::BinaryHeap;

use crate::find_longest::FindLongestSymbol;
use crate::{Symbol, SymbolTable, MAX_CODE};
use crate::{Compressor, Symbol, MAX_CODE};

#[derive(Debug, Clone)]
struct Counter {
Expand Down Expand Up @@ -53,31 +53,33 @@ impl Counter {
/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
pub const MAX_GENERATIONS: usize = 5;

/// Build and train a `SymbolTable` from a sample corpus of text.
///
/// This function implements the generational algorithm described in the [FSST paper] Section
/// 4.3. Starting with an empty symbol table, it iteratively compresses the corpus, then attempts
/// to merge symbols when doing so would yield better compression than leaving them unmerged. The
/// resulting table will have at most 255 symbols (the 256th symbol is reserved for the escape
/// code).
///
/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
pub fn train(corpus: impl AsRef<[u8]>) -> SymbolTable {
let mut table = SymbolTable::default();
// TODO(aduffy): handle truncating/sampling if the corpus exceeds the required sample size.
let sample = corpus.as_ref();
if sample.is_empty() {
return table;
}
for _generation in 0..MAX_GENERATIONS {
let counter = table.compress_count(sample);
table = table.optimize(counter);
}
impl Compressor {
/// Build and train a `Compressor` from a sample corpus of text.
///
/// This function implements the generational algorithm described in the [FSST paper] Section
/// 4.3. Starting with an empty symbol table, it iteratively compresses the corpus, then attempts
/// to merge symbols when doing so would yield better compression than leaving them unmerged. The
/// resulting table will have at most 255 symbols (the 256th symbol is reserved for the escape
/// code).
///
/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
pub fn train(corpus: impl AsRef<[u8]>) -> Self {
let mut compressor = Self::default();
// TODO(aduffy): handle truncating/sampling if the corpus exceeds the required sample size.
let sample = corpus.as_ref();
if sample.is_empty() {
return compressor;
}
for _generation in 0..MAX_GENERATIONS {
let counter = compressor.compress_count(sample);
compressor = compressor.optimize(counter);
}

table
compressor
}
}

impl SymbolTable {
impl Compressor {
/// Compress the text using the current symbol table. Count the code occurrences
/// and code-pair occurrences to allow us to calculate apparent gain.
fn compress_count(&self, sample: &[u8]) -> Counter {
Expand All @@ -101,7 +103,7 @@ impl SymbolTable {
/// Using a set of counters and the existing set of symbols, build a new
/// set of symbols/codes that optimizes the gain over the distribution in `counter`.
fn optimize(&self, counters: Counter) -> Self {
let mut res = SymbolTable::default();
let mut res = Compressor::default();
let mut pqueue = BinaryHeap::new();
for code1 in 0u16..(256u16 + self.n_symbols as u16) {
let symbol1 = self.symbols[code1 as usize];
Expand Down Expand Up @@ -186,13 +188,13 @@ impl Ord for Candidate {

#[cfg(test)]
mod test {
use crate::{train, ESCAPE_CODE};
use crate::{Compressor, ESCAPE_CODE};

#[test]
fn test_builder() {
// Train a SymbolTable on the toy string
// Train a Compressor on the toy string
let text = "hello world";
let table = train(text.as_bytes());
let table = Compressor::train(text.as_bytes());

// Use the table to compress a string, see the values
let compressed = table.compress(text.as_bytes());
Expand Down
4 changes: 2 additions & 2 deletions src/find_longest/naive.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use crate::find_longest::FindLongestSymbol;
use crate::SymbolTable;
use crate::Compressor;

// Find the code that maps to the symbol with the longest match for a piece of text.
//
// This is the naive algorithm: it simply scans the whole table, so it is very slow.

impl FindLongestSymbol for SymbolTable {
impl FindLongestSymbol for Compressor {
// NOTE(aduffy): if you don't disable inlining, this function won't show up in profiles.
#[inline(never)]
fn find_longest_symbol(&self, text: &[u8]) -> u16 {
Expand Down
Loading
Loading