[Turbopack] Custom persistence layer (#73029)
This adds a new custom database implementation for Persistent Caching.

Why is this better than an existing database? It can be implemented specifically for our use case and can rely on assumptions and restrictions that other databases can't make.

What is special about our use case?

* We only do one write at a time, but in a single very large transaction that can potentially push GBs into the database.
* We want to fill that large transaction from multiple threads.
* The initial cold build is very important from a performance perspective.

How do we tackle that?

* We only allow a single WriteBatch at a time, but we start writing to disk while the write batch is still being filled (see the write-path sketch below).
* When we commit the WriteBatch, we write a sequence number to make these writes visible.
* Once written and committed, files are immutable (but they can be deleted).
* Every WriteBatch writes additional files that logically override the values from earlier files. (Deletions are stored as tombstones.)
* When the average number of files to read reaches a threshold, we do a compaction.
* A compaction merges multiple files into new sorted files, which reduces that metric.
* We limit the number of merged files to avoid long compactions.
* Every file stores a key range and an AMQF (approximate membership query filter) to quickly determine whether a key can be in that file. The false positive rate per file is 0.1%.
* When we need to look up a key in a file, we do a binary search, as keys are stored in sorted order (sorted by their hash); see the read-path sketch below.
* Files are split into blocks that are compressed with lz4, using two shared compression dictionaries per file (one for keys and one for values).
* We have an additional index block to find the right key block without a search.
* We support multiple key families to split the database for different kinds of data.
* Depending on the size of the value, it will be stored: 1. in a block with other small values, 2. in its own block, or 3. in a separate file (see the placement sketch below).
* We have a block cache for decompressed blocks.
* We have a cache for deserialized AMQFs.
* Files are memory mapped for reading to leverage the OS cache and memory.

See more details in the added README.md file.
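
To make the write path above concrete, here is a minimal, self-contained Rust sketch. The `Db`, `WriteBatch`, `put`, and `commit` names, the file naming, and the `CURRENT` marker file are illustrative stand-ins rather than the actual Turbopack API, and a plain append log stands in for the real sorted, compressed file format.

```rust
// Sketch only: Db/WriteBatch/CURRENT are made-up names, and a plain append
// log stands in for the real sorted, compressed file format.
use std::fs::{self, File};
use std::io::{BufWriter, Write};
use std::path::PathBuf;
use std::sync::mpsc;
use std::thread;

struct Db {
    dir: PathBuf,
    seq: u64, // last committed sequence number
}

impl Db {
    fn open(dir: PathBuf) -> std::io::Result<Db> {
        fs::create_dir_all(&dir)?;
        Ok(Db { dir, seq: 0 })
    }

    // Only one WriteBatch exists at a time; it streams entries into a new,
    // not-yet-visible file while it is being filled.
    fn write_batch(&mut self) -> std::io::Result<WriteBatch> {
        let seq = self.seq + 1;
        let path = self.dir.join(format!("{seq:08}.data"));
        Ok(WriteBatch { file: BufWriter::new(File::create(path)?), seq })
    }

    // Committing flushes the file and records the sequence number, which is
    // what makes the new file visible; until then it is ignored on open.
    fn commit(&mut self, batch: WriteBatch) -> std::io::Result<()> {
        let file = batch.file.into_inner().map_err(|e| e.into_error())?;
        file.sync_all()?;
        fs::write(self.dir.join("CURRENT"), batch.seq.to_string())?;
        self.seq = batch.seq;
        Ok(())
    }
}

struct WriteBatch {
    file: BufWriter<File>,
    seq: u64,
}

impl WriteBatch {
    fn put(&mut self, key: &[u8], value: &[u8]) -> std::io::Result<()> {
        self.file.write_all(&(key.len() as u32).to_le_bytes())?;
        self.file.write_all(key)?;
        self.file.write_all(&(value.len() as u32).to_le_bytes())?;
        self.file.write_all(value)
    }
}

fn main() -> std::io::Result<()> {
    let mut db = Db::open(std::env::temp_dir().join("turbo-cache-sketch"))?;
    let mut batch = db.write_batch()?;

    // Fill the single batch from multiple threads by funneling entries
    // through a channel; disk writes already happen while filling.
    let (tx, rx) = mpsc::channel::<(Vec<u8>, Vec<u8>)>();
    let producers: Vec<_> = (0..4)
        .map(|t| {
            let tx = tx.clone();
            thread::spawn(move || {
                for i in 0..1000u32 {
                    tx.send((format!("task-{t}-{i}").into_bytes(), vec![0u8; 64])).unwrap();
                }
            })
        })
        .collect();
    drop(tx); // close the channel once only the producers hold senders
    for (key, value) in rx {
        batch.put(&key, &value)?;
    }
    for p in producers {
        p.join().unwrap();
    }

    db.commit(batch)?; // the data becomes visible only after this point
    Ok(())
}
```

The shape matches the points in the list above: entries stream to disk while the batch is being filled, so the commit only has to flush and record the sequence number instead of writing gigabytes at the end.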
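The read path can be sketched in the same spirit. The struct layout, the plain in-memory entry table, and the bit-set filter standing in for the AMQF are assumptions for illustration; the real files store lz4-compressed key and value blocks plus an index block and are memory mapped, none of which is modeled here.

```rust
// Sketch only: a toy bit-set filter stands in for the AMQF, and entries are
// kept in memory instead of in compressed, memory-mapped blocks.
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

fn key_hash(key: &[u8]) -> u64 {
    let mut h = DefaultHasher::new();
    key.hash(&mut h);
    h.finish()
}

/// One immutable, sorted file produced by a committed WriteBatch.
struct StaticFile {
    min_hash: u64,
    max_hash: u64,
    filter_bits: Vec<u64>,        // toy filter; the real code uses an AMQF
    entries: Vec<(u64, Vec<u8>)>, // (key hash, value), sorted by hash
}

impl StaticFile {
    fn filter_may_contain(&self, hash: u64) -> bool {
        let bit = (hash % (self.filter_bits.len() as u64 * 64)) as usize;
        self.filter_bits[bit / 64] & (1 << (bit % 64)) != 0
    }

    fn get(&self, hash: u64) -> Option<&[u8]> {
        // 1. Key range check: cheap rejection before touching the filter.
        if hash < self.min_hash || hash > self.max_hash {
            return None;
        }
        // 2. Filter check: the real AMQF has ~0.1% false positives and no
        //    false negatives.
        if !self.filter_may_contain(hash) {
            return None;
        }
        // 3. Binary search, since entries are sorted by key hash.
        self.entries
            .binary_search_by_key(&hash, |(h, _)| *h)
            .ok()
            .map(|i| self.entries[i].1.as_slice())
    }
}

/// Look a key up across files, newest first, so later WriteBatches
/// logically override values written by earlier ones.
fn lookup<'a>(files: &'a [StaticFile], key: &[u8]) -> Option<&'a [u8]> {
    let hash = key_hash(key);
    files.iter().rev().find_map(|f| f.get(hash))
}

fn main() {
    // Build one tiny file by hand just to exercise the lookup path.
    let hash = key_hash(b"hello");
    let mut filter_bits = vec![0u64; 16];
    let bit = (hash % (16 * 64)) as usize;
    filter_bits[bit / 64] |= 1 << (bit % 64);
    let file = StaticFile {
        min_hash: hash,
        max_hash: hash,
        filter_bits,
        entries: vec![(hash, b"world".to_vec())],
    };
    assert_eq!(lookup(&[file], b"hello"), Some(&b"world"[..]));
    println!("lookup ok");
}
```

The newest-first iteration is what makes the "later files logically override earlier files" rule from the list work, and the range plus filter checks are what keep most files out of the binary search entirely.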
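The size-based value placement from the list reads naturally as a small decision function; the cutoffs below are invented for the sketch and are not the thresholds the actual implementation uses.

```rust
// Illustrative only: the cutoffs are made up for the sketch, not the
// thresholds the real implementation uses.
enum ValuePlacement {
    SharedBlock,  // small value, packed into a block with other small values
    OwnBlock,     // medium value, gets its own block
    SeparateFile, // large value, written to its own file
}

fn place_value(len: usize) -> ValuePlacement {
    const SMALL_MAX: usize = 4 * 1024;        // hypothetical cutoff
    const OWN_BLOCK_MAX: usize = 1024 * 1024; // hypothetical cutoff
    if len <= SMALL_MAX {
        ValuePlacement::SharedBlock
    } else if len <= OWN_BLOCK_MAX {
        ValuePlacement::OwnBlock
    } else {
        ValuePlacement::SeparateFile
    }
}

fn main() {
    assert!(matches!(place_value(100), ValuePlacement::SharedBlock));
    assert!(matches!(place_value(100_000), ValuePlacement::OwnBlock));
    assert!(matches!(place_value(10_000_000), ValuePlacement::SeparateFile));
}
```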