diff --git a/.github/workflows/ecc.yml b/.github/workflows/ecc.yml
index 25e5b59867..5269998bd3 100644
--- a/.github/workflows/ecc.yml
+++ b/.github/workflows/ecc.yml
@@ -66,4 +66,4 @@ jobs:
       - name: Build openvm-ecc-guest crate for openvm
         working-directory: extensions/ecc/guest
         run: |
-          cargo openvm build
+          cargo openvm build --no-transpile
diff --git a/Cargo.lock b/Cargo.lock
index ff542fd2b6..8e7a4c094c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1082,6 +1082,7 @@ dependencies = [
  "eyre",
  "goblin",
  "hex",
+ "num-bigint-dig",
  "openvm-build",
  "openvm-circuit",
  "openvm-cli-example-test",
@@ -3580,6 +3581,7 @@ dependencies = [
  "parking_lot",
  "rand",
  "rand_xoshiro",
+ "rayon",
  "rustc-hash 2.1.0",
  "serde",
  "static_assertions",
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000..49fac70f94
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 The OpenVM Authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/assets/agg-2.png b/assets/agg-2.png
new file mode 100644
index 0000000000..11fcc8de81
Binary files /dev/null and b/assets/agg-2.png differ
diff --git a/assets/agg.png b/assets/agg.png
new file mode 100644
index 0000000000..2e03723445
Binary files /dev/null and b/assets/agg.png differ
diff --git a/benchmarks/src/bin/base64_json.rs b/benchmarks/src/bin/base64_json.rs
index 2fcc6e8ba4..a8b5e93528 100644
--- a/benchmarks/src/bin/base64_json.rs
+++ b/benchmarks/src/bin/base64_json.rs
@@ -39,7 +39,8 @@ fn main() -> Result<()> {
             .with_extension(Keccak256TranspilerExtension),
     )?;
     let app_config = AppConfig {
-        app_fri_params: FriParameters::standard_with_100_bits_conjectured_security(app_log_blowup),
+        app_fri_params: FriParameters::standard_with_100_bits_conjectured_security(app_log_blowup)
+            .into(),
         app_vm_config: Keccak256Rv32Config::default(),
         leaf_fri_params: FriParameters::standard_with_100_bits_conjectured_security(agg_log_blowup)
             .into(),
diff --git a/benchmarks/src/bin/bincode.rs b/benchmarks/src/bin/bincode.rs
index 65ca4b318b..3db752b6e6 100644
--- a/benchmarks/src/bin/bincode.rs
+++ b/benchmarks/src/bin/bincode.rs
@@ -30,7 +30,7 @@ fn main() -> Result<()> {
     };
 
     let app_config = AppConfig {
-        app_fri_params,
+        app_fri_params: app_fri_params.into(),
         app_vm_config: Rv32ImConfig::default(),
         leaf_fri_params: leaf_fri_params.into(),
         compiler_options,
diff --git a/benchmarks/src/bin/ecrecover.rs b/benchmarks/src/bin/ecrecover.rs
index 1780882201..46c9301a38 100644
--- a/benchmarks/src/bin/ecrecover.rs
+++ b/benchmarks/src/bin/ecrecover.rs
@@ -121,7 +121,8 @@ fn main() -> Result<()> {
     )?;
     // TODO: update sw_setup macros and read it from elf.
     let vm_config = AppConfig {
-        app_fri_params: FriParameters::standard_with_100_bits_conjectured_security(app_log_blowup),
+        app_fri_params: FriParameters::standard_with_100_bits_conjectured_security(app_log_blowup)
+            .into(),
         app_vm_config: Rv32ImEcRecoverConfig::for_curves(vec![SECP256K1_CONFIG.clone()]),
         leaf_fri_params: FriParameters::standard_with_100_bits_conjectured_security(agg_log_blowup)
             .into(),
diff --git a/benchmarks/src/bin/fib_e2e.rs b/benchmarks/src/bin/fib_e2e.rs
index 188119eae1..3a33d93b04 100644
--- a/benchmarks/src/bin/fib_e2e.rs
+++ b/benchmarks/src/bin/fib_e2e.rs
@@ -49,7 +49,7 @@ async fn main() -> Result<()> {
     let max_segment_length = cli_args.max_segment_length.unwrap_or(1_000_000);
 
     let app_config = AppConfig {
-        app_fri_params,
+        app_fri_params: app_fri_params.into(),
         app_vm_config: Rv32ImConfig::with_public_values_and_segment_len(
             NUM_PUBLIC_VALUES,
             max_segment_length,
diff --git a/benchmarks/src/bin/fibonacci.rs b/benchmarks/src/bin/fibonacci.rs
index 93ffa3e746..e894b18c5e 100644
--- a/benchmarks/src/bin/fibonacci.rs
+++ b/benchmarks/src/bin/fibonacci.rs
@@ -51,7 +51,7 @@ fn main() -> Result<()> {
     };
 
     let app_config = AppConfig {
-        app_fri_params,
+        app_fri_params: app_fri_params.into(),
         app_vm_config: Rv32ImConfig::default(),
         leaf_fri_params: leaf_fri_params.into(),
         compiler_options,
diff --git a/benchmarks/src/bin/regex.rs b/benchmarks/src/bin/regex.rs
index 59eaf604c7..369a3c2a69 100644
--- a/benchmarks/src/bin/regex.rs
+++ b/benchmarks/src/bin/regex.rs
@@ -39,7 +39,8 @@ fn main() -> Result<()> {
             .with_extension(Keccak256TranspilerExtension),
     )?;
     let app_config = AppConfig {
-        app_fri_params: FriParameters::standard_with_100_bits_conjectured_security(app_log_blowup),
+        app_fri_params: FriParameters::standard_with_100_bits_conjectured_security(app_log_blowup)
+            .into(),
         app_vm_config: Keccak256Rv32Config::default(),
         leaf_fri_params: FriParameters::standard_with_100_bits_conjectured_security(agg_log_blowup)
             .into(),
diff --git a/benchmarks/src/bin/revm_transfer.rs b/benchmarks/src/bin/revm_transfer.rs
index 10e69ba2c7..7c603bde3f 100644
--- a/benchmarks/src/bin/revm_transfer.rs
+++ b/benchmarks/src/bin/revm_transfer.rs
@@ -37,7 +37,8 @@ fn main() -> Result<()> {
             .with_extension(Rv32IoTranspilerExtension),
     )?;
     let app_config = AppConfig {
-        app_fri_params: FriParameters::standard_with_100_bits_conjectured_security(app_log_blowup),
+        app_fri_params: FriParameters::standard_with_100_bits_conjectured_security(app_log_blowup)
+            .into(),
         app_vm_config: Keccak256Rv32Config::default(),
         leaf_fri_params: FriParameters::standard_with_100_bits_conjectured_security(1).into(),
         compiler_options: CompilerOptions::default().with_cycle_tracker(),
diff --git a/benchmarks/src/bin/rkyv.rs b/benchmarks/src/bin/rkyv.rs
index ff54f894a1..ab4dae2e0e 100644
--- a/benchmarks/src/bin/rkyv.rs
+++ b/benchmarks/src/bin/rkyv.rs
@@ -30,7 +30,7 @@ fn main() -> Result<()> {
     };
 
     let app_config = AppConfig {
-        app_fri_params,
+        app_fri_params: app_fri_params.into(),
         app_vm_config: Rv32ImConfig::default(),
         leaf_fri_params: leaf_fri_params.into(),
         compiler_options,
diff --git a/benchmarks/src/bin/verify_fibair.rs b/benchmarks/src/bin/verify_fibair.rs
index 2b53ee4e22..3bc0ba335f 100644
--- a/benchmarks/src/bin/verify_fibair.rs
+++ b/benchmarks/src/bin/verify_fibair.rs
@@ -48,7 +48,7 @@ fn main() -> Result<()> {
             ..Default::default()
         };
         let app_config = AppConfig {
-            app_fri_params: leaf_fri_params,
+            app_fri_params: leaf_fri_params.into(),
             app_vm_config,
             leaf_fri_params: leaf_fri_params.into(),
             compiler_options,
diff --git a/benchmarks/src/utils.rs b/benchmarks/src/utils.rs
index 2751323916..aad24b8af8 100644
--- a/benchmarks/src/utils.rs
+++ b/benchmarks/src/utils.rs
@@ -89,8 +89,8 @@ where
     VC::Executor: Chip<SC>,
     VC::Periphery: Chip<SC>,
 {
-    counter!("fri.log_blowup").absolute(app_config.app_fri_params.log_blowup as u64);
-    let engine = BabyBearPoseidon2Engine::new(app_config.app_fri_params);
+    counter!("fri.log_blowup").absolute(app_config.app_fri_params.fri_params.log_blowup as u64);
+    let engine = BabyBearPoseidon2Engine::new(app_config.app_fri_params.fri_params);
     let vm = VirtualMachine::new(engine, app_config.app_vm_config.clone());
     // 1. Generate proving key from config.
     let app_pk = time(gauge!("keygen_time_ms"), || {
@@ -98,7 +98,7 @@ where
     });
     // 2. Commit to the exe by generating cached trace for program.
     let committed_exe = time(gauge!("commit_exe_time_ms"), || {
-        commit_app_exe(app_config.app_fri_params, exe)
+        commit_app_exe(app_config.app_fri_params.fri_params, exe)
     });
     // 3. Executes runtime once with full metric collection for flamegraphs (slow).
     // 4. Executes runtime again without metric collection and generate trace.
diff --git a/book/book.toml b/book/book.toml
index 44d812cc99..c1c5a8953f 100644
--- a/book/book.toml
+++ b/book/book.toml
@@ -7,3 +7,4 @@ title = "OpenVM Book"
 
 [output.html]
 site-url = "https://book.openvm.dev/"
+mathjax-support = true
\ No newline at end of file
diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md
index e284f9e982..939a8948eb 100644
--- a/book/src/SUMMARY.md
+++ b/book/src/SUMMARY.md
@@ -19,9 +19,14 @@
 
 # Using Extensions
 
-- [Customizable Extensions](./using-extensions/customizable-extensions.md)
-- [Elliptic Curve Pairing](./using-extensions/pairing.md)
+- [Overview](./custom-extensions/overview.md)
+- [Keccak](./custom-extensions/keccak.md)
+- [Big Integer](./custom-extensions/bigint.md)
+- [Algebra](./custom-extensions/algebra.md)
+- [Elliptic Curve Cryptography](./custom-extensions/ecc.md)
+- [Elliptic Curve Pairing](./custom-extensions/pairing.md)
 
 # Advanced Usage
 
 - [Overview](./advanced-usage/overview.md)
+- [Testing the program](./advanced-usage/testing-program.md)
diff --git a/book/src/advanced-usage/testing-program.md b/book/src/advanced-usage/testing-program.md
index 36839e91f5..1db3d71521 100644
--- a/book/src/advanced-usage/testing-program.md
+++ b/book/src/advanced-usage/testing-program.md
@@ -1,4 +1,3 @@
-
 ## Testing the program
 
 ### Running on the host machine
diff --git a/book/src/custom-extensions/algebra.md b/book/src/custom-extensions/algebra.md
new file mode 100644
index 0000000000..47585c97be
--- /dev/null
+++ b/book/src/custom-extensions/algebra.md
@@ -0,0 +1,156 @@
+# OpenVM Algebra
+
+The OpenVM Algebra extension provides tools to create and manipulate modular arithmetic structures and their complex extensions. For example, if \\(p\\) is prime, OpenVM Algebra can handle modular arithmetic in \\(\mathbb{F}_p\\) and its quadratic extension field \\(\mathbb{F}_p[x]/(x^2 + 1)\\).
+
+The functional part is provided by the `openvm-algebra-guest` crate, which is a guest library that can be used in any OpenVM program. The macros for creating corresponding structs are in the `openvm-algebra-moduli-setup` and `openvm-algebra-complex-macros` crates.
+
+## Available traits and methods
+
+- `IntMod` trait:
+    Defines the type `Repr` and constants `MODULUS`, `NUM_LIMBS`, `ZERO`, and `ONE`. It also provides basic methods for constructing a modular arithmetic object and performing arithmetic operations.
+    - `Repr` typically is `[u8; NUM_LIMBS]`, representing the number's underlying storage.
+    - `MODULUS` is the compile-time known modulus.
+    - `ZERO` and `ONE` represent the additive and multiplicative identities, respectively.
+    - Constructors include `from_repr`, `from_le_bytes`, `from_be_bytes`, `from_u8`, `from_u32`, and `from_u64`.
+
+- `Field` trait:
+    Provides constants `ZERO` and `ONE` and methods for basic arithmetic operations within a field.
+
+## Modular arithmetic
+
+To [leverage](./overview.md) compile-time known moduli for performance, you declare, initialize, and then set up the arithmetic structures:
+
+1. **Declare**: Use the `moduli_declare!` macro to define a modular arithmetic struct. This can be done multiple times in various crates or modules:
+```rust
+moduli_declare! {
+    Bls12_381Fp { modulus = "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab" },
+    Bn254Fp { modulus = "21888242871839275222246405745257275088696311157297823662689037894645226208583" },
+}
+```
+
+This creates `Bls12_381Fp` and `Bn254Fp` structs, each implementing the `IntMod` trait. The modulus parameter must be a string literal in decimal or hexadecimal format.
+
+2. **Init**: Use the `moduli_init!` macro exactly once in the final binary:
+
+```rust
+moduli_init! {
+    "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab",
+    "21888242871839275222246405745257275088696311157297823662689037894645226208583"
+}
+```
+
+This step enumerates the declared moduli (e.g., `0` for the first one, `1` for the second one) and sets up internal linkage so the compiler can generate the appropriate RISC-V instructions associated with each modulus.
+
+3. **Setup**: At runtime, before performing arithmetic, a setup instruction must be sent to ensure security and correctness. For the \\(i\\)-th modulus, you call `setup_<i>()` (e.g., `setup_0()` or `setup_1()`). Alternatively, `setup_all_moduli()` can be used to handle all declared moduli.
+
+**Summary**:
+- `moduli_declare!`: Declares modular arithmetic structures and can be done multiple times.
+- `moduli_init!`: Called once in the final binary to assign and lock in the moduli.
+- `setup_<i>()`/`setup_all_moduli()`: Ensures at runtime that the correct modulus is in use, providing a security check and finalizing the environment for safe arithmetic operations.
+
+## Complex field extension
+
+Complex extensions, such as \\(\mathbb{F}_p[x]/(x^2 + 1)\\), are defined similarly using `complex_declare!` and `complex_init!`:
+
+1. **Declare**:
+
+```rust
+complex_declare! {
+    Bn254Fp2 { mod_type = Bn254Fp }
+}
+```
+
+This creates a `Bn254Fp2` struct, representing a complex extension field. The `mod_type` must implement `IntMod`.
+
+2. **Init**: Called once, after `moduli_init!`, to enumerate these extensions and generate corresponding instructions:
+
+```rust
+complex_init! {
+    Bn254Fp2 { mod_idx = 0 },
+}
+```
+
+Note that you need to use the same type name in `complex_declare!` and `complex_init!`. For example, the following code will **fail** to compile:
+
+```rust
+// moduli related macros...
+
+complex_declare! {
+    Bn254Fp2 { mod_type = Bn254Fp },
+}
+
+pub type Fp2 = Bn254Fp2;
+
+complex_init! {
+    Fp2 { mod_idx = 0 },
+}
+```
+
+Here, `mod_idx` refers to the index of the underlying modulus as initialized by `moduli_init!`.
+
+3. **Setup**: Similar to moduli, call `setup_complex_<i>()` or `setup_all_complex_extensions()` at runtime to secure the environment.
+
+### Example program
+
+Here is a toy example using both the modular arithmetic and complex field extension capabilities:
+```rust
+#![cfg_attr(not(feature = "std"), no_main)]
+#![cfg_attr(not(feature = "std"), no_std)]
+
+use openvm_algebra_guest::IntMod;
+
+openvm::entry!(main);
+
+// This macro will create two structs, `Mod1` and `Mod2`,
+// one for arithmetic modulo 998244353, and the other for arithmetic modulo 1000000007.
+openvm_algebra_moduli_setup::moduli_declare! {
+    Mod1 { modulus = "998244353" },
+    Mod2 { modulus = "1000000007" }
+}
+
+// This macro will initialize the moduli.
+// Now, `Mod1` is the "zeroth" modular struct, and `Mod2` is the "first" one.
+openvm_algebra_moduli_setup::moduli_init! {
+    "998244353", "1000000007"
+}
+
+// This macro will create two structs, `Complex1` and `Complex2`,
+// one for arithmetic in the field $\mathbb{F}_{998244353}[x]/(x^2 + 1)$,
+// and the other for arithmetic in the field $\mathbb{F}_{1000000007}[x]/(x^2 + 1)$.
+openvm_algebra_complex_macros::complex_declare! {
+    Complex1 { mod_type = Mod1 },
+    Complex2 { mod_type = Mod2 },
+}
+
+// The order of these structs does not matter,
+// given that we specify the `mod_idx` parameters properly.
+openvm_algebra_complex_macros::complex_init! {
+    Complex2 { mod_idx = 1 }, Complex1 { mod_idx = 0 },
+}
+
+pub fn main() {
+    // Since we only use an arithmetic operation with `Mod1` and not `Mod2`,
+    // we only need to call `setup_0()` here.
+    setup_0();
+    setup_all_complex_extensions();
+    let a = Complex1::new(Mod1::ZERO, Mod1::from_u32(0x3b8) * Mod1::from_u32(0x100000)); // a = -i in the corresponding field
+    let b = Complex2::new(Mod2::ZERO, Mod2::from_u32(1000000006)); // b = -i in the corresponding field
+    assert_eq!(a.clone() * &a * &a * &a * &a, a); // a^5 = a
+    assert_eq!(b.clone() * &b * &b * &b * &b, b); // b^5 = b
+    // Note that these assertions would fail had we provided the `mod_idx` parameters incorrectly.
+}
+```
+
+### Config parameters
+
+For the guest program to build successfully, all used moduli must be declared in the `.toml` config file in the following format:
+
+```toml
+[app_vm_config.modular]
+supported_modulus = ["115792089237316195423570985008687907853269984665640564039457584007908834671663"]
+
+[app_vm_config.fp2]
+supported_modulus = ["115792089237316195423570985008687907853269984665640564039457584007908834671663"]
+```
+
+The `supported_modulus` parameter is a list of moduli that the guest program will use. They must be provided in decimal format in the `.toml` file.
diff --git a/book/src/custom-extensions/bigint.md b/book/src/custom-extensions/bigint.md
new file mode 100644
index 0000000000..8b0cfd53a5
--- /dev/null
+++ b/book/src/custom-extensions/bigint.md
@@ -0,0 +1,197 @@
+# OpenVM BigInt
+
+The OpenVM BigInt extension (aka `Int256`) provides two structs: `U256` and `I256`. These structs can be used to perform 256-bit arithmetic operations. The functional part is provided by the `openvm-bigint-guest` crate, which is a guest library that can be used in any OpenVM program.
+
+## `U256`
+
+The `U256` struct is a 256-bit unsigned integer type. 
+
+### Constants
+
+The `U256` struct has the following constants:
+
+- `MAX`: The maximum value of a `U256`.
+- `MIN`: The minimum value of a `U256`.
+- `ZERO`: The zero constant.
+
+### Constructors
+
+The `U256` struct implements the following constructors: `from_u8`, `from_u32`, and `from_u64`.
+
+### Binary Operations
+
+The `U256` struct implements the following binary operations: `addition`, `subtraction`, `multiplication`, `bitwise and`, `bitwise or`, `bitwise xor`, `bitwise shift right`, and `bitwise shift left`. All operations will wrap the result when the result is outside the range of the `U256` type.
+
+All of the operations can be used in 6 different ways:
+`U256 op U256` or `U256 op &U256` or `&U256 op U256` or `&U256 op &U256` or `U256 op= U256` or `U256 op= &U256`.
+
+### Other
+
+When using the `U256` struct with `target_os = "zkvm"`, the struct utilizes efficient implementations of comparison operators as well as the `clone` method.
+
+### Example matrix multiplication using `U256`
+
+See the full example [here](https://github.com/openvm-org/openvm/blob/main/crates/toolchain/tests/programs/examples/matrix-power.rs).
+
+```rust
+#![cfg_attr(not(feature = "std"), no_main)]
+#![cfg_attr(not(feature = "std"), no_std)]
+
+openvm::entry!(main);
+use core::array;
+use openvm_bigint_guest::U256;
+
+const N: usize = 16;
+type Matrix = [[U256; N]; N];
+
+pub fn get_matrix(val: u8) -> Matrix {
+    array::from_fn(|_| array::from_fn(|_| U256::from_u8(val)))
+}
+
+pub fn mult(a: &Matrix, b: &Matrix) -> Matrix {
+    let mut c = get_matrix(0);
+    for i in 0..N {
+        for j in 0..N {
+            for k in 0..N {
+                c[i][j] += &a[i][k] * &b[k][j];
+            }
+        }
+    }
+    c
+}
+
+pub fn get_identity_matrix() -> Matrix {
+    let mut res = get_matrix(0);
+    for i in 0..N {
+        res[i][i] = U256::from_u8(1);
+    }
+    res
+}
+
+pub fn main() {
+    let a: Matrix = get_identity_matrix();
+    let b: Matrix = get_matrix(28);
+    let c: Matrix = mult(&a, &b);
+    assert_eq!(c, b);
+}
+```
+
+## `I256`
+
+The `I256` struct is a 256-bit signed integer type. The `I256` struct is very similar to the `U256` struct.
+
+### Constants
+
+The `I256` struct has the following constants:
+
+- `MAX`: The maximum value of a `I256`.
+- `MIN`: The minimum value of a `I256`.
+- `ZERO`: The zero constant.
+
+### Binary Operations
+
+The `I256` struct implements the following binary operations: `addition`, `subtraction`, `multiplication`, `bitwise and`, `bitwise or`, `bitwise xor`, `bitwise shift right`, and `bitwise shift left`. All operations will wrap the result when the result is outside the range of the `I256` type. Note that unlike the `U256`, when performing the shift right operation `I256` will perform an arithmetic shift right (i.e. sign extends the result).
+
+All of the operations can be used in 6 different ways:
+`I256 op I256` or `I256 op &I256` or `&I256 op I256` or `&I256 op &I256` or `I256 op= I256` or `I256 op= &I256`.
+
+### Constructors
+
+The `I256` struct implements the following constructors: `from_i8`, `from_i32`, and `from_i64`.
+
+### Other
+
+When using the `I256` struct with `target_os = "zkvm"`, the struct utilizes efficient implementations of comparison operators as well as the `clone` method.
+
+### Example matrix multiplication using `I256`
+
+See the full example [here](https://github.com/openvm-org/openvm/blob/main/crates/toolchain/tests/programs/examples/signed-matrix-power.rs).
+
+```rust
+#![cfg_attr(not(feature = "std"), no_main)]
+#![cfg_attr(not(feature = "std"), no_std)]
+
+openvm::entry!(main);
+use core::array;
+use openvm_bigint_guest::I256;
+
+const N: usize = 16;
+type Matrix = [[I256; N]; N];
+
+pub fn get_matrix(val: i32) -> Matrix {
+    array::from_fn(|_| array::from_fn(|_| I256::from_i32(val)))
+}
+
+pub fn mult(a: &Matrix, b: &Matrix) -> Matrix {
+    let mut c = get_matrix(0);
+    for i in 0..N {
+        for j in 0..N {
+            for k in 0..N {
+                c[i][j] += &a[i][k] * &b[k][j];
+            }
+        }
+    }
+    c
+}
+
+pub fn get_identity_matrix() -> Matrix {
+    let mut res = get_matrix(0);
+    for i in 0..N {
+        res[i][i] = I256::from_i32(1);
+    }
+    res
+}
+
+pub fn main() {
+    let a: Matrix = get_identity_matrix();
+    let b: Matrix = get_matrix(-28);
+    let c: Matrix = mult(&a, &b);
+    assert_eq!(c, b);
+}
+```
+
+## External Functions
+
+The Bigint Guest extension provides another way to use the native implementation: it exposes external functions that are meant to be linked to other external libraries, which can use them as hooks for the native 256-bit integer implementations. These functions are enabled only when `target_os = "zkvm"`. All of the functions are defined as `unsafe extern "C" fn`. Also, note that you must enable the feature `export-intrinsics` to make them globally linkable.
+
+- `zkvm_u256_wrapping_add_impl(result: *mut u8, a: *const u8, b: *const u8)`: takes in a pointer to the result, and two pointers to the inputs. `result = a + b`.
+- `zkvm_u256_wrapping_sub_impl(result: *mut u8, a: *const u8, b: *const u8)`: takes in a pointer to the result, and two pointers to the inputs. `result = a - b`.
+- `zkvm_u256_wrapping_mul_impl(result: *mut u8, a: *const u8, b: *const u8)`: takes in a pointer to the result, and two pointers to the inputs. `result = a * b`.
+- `zkvm_u256_bitxor_impl(result: *mut u8, a: *const u8, b: *const u8)`: takes in a pointer to the result, and two pointers to the inputs. `result = a ^ b`.
+- `zkvm_u256_bitand_impl(result: *mut u8, a: *const u8, b: *const u8)`: takes in a pointer to the result, and two pointers to the inputs. `result = a & b`.
+- `zkvm_u256_bitor_impl(result: *mut u8, a: *const u8, b: *const u8)`: takes in a pointer to the result, and two pointers to the inputs. `result = a | b`.
+- `zkvm_u256_wrapping_shl_impl(result: *mut u8, a: *const u8, b: *const u8)`: takes in a pointer to the result, and two pointers to the inputs. `result = a << b`.
+- `zkvm_u256_wrapping_shr_impl(result: *mut u8, a: *const u8, b: *const u8)`: takes in a pointer to the result, and two pointers to the inputs. `result = a >> b`.
+- `zkvm_u256_arithmetic_shr_impl(result: *mut u8, a: *const u8, b: *const u8)`: takes in a pointer to the result, and two pointers to the inputs. `result = a.arithmetic_shr(b)`.
+- `zkvm_u256_eq_impl(a: *const u8, b: *const u8) -> bool`: takes in two pointers to the inputs. Returns `true` if `a == b`, otherwise `false`.
+- `zkvm_u256_cmp_impl(a: *const u8, b: *const u8) -> Ordering`: takes in two pointers to the inputs. Returns the ordering of `a` and `b`.
+- `zkvm_u256_clone_impl(result: *mut u8, a: *const u8)`: takes in a pointer to the result buffer, and a pointer to the input. `result = a`.
+
+And in the external library, you can do the following:
+
+```rust
+extern "C" {
+    fn zkvm_u256_wrapping_add_impl(result: *mut u8, a: *const u8, b: *const u8);
+}
+
+fn wrapping_add(a: &Custom_U256, b: &Custom_U256) -> Custom_U256 {
+    #[cfg(target_os = "zkvm")] {
+        let mut result: MaybeUninit<Custom_U256> = MaybeUninit::uninit();
+        unsafe {
+            zkvm_u256_wrapping_add_impl(result.as_mut_ptr() as *mut u8, a as *const u8, b as *const u8);
+        }
+        unsafe { result.assume_init() }
+    }
+    #[cfg(not(target_os = "zkvm"))] {
+        // Regular wrapping add implementation
+    }
+}
+```
+
+### Config parameters
+
+For the guest program to build successfully, add the following to your `.toml` file:
+
+```toml
+[app_vm_config.bigint]
+```
\ No newline at end of file
diff --git a/book/src/custom-extensions/ecc.md b/book/src/custom-extensions/ecc.md
new file mode 100644
index 0000000000..9436d6b4a8
--- /dev/null
+++ b/book/src/custom-extensions/ecc.md
@@ -0,0 +1,115 @@
+# OpenVM ECC
+
+The OpenVM Elliptic Curve Cryptography Extension provides support for elliptic curve operations through the `openvm-ecc-guest` crate.
+
+## Available traits and methods
+
+- `Group` trait:
+  This represents an element of a [group](<https://en.wikipedia.org/wiki/Group_(mathematics)>) where the operation is addition. Therefore the trait includes functions for `add`, `sub`, and `double`.
+
+  - `IDENTITY` is the identity element of the group.
+
+- `CyclicGroup` trait:
+  It's a group that has a generator, so it defines `GENERATOR` and `NEG_GENERATOR`.
+
+- `WeierstrassPoint` trait:
+  It represents an affine point on a Weierstrass elliptic curve and it extends `Group`.
+
+  - `Coordinate` type is the type of the coordinates of the point, and it implements `IntMod`.
+  - `x()`, `y()` are used to get the affine coordinates
+  - `from_xy` is a constructor for the point, which checks if the point is either identity or on the affine curve.
+  - The point supports elliptic curve operations through intrinsic functions `add_ne_nonidentity` and `double_nonidentity`.
+  - `decompress`: Sometimes an elliptic curve point is compressed and represented by its `x` coordinate and the odd/even parity of the `y` coordinate. `decompress` is used to decompress the point back to `(x, y)`.
+
+- `msm`: for multi-scalar multiplication.
+
+- `ecdsa`: for doing ECDSA signature verification and public key recovery from signature.
+
+## Macros
+
+For elliptic curve cryptography, the `openvm-ecc-guest` crate provides macros similar to those in [`openvm-algebra-guest`](./algebra.md):
+
+1. **Declare**: Use `sw_declare!` to define elliptic curves over the previously declared moduli. For example:
+
+```rust
+sw_declare! {
+    Bls12_381G1Affine { mod_type = Bls12_381Fp, b = BLS12_381_B },
+    Bn254G1Affine { mod_type = Bn254Fp, b = BN254_B },
+}
+```
+
+Each declared curve must specify the `mod_type` (implementing `IntMod`) and a constant `b` for the Weierstrass curve equation \\(y^2 = x^3 + b\\).
+This creates `Bls12_381G1Affine` and `Bn254G1Affine` structs which implement the `Group` and `WeierstrassPoint` traits. The underlying memory layout of the structs uses the memory layout of the `Bls12_381Fp` and `Bn254Fp` structs, respectively.
+
+2. **Init**: Called once, it enumerates these curves and allows the compiler to produce optimized instructions:
+
+```rust
+sw_init! {
+    Bls12_381Fp, Bn254Fp,
+}
+```
+
+3. **Setup**: Similar to the moduli and complex extensions, runtime setup instructions ensure that the correct curve parameters are being used, guaranteeing secure operation.
+
+**Summary**:
+
+- `sw_declare!`: Declares elliptic curve structures.
+- `sw_init!`: Initializes them once, linking them to the underlying moduli.
+- `setup_sw_<i>()`/`setup_all_curves()`: Secures runtime correctness.
+
+To use elliptic curve operations on a struct defined with `sw_declare!`, it is expected that the struct for the curve's coordinate field was defined using `moduli_declare!`. In particular, the coordinate field needs to be initialized and set up as described in the [algebra extension](./algebra.md) chapter.
+
+For the basic operations provided by the `WeierstrassPoint` trait, the scalar field is not needed. For the ECDSA functions in the `ecdsa` module, the scalar field must also be declared, initialized, and set up.
+
+## Example program
+
+See a working example [here](https://github.com/openvm-org/openvm/blob/main/crates/toolchain/tests/programs/examples/ec.rs).
+
+To use the ECC extension, add the following dependencies to `Cargo.toml`:
+
+```toml
+openvm-algebra-guest = { git = "https://github.com/openvm-org/openvm.git" }
+openvm-ecc-guest = { git = "https://github.com/openvm-org/openvm.git", features = ["k256"] }
+```
+
+One can define their own ECC structs but we will use the Secp256k1 struct from `openvm-ecc-guest` and thus the `k256` feature should be enabled.
+
+```rust
+use openvm_ecc_guest::{
+    k256::{Secp256k1Coord, Secp256k1Point, Secp256k1Scalar},
+    Group, weierstrass::WeierstrassPoint,
+};
+
+openvm_algebra_guest::moduli_setup::moduli_init! {
+    "0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F",
+    "0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE BAAEDCE6 AF48A03B BFD25E8C D0364141"
+}
+
+openvm_ecc_guest::sw_setup::sw_init! {
+    Secp256k1Coord,
+}
+```
+
+We `moduli_init!` both the coordinate and scalar field because they were declared in the `k256` module, although we will not be using the scalar field below.
+
+With the above we can start doing elliptic curve operations like adding points:
+
+```rust
+pub fn main() {
+    setup_all_moduli();
+    setup_all_curves();
+    let x1 = Secp256k1Coord::from_u32(1);
+    let y1 = Secp256k1Coord::from_le_bytes(&hex!(
+        "EEA7767E580D75BC6FDD7F58D2A84C2614FB22586068DB63B346C6E60AF21842"
+    ));
+    let p1 = Secp256k1Point::from_xy_nonidentity(x1, y1).unwrap();
+
+    let x2 = Secp256k1Coord::from_u32(2);
+    let y2 = Secp256k1Coord::from_le_bytes(&hex!(
+        "D1A847A8F879E0AEE32544DA5BA0B3BD1703A1F52867A5601FF6454DD8180499"
+    ));
+    let p2 = Secp256k1Point::from_xy_nonidentity(x2, y2).unwrap();
+
+    let p3 = &p1 + &p2;
+}
+```
diff --git a/book/src/custom-extensions/keccak.md b/book/src/custom-extensions/keccak.md
new file mode 100644
index 0000000000..527ea6c25b
--- /dev/null
+++ b/book/src/custom-extensions/keccak.md
@@ -0,0 +1,68 @@
+# OpenVM Keccak256
+
+The OpenVM Keccak256 extension provides tools for using the Keccak-256 hash function.
+The functional part is provided by the `openvm-keccak-guest` crate, which is a guest library that can be used in any OpenVM program. 
+
+## Functions for guest code
+
+The OpenVM Keccak256 Guest extension provides two functions for use in your guest code:
+
+- `keccak256(input: &[u8]) -> [u8; 32]`: Computes the Keccak-256 hash of the input data and returns it as an array of 32 bytes.
+- `set_keccak256(input: &[u8], output: &mut [u8; 32])`: Computes the Keccak-256 hash of the input data and writes it into the provided output buffer.
+
+See the full example [here](https://github.com/openvm-org/openvm/blob/main/crates/toolchain/tests/programs/examples/keccak.rs).
+
+### Example:
+```rust
+use openvm_keccak256_guest::keccak256;
+
+pub fn main() {
+    let test_vectors = [
+        ("", "C5D2460186F7233C927E7DB2DCC703C0E500B653CA82273B7BFAD8045D85A470"),
+        ("CC", "EEAD6DBFC7340A56CAEDC044696A168870549A6A7F6F56961E84A54BD9970B8A"),
+    ];
+    for (input, expected_output) in test_vectors.iter() {
+        let input = Vec::from_hex(input).unwrap();
+        let expected_output = Vec::from_hex(expected_output).unwrap();
+        let output = keccak256(&black_box(input));
+        if output != *expected_output {
+            panic!();
+        }
+    }
+}
+```
+
+## Native Keccak256
+
+The Keccak guest extension also provides another way to use the native Keccak-256 implementation. It provides a function that is meant to be linked to other external libraries. The external libraries can use this function as a hook for the Keccak-256 native implementation. This function is enabled only when the target is `zkvm`.
+
+- `native_keccak256(input: *const u8, len: usize, output: *mut u8)`: This function has `C` ABI. It takes in a pointer to the input, the length of the input, and a pointer to the output buffer.
+
+In the external library, you can do the following:
+
+```rust
+extern "C" {
+    fn native_keccak256(input: *const u8, len: usize, output: *mut u8);
+}
+
+fn keccak256(input: &[u8]) -> [u8; 32] {
+    #[cfg(target_os = "zkvm")] {
+    let mut output = [0u8; 32];
+        unsafe {
+            native_keccak256(input.as_ptr(), input.len(), output.as_mut_ptr() as *mut u8);
+        }
+        output
+    }
+    #[cfg(not(target_os = "zkvm"))] {
+        // Regular Keccak-256 implementation
+    }
+}
+```
+
+### Config parameters
+
+For the guest program to build successfully, add the following to your `openvm.toml` file:
+
+```toml
+[app_vm_config.keccak256]
+```
\ No newline at end of file
diff --git a/book/src/custom-extensions/overview.md b/book/src/custom-extensions/overview.md
new file mode 100644
index 0000000000..2b5c427d0c
--- /dev/null
+++ b/book/src/custom-extensions/overview.md
@@ -0,0 +1,23 @@
+# Using Existing Extensions
+
+You can seamlessly integrate certain performance-optimized extensions maintained by the OpenVM team to enhance your arithmetic operations and cryptographic computations.
+
+In this chapter, we will explain how to use the following existing extensions:
+
+- [`openvm-keccak-guest`](./keccak.md) - Keccak256 hash function.
+- [`openvm-bigint-guest`](./bigint.md) - Big integer arithmetic for 256-bit signed and unsigned integers.
+- [`openvm-algebra-guest`](./algebra.md) - Modular arithmetic and complex field extensions.
+- [`openvm-ecc-guest`](./ecc.md) - Elliptic curve cryptography.
+- [`openvm-pairing-guest`](./pairing.md) - Elliptic curve optimal Ate pairings.
+
+Some extensions such as `openvm-keccak-guest` and `openvm-bigint-guest` can be enabled without specifying any additional configuration.
+
+On the other hand, certain arithmetic operations, particularly modular arithmetic, can be optimized significantly when the modulus is known at compile time. This approach requires a framework to inform the compiler about all the moduli and associated arithmetic structures we intend to use. To achieve this, three steps are involved:
+
+1. **Declare**: Introduce a modular arithmetic or related structure, along with its modulus and functionality. This can be done in any library or binary file.
+2. **Init**: Performed exactly once in the final binary. It aggregates all previously declared structures, assigns them stable indices, and sets up linkage so that they can be referenced in generated code.
+3. **Setup**: A one-time runtime procedure for security. This ensures that the compiled code matches the virtual machine’s expectations and that each instruction set is tied to the correct modulus or extension.
+
+These steps ensure both performance and security: performance because the modulus is known at compile time, and security because runtime checks confirm that the correct structures have been initialized.
+
+Our design for the configuration procedure above was inspired by the [EVMMAX proposal](https://github.com/jwasinger/EIPs/blob/evmmax-2/EIPS/eip-6601.md).
diff --git a/book/src/using-extensions/pairing.md b/book/src/custom-extensions/pairing.md
similarity index 98%
rename from book/src/using-extensions/pairing.md
rename to book/src/custom-extensions/pairing.md
index c20fd94de6..2d429da821 100644
--- a/book/src/using-extensions/pairing.md
+++ b/book/src/custom-extensions/pairing.md
@@ -26,7 +26,7 @@ use openvm_algebra_guest::IntMod;
 use openvm::io::read;
 ```
 
-Additionally, we'll need to initialize our moduli and `Fp2` struct via the following macros. For a more in-depth description of these macros, please see the [Customizable Extensions](./customizable-extensions.md) section.
+Additionally, we'll need to initialize our moduli and `Fp2` struct via the following macros. For a more in-depth description of these macros, please see the [OpenVM Algebra](./algebra.md) section.
 
 ```rust
 // These correspond to the BLS12-381 coordinate and scalar moduli, respectively
diff --git a/book/src/getting-started/quickstart.md b/book/src/getting-started/quickstart.md
index 03825f536f..c40f274cd1 100644
--- a/book/src/getting-started/quickstart.md
+++ b/book/src/getting-started/quickstart.md
@@ -10,7 +10,7 @@ First, create a new Rust project.
 cargo init fibonacci
 ```
 
-Since we are using some nightly features, we need to specify the Rust version. Create a `rust-toolchain.toml` file with the following content:
+Since we are using some nightly features, we need to specify the Rust version. Run `rustup component add rust-src --toolchain nightly-2024-10-30` and create a `rust-toolchain.toml` file with the following content:
 
 ```toml
 [toolchain]
diff --git a/book/src/introduction.md b/book/src/introduction.md
index 65a5d72df6..80d66d608f 100644
--- a/book/src/introduction.md
+++ b/book/src/introduction.md
@@ -1,17 +1,24 @@
-# Introduction
+# OpenVM
 
-OpenVM is ...
+_A modular toolkit for extensible zkVMs_
 
-... is _modular_, which means that its functionality is provided by several independent components. In particular, one can expand the functionality of OpenVM by adding new components.
+OpenVM is an open-source zero-knowledge virtual machine (zkVM) framework focused on modularity at every level of the stack. OpenVM is designed for customization and extensibility without sacrificing performance or maintainability.
 
-An _extension_ (we could also call it a _module_ but we prefer not to in order to avoid confusion with the concept of a _module_ in the Rust language) is a component that provides a specific functionality. It consists of the following parts:
+## Key Features
 
-- one
-- two
-- three
+- **Modular no-CPU Architecture**: Unlike traditional machine architectures, the OpenVM architecture has no central processing unit. This design choice allows for seamless integration of custom chips, **without forking or modifying the core architecture**.
 
-...
+- **Extensible Instruction Set**: The instruction set architecture (ISA) is designed to be extended with new custom instructions that integrate directly with the virtual machine.
 
-The next chapters are supposed to serve as a manual for using this modularity.
+- **Rust Frontend**: ISA extensions are directly accessible through a Rust frontend via [intrinsic functions](https://en.wikipedia.org/wiki/Intrinsic_function), providing a smooth developer experience.
 
-# In particular, Chapter 2 is for this, Chapter 3 is for that, et cetera.
+- **On-chain Verification**: Every VM made using the framework comes with out-of-the-box support for unbounded program proving with verification on Ethereum.
+
+## Using This Book
+
+The following chapters will guide you through:
+
+- [Getting started](./getting-started/install.md)
+- [Writing applications](./writing-apps/overview.md) in Rust targeting OpenVM and generating proofs.
+- [Using existing extensions](./custom-extensions/overview.md) to optimize your Rust programs.
+- How to add custom VM extensions
diff --git a/book/src/using-extensions/customizable-extensions.md b/book/src/using-extensions/customizable-extensions.md
deleted file mode 100644
index 1d69997c2a..0000000000
--- a/book/src/using-extensions/customizable-extensions.md
+++ /dev/null
@@ -1,76 +0,0 @@
-# Using already existing extensions
-
-Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-
-## `openvm-algebra`
-
-This crate allows one to create and use structs for convenient modular arithmetic operations, and also for their complex extensions (for example, if $p$ is a prime number, `openvm-algebra` provides methods for modular arithmetic in the field $\mathbb{F}_p[x]/(x^2 + 1)$).
-
-To declare a modular arithmetic struct, one needs to use the `moduli_declare!` macro. A usage example is given below:
-
-```rust
-moduli_declare! {
-    Bls12_381Fp { modulus = "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab" },
-    Bn254Fp { modulus = "21888242871839275222246405745257275088696311157297823662689037894645226208583" },
-}
-```
-
-This creates two structs, `Bls12381_Fp` and `Bn254_Fp`, each representing the modular arithmetic class. These classes implement `Add`, `Sub` and other basic arithmetic operations; the underlying functions used for this are a part of the `IntMod` trait. The modulus for each struct is specified in the `modulus` parameter of the macro. It should be a string literal in either decimal or hexadecimal format (in the latter case, it must start with `0x`).
-
-The arithmetic operations for these classes, when compiling for the `zkvm` target, are converted into RISC-V asm instructions which are distinguished by the `funct7` field. The corresponding "distinguishers assignment" is happening when another macro is called:
-
-```rust
-moduli_init! {
-    "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab",
-    "21888242871839275222246405745257275088696311157297823662689037894645226208583"
-}
-```
-
-This macro **must be called exactly once** in the final executable program, and it must contain all the moduli that have ever been declared in the `moduli_declare!` macros across all the compilation units. It is possible to `declare` a number in decimal and `init` it in hexadecimal, and vice versa.
-
-When `moduli_init!` is called, the moduli in it are enumerated from `0`. For each chip that is used, the first instruction that this chip receives must be a `setup` instruction -- this adds a record to the trace that guarantees that the modulus this chip uses is exactly the one we `init`ed.
-
-To send a setup instruction for the $i$-th struct, one needs to call the `setup_<i>()` function (for instance, `setup_1()`). There is also a function `setup_all_moduli()` that calls all the available `setup` functions.
-
-To summarize:
-
-- `moduli_declare!` declares a struct for a modular arithmetic class. It can be called multiple times across the compilation units.
-- `moduli_init!` initializes the data required for transpiling the program into the RISC-V assembly. **Every modulus ever `declare`d in the program must be among the arguments of `moduli_init!`**.
-- `setup_<i>()` sends a setup instruction for the $i$-th struct. Here, **$i$-th struct is the one that corresponds to the $i$-th modulus in `moduli_init!`**. The order of `moduli_declare!` invocations or the arguments in them does not matter.
-- `setup_all_moduli()` sends setup instructions for all the structs.
-
-## `openvm-ecc`
-
-This crate allows one to create and use structs for elliptic curve cryptography. More specifically, it only supports curves where the defining equation is in short [Weierstrass curves](https://en.wikipedia.org/wiki/Weierstrass_form) (that is, `a = 0`).
-
-To declare an elliptic curve struct, one needs to use the `sw_declare!` macro. A usage example is given below:
-
-```rust
-sw_declare! {
-    Bls12_381G1Affine { mod_type = Bls12_381Fp, b = BLS12_381_B },
-    Bn254G1Affine { mod_type = Bn254Fp, b = BN254_B },
-}
-```
-
-Similar to the `moduli_declare!` macro, the `sw_declare!` macro creates a struct for an elliptic curve. The `mod_type` parameter specifies the type of the modulus for this curve, and the `b` parameter specifies the free coefficient of the curve equation; both of these parameters are required. The `mod_type` parameter must be a struct that implements the `IntMod` trait. The `b` parameter must be a constant.
-
-The arithmetic operations for these classes, when compiling for the `zkvm` target, are converted into RISC-V asm instructions which are distinguished by the `funct7` field. The corresponding "distinguishers assignment" is happening when another macro is called:
-
-```rust
-sw_init! {
-    Bls12_381Fp, Bn254Fp,
-}
-```
-
-Again, this macro **must be called exactly once** in the final executable program, and it must contain all the curves that have ever been declared in the `sw_declare!` macros across all the compilation units.
-
-When `sw_init!` is called, the curves in it are enumerated from `0`. For each chip that is used, the first instruction that this chip receives must be a `setup` instruction -- this adds a record to the trace that guarantees that the curve this chip uses is exactly the one we `init`ed.
-
-To send a setup instruction for the $i$-th struct, one needs to call the `setup_sw_<i>()` function (for instance, `setup_sw_1()`). There is also a function `setup_all_curves()` that calls all the available `setup` functions.
-
-To summarize:
-
-- `sw_declare!` declares a struct for an elliptic curve. It can be called multiple times across the compilation units.
-- `sw_init!` initializes the data required for transpiling the program into the RISC-V assembly. **Every curve ever `declare`d in the program must be among the arguments of `sw_init!`**.
-- `setup_sw_<i>()` sends a setup instruction for the $i$-th struct. Here, **$i$-th struct is the one that corresponds to the $i$-th curve in `sw_init!`**. The order of `sw_declare!` invocations or the arguments in them does not matter.
-- `setup_all_curves()` sends setup instructions for all the structs.
diff --git a/book/src/writing-apps/compile.md b/book/src/writing-apps/compile.md
index 9067cfda0f..f3b18f2570 100644
--- a/book/src/writing-apps/compile.md
+++ b/book/src/writing-apps/compile.md
@@ -1 +1,9 @@
 # Cross-Compilation
+
+First let's define some key terms used in cross-compilation:
+- **host** - the machine you're compiling and/or proving on. Note that one can compile and prove on different machines, but they are both called *host* as they are traditional machine architectures.
+- **guest** - the executable to be run in a different VM architecture (e.g. the OpenVM runtime, or an Android app).
+
+Several things happen when you run the `cargo openvm build` command described in [this section](./write-program.md). In short, this command compiles, on the host, an executable for the guest target.
+It first compiles the program on your *host* platform to a RISC-V ELF and then transpiles that ELF to a different target. See here for some explanation of [cross-compilation](https://rust-lang.github.io/rustup/cross-compilation.html).
+Right now we use the `riscv32im-risc0-zkvm-elf` target, which is available in the [Rust toolchain](https://doc.rust-lang.org/rustc/platform-support/riscv32im-risc0-zkvm-elf.html), but we will contribute an OpenVM target to Rust in the future.
diff --git a/book/src/writing-apps/write-program.md b/book/src/writing-apps/write-program.md
index 66836a4ade..e6e9027ead 100644
--- a/book/src/writing-apps/write-program.md
+++ b/book/src/writing-apps/write-program.md
@@ -22,24 +22,63 @@ More examples of guest programs can be found in the [benchmarks/programs](https:
 
 Although it's usually ok to use std (like in quickstart), not all std functionalities are supported (e.g., randomness). There might be unexpected runtime errors if one uses std, so it is recommended you develop no_std libraries if possible to reduce surprises.
 Even without std, `assert!` and `panic!` can work as normal. To use `std` features, one should add the following to `Cargo.toml` feature sections:
+
 ```toml
 [features]
 std = ["openvm/std"]
-``` 
+```
 
 ### Building and running
 
-*TODO*: point to CLI installation instructions
+_TODO_: point to CLI installation instructions
+
+First we need to build the program targeting the OpenVM runtime, and that requires some configuration. Put the following in `openvm.toml`:
+
+```toml
+[app_fri_params]
+log_blowup = 2
+num_queries = 42
+proof_of_work_bits = 16
+
+[app_vm_config.io]
+[app_vm_config.rv32i]
+[app_vm_config.rv32m]
+range_tuple_checker_sizes = [256, 2048]
+```
+
+And run the following command to build the program:
+
+```bash
+cargo openvm build --config openvm.toml --exe-output outputs/fibonacci.vmexe
+```
+
+Next we can run `keygen` to generate the proving and verifying keys:
+
+```bash
+cargo openvm keygen --config openvm.toml --output outputs/pk --vk-output outputs/vk
+```
+
+Now, to prove the program some input is needed. The input parameter is either a hex string or a file path. So for example if we want to compute the 10th fibonacci number, we can run:
+
+```bash
+cargo openvm prove app --app-pk outputs/pk --exe outputs/fibonacci.vmexe --input "0x000000000000000A" --output outputs/proof
+cargo openvm verify app --app-vk outputs/vk --proof outputs/proof
+```
+
+No errors should be returned, and the proof should be correctly verified.
 
 ## Handling I/O
 
-`openvm::io` provides a few functions to read and write data.
+The program can take input from stdin, with some functions provided by `openvm::io`.
+
+`openvm::io::read` reads the next input from stdin and deserializes it into a generic type `T`, so one should specify the type when calling it:
 
-`read` takes from stdin the next vec and deserialize it into a generic type `T`, so one should specify the type when calling it:
 ```rust
 let n: u64 = read();
 ```
 
-`read_vec` will just read a vector and return `Vec<u8>`.
+`openvm::io::read_vec` will just read a vector and return `Vec<u8>`.
+
+`openvm::io::reveal` sends public values to the final proof (to be read by the smart contract).
 
-`reveal`
+For debugging purposes, `openvm::io::print` and `openvm::io::println` can be used normally, but `println!` will only work if `std` is enabled.
diff --git a/crates/circuits/primitives/derive/src/lib.rs b/crates/circuits/primitives/derive/src/lib.rs
index 1e1ba1aa52..18db7d2970 100644
--- a/crates/circuits/primitives/derive/src/lib.rs
+++ b/crates/circuits/primitives/derive/src/lib.rs
@@ -5,7 +5,7 @@ extern crate proc_macro;
 use itertools::multiunzip;
 use proc_macro::TokenStream;
 use quote::quote;
-use syn::{parse_macro_input, Data, DeriveInput, Fields, GenericParam};
+use syn::{parse_macro_input, Data, DeriveInput, Fields, GenericParam, LitStr, Meta};
 
 #[proc_macro_derive(AlignedBorrow)]
 pub fn aligned_borrow_derive(input: TokenStream) -> TokenStream {
@@ -72,8 +72,9 @@ pub fn aligned_borrow_derive(input: TokenStream) -> TokenStream {
     TokenStream::from(methods)
 }
 
-#[proc_macro_derive(Chip)]
+#[proc_macro_derive(Chip, attributes(chip))]
 pub fn chip_derive(input: TokenStream) -> TokenStream {
+    // Parse the attributes from the struct or enum
     let ast: syn::DeriveInput = syn::parse(input).unwrap();
 
     let name = &ast.ident;
@@ -160,6 +161,37 @@ pub fn chip_derive(input: TokenStream) -> TokenStream {
             let where_clause = new_generics.make_where_clause();
             where_clause.predicates.push(syn::parse_quote! { openvm_stark_backend::config::Domain<SC>: openvm_stark_backend::p3_commit::PolynomialSpace<Val = F>
             });
+            let attributes = ast.attrs.iter().find(|&attr| attr.path().is_ident("chip"));
+            if let Some(attr) = attributes {
+                let mut fail_flag = false;
+
+                match &attr.meta {
+                    Meta::List(meta_list) => {
+                        meta_list
+                            .parse_nested_meta(|meta| {
+                                if meta.path.is_ident("where") {
+                                    let value = meta.value()?; // this parses the `=`
+                                    let s: LitStr = value.parse()?;
+                                    let where_value = s.value();
+                                    where_clause.predicates.push(syn::parse_str(&where_value)?);
+                                } else {
+                                    fail_flag = true;
+                                }
+                                Ok(())
+                            })
+                            .unwrap();
+                    }
+                    _ => fail_flag = true,
+                }
+                if fail_flag {
+                    return syn::Error::new(
+                        name.span(),
+                        "Only `#[chip(where = ...)]` format is supported",
+                    )
+                    .to_compile_error()
+                    .into();
+                }
+            }
 
             quote! {
                 impl #impl_generics openvm_stark_backend::Chip<SC> for #name #ty_generics #where_clause {
diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml
index 759bb9624d..b780ff724f 100644
--- a/crates/cli/Cargo.toml
+++ b/crates/cli/Cargo.toml
@@ -47,6 +47,7 @@ prettytable-rs = "0.10"
 textwrap = "0.16.0"
 ctrlc = "3.4.2"
 toml = { workspace = true }
+num-bigint-dig = { workspace = true, features = ["serde"] }
 
 [dev-dependencies]
 openvm-cli-example-test = { path = "example" }
diff --git a/crates/cli/example/app_config.toml b/crates/cli/example/openvm.toml
similarity index 53%
rename from crates/cli/example/app_config.toml
rename to crates/cli/example/openvm.toml
index 1c74a3cd70..8ac6a25d95 100644
--- a/crates/cli/example/app_config.toml
+++ b/crates/cli/example/openvm.toml
@@ -1,8 +1,3 @@
-[app_fri_params]
-log_blowup = 2
-num_queries = 42
-proof_of_work_bits = 16
-
 [app_vm_config.rv32i]
 [app_vm_config.rv32m]
 range_tuple_checker_sizes = [256, 2048]
diff --git a/crates/cli/src/commands/bench.rs b/crates/cli/src/commands/bench.rs
index e7c7270383..5e4c4bd808 100644
--- a/crates/cli/src/commands/bench.rs
+++ b/crates/cli/src/commands/bench.rs
@@ -17,7 +17,7 @@ use openvm_stark_sdk::{
 };
 
 use super::build::{build, BuildArgs};
-use crate::util::{write_status, Input};
+use crate::util::{classical_exe_path, write_status, Input};
 
 #[derive(Clone, Parser)]
 #[command(name = "bench", about = "(default) Build and prove a program")]
@@ -43,10 +43,8 @@ impl BenchCmd {
         if self.profile {
             setup_tracing();
         }
-        let mut build_args = self.build_args.clone();
-        build_args.transpile = true;
         let elf_path = build(&self.build_args)?.unwrap();
-        let exe_path = build_args.exe_path(&elf_path);
+        let exe_path = classical_exe_path(&elf_path);
         let exe = read_exe_from_file(&exe_path)?;
 
         // TODO: read from openvm.toml
diff --git a/crates/cli/src/commands/build.rs b/crates/cli/src/commands/build.rs
index 30bce3f11f..95841b646f 100644
--- a/crates/cli/src/commands/build.rs
+++ b/crates/cli/src/commands/build.rs
@@ -1,22 +1,17 @@
-use std::{
-    fs::read,
-    path::{Path, PathBuf},
-};
+use std::{fs::read, path::PathBuf};
 
 use clap::Parser;
 use eyre::Result;
 use openvm_build::{
     build_guest_package, find_unique_executable, get_package, GuestOptions, TargetFilter,
 };
-use openvm_rv32im_transpiler::{Rv32ITranspilerExtension, Rv32MTranspilerExtension};
-use openvm_sdk::{
-    config::{AppConfig, SdkVmConfig},
-    fs::write_exe_to_file,
-    Sdk,
-};
-use openvm_transpiler::{elf::Elf, openvm_platform::memory::MEM_SIZE, transpiler::Transpiler};
+use openvm_sdk::{fs::write_exe_to_file, Sdk};
+use openvm_transpiler::{elf::Elf, openvm_platform::memory::MEM_SIZE};
 
-use crate::{default::DEFAULT_MANIFEST_DIR, util::read_to_struct_toml};
+use crate::{
+    default::{DEFAULT_APP_CONFIG_PATH, DEFAULT_APP_EXE_PATH, DEFAULT_MANIFEST_DIR},
+    util::read_config_toml_or_default,
+};
 
 #[derive(Parser)]
 #[command(name = "build", about = "Compile an OpenVM program")]
@@ -53,34 +48,28 @@ pub struct BuildArgs {
     #[arg(
         long,
         default_value = "false",
-        help = "Transpiles the program after building when set"
+        help = "Skips transpilation into exe when set"
     )]
-    pub transpile: bool,
+    pub no_transpile: bool,
 
     #[arg(
         long,
+        default_value = DEFAULT_APP_CONFIG_PATH,
         help = "Path to the SDK config .toml file that specifies the transpiler extensions"
     )]
-    pub transpiler_config: Option<PathBuf>,
+    pub config: PathBuf,
 
     #[arg(
         long,
-        help = "Output path for the transpiled program (default: <ELF base path>.vmexe)"
+        default_value = DEFAULT_APP_EXE_PATH,
+        help = "Output path for the transpiled program"
     )]
-    pub transpile_to: Option<PathBuf>,
+    pub exe_output: PathBuf,
 
     #[arg(long, default_value = "release", help = "Build profile")]
     pub profile: String,
 }
 
-impl BuildArgs {
-    pub fn exe_path(&self, elf_path: &Path) -> PathBuf {
-        self.transpile_to
-            .clone()
-            .unwrap_or_else(|| elf_path.with_extension("vmexe"))
-    }
-}
-
 #[derive(Clone, clap::Args)]
 #[group(required = false, multiple = false)]
 pub struct BinTypeFilter {
@@ -129,23 +118,17 @@ pub(crate) fn build(build_args: &BuildArgs) -> Result<Option<PathBuf>> {
         }
     };
 
-    if build_args.transpile {
+    if !build_args.no_transpile {
         let elf_path = elf_path?;
         println!("[openvm] Transpiling the package...");
-        let output_path = build_args.exe_path(&elf_path);
-        let transpiler = if let Some(transpiler_config) = build_args.transpiler_config.clone() {
-            let app_config: AppConfig<SdkVmConfig> = read_to_struct_toml(&transpiler_config)?;
-            app_config.app_vm_config.transpiler()
-        } else {
-            Transpiler::default()
-                .with_extension(Rv32ITranspilerExtension)
-                .with_extension(Rv32MTranspilerExtension)
-        };
+        let output_path = &build_args.exe_output;
+        let app_config = read_config_toml_or_default(&build_args.config)?;
+        let transpiler = app_config.app_vm_config.transpiler();
 
         let data = read(elf_path.clone())?;
         let elf = Elf::decode(&data, MEM_SIZE as u32)?;
         let exe = Sdk.transpile(elf, transpiler)?;
-        write_exe_to_file(exe, &output_path)?;
+        write_exe_to_file(exe, output_path)?;
 
         println!(
             "[openvm] Successfully transpiled to {}",
diff --git a/crates/cli/src/commands/keygen.rs b/crates/cli/src/commands/keygen.rs
index 2d1b2f67de..678ed8d5ae 100644
--- a/crates/cli/src/commands/keygen.rs
+++ b/crates/cli/src/commands/keygen.rs
@@ -3,20 +3,19 @@ use std::path::PathBuf;
 use clap::Parser;
 use eyre::Result;
 use openvm_sdk::{
-    config::{AppConfig, SdkVmConfig},
     fs::{write_app_pk_to_file, write_app_vk_to_file},
     Sdk,
 };
 
 use crate::{
-    default::{DEFAULT_APP_PK_PATH, DEFAULT_APP_VK_PATH},
-    util::read_to_struct_toml,
+    default::{DEFAULT_APP_CONFIG_PATH, DEFAULT_APP_PK_PATH, DEFAULT_APP_VK_PATH},
+    util::read_config_toml_or_default,
 };
 
 #[derive(Parser)]
 #[command(name = "keygen", about = "Generate an application proving key")]
 pub struct KeygenCmd {
-    #[clap(long, action, help = "Path to app config TOML file")]
+    #[clap(long, action, help = "Path to app config TOML file", default_value = DEFAULT_APP_CONFIG_PATH)]
     config: PathBuf,
 
     #[clap(
@@ -38,7 +37,7 @@ pub struct KeygenCmd {
 
 impl KeygenCmd {
     pub fn run(&self) -> Result<()> {
-        let app_config: AppConfig<SdkVmConfig> = read_to_struct_toml(&self.config)?;
+        let app_config = read_config_toml_or_default(&self.config)?;
         let app_pk = Sdk.app_keygen(app_config)?;
         write_app_vk_to_file(app_pk.get_vk(), &self.vk_output)?;
         write_app_pk_to_file(app_pk, &self.output)?;
diff --git a/crates/cli/src/commands/prove.rs b/crates/cli/src/commands/prove.rs
index 4ade86d73e..bf263d2c1a 100644
--- a/crates/cli/src/commands/prove.rs
+++ b/crates/cli/src/commands/prove.rs
@@ -16,8 +16,8 @@ use openvm_sdk::{
 
 use crate::{
     default::{
-        DEFAULT_AGG_PK_PATH, DEFAULT_APP_PK_PATH, DEFAULT_APP_PROOF_PATH, DEFAULT_EVM_PROOF_PATH,
-        DEFAULT_PARAMS_DIR,
+        DEFAULT_AGG_PK_PATH, DEFAULT_APP_EXE_PATH, DEFAULT_APP_PK_PATH, DEFAULT_APP_PROOF_PATH,
+        DEFAULT_EVM_PROOF_PATH, DEFAULT_PARAMS_DIR,
     },
     util::{read_to_stdin, Input},
 };
@@ -35,7 +35,7 @@ enum ProveSubCommand {
         #[clap(long, action, help = "Path to app proving key", default_value = DEFAULT_APP_PK_PATH)]
         app_pk: PathBuf,
 
-        #[clap(long, action, help = "Path to OpenVM executable")]
+        #[clap(long, action, help = "Path to OpenVM executable", default_value = DEFAULT_APP_EXE_PATH)]
         exe: PathBuf,
 
         #[clap(long, value_parser, help = "Input to OpenVM program")]
@@ -48,7 +48,7 @@ enum ProveSubCommand {
         #[clap(long, action, help = "Path to app proving key", default_value = DEFAULT_APP_PK_PATH)]
         app_pk: PathBuf,
 
-        #[clap(long, action, help = "Path to OpenVM executable")]
+        #[clap(long, action, help = "Path to OpenVM executable", default_value = DEFAULT_APP_EXE_PATH)]
         exe: PathBuf,
 
         #[clap(long, value_parser, help = "Input to OpenVM program")]
diff --git a/crates/cli/src/commands/run.rs b/crates/cli/src/commands/run.rs
index 0bd51f4b8a..e498e14714 100644
--- a/crates/cli/src/commands/run.rs
+++ b/crates/cli/src/commands/run.rs
@@ -2,21 +2,20 @@ use std::path::PathBuf;
 
 use clap::Parser;
 use eyre::Result;
-use openvm_sdk::{
-    config::{AppConfig, SdkVmConfig},
-    fs::read_exe_from_file,
-    Sdk,
-};
+use openvm_sdk::{fs::read_exe_from_file, Sdk};
 
-use crate::util::{read_to_stdin, read_to_struct_toml, Input};
+use crate::{
+    default::{DEFAULT_APP_CONFIG_PATH, DEFAULT_APP_EXE_PATH},
+    util::{read_config_toml_or_default, read_to_stdin, Input},
+};
 
 #[derive(Parser)]
 #[command(name = "run", about = "Run an OpenVM program")]
 pub struct RunCmd {
-    #[clap(long, action, help = "Path to OpenVM executable")]
+    #[clap(long, action, help = "Path to OpenVM executable", default_value = DEFAULT_APP_EXE_PATH)]
     exe: PathBuf,
 
-    #[clap(long, action, help = "Path to app config TOML file")]
+    #[clap(long, action, help = "Path to app config TOML file", default_value = DEFAULT_APP_CONFIG_PATH)]
     config: PathBuf,
 
     #[clap(long, value_parser, help = "Input to OpenVM program")]
@@ -26,7 +25,7 @@ pub struct RunCmd {
 impl RunCmd {
     pub fn run(&self) -> Result<()> {
         let exe = read_exe_from_file(&self.exe)?;
-        let app_config: AppConfig<SdkVmConfig> = read_to_struct_toml(&self.config)?;
+        let app_config = read_config_toml_or_default(&self.config)?;
         let output = Sdk.execute(exe, app_config.app_vm_config, read_to_stdin(&self.input)?)?;
         println!("Execution output: {:?}", output);
         Ok(())
diff --git a/crates/cli/src/default.rs b/crates/cli/src/default.rs
index 539152e4b8..d24bd0d043 100644
--- a/crates/cli/src/default.rs
+++ b/crates/cli/src/default.rs
@@ -1,10 +1,29 @@
+use openvm_sdk::config::{AppConfig, SdkVmConfig};
+use openvm_stark_sdk::config::FriParameters;
+
 pub const DEFAULT_MANIFEST_DIR: &str = ".";
 
 pub const DEFAULT_AGG_PK_PATH: &str = concat!(env!("HOME"), "/.openvm/agg.pk");
 pub const DEFAULT_VERIFIER_PATH: &str = concat!(env!("HOME"), "/.openvm/verifier.sol");
 pub const DEFAULT_PARAMS_DIR: &str = concat!(env!("HOME"), "/.openvm/params/");
 
+pub const DEFAULT_APP_CONFIG_PATH: &str = "./openvm.toml";
+pub const DEFAULT_APP_EXE_PATH: &str = "./openvm/app.vmexe";
 pub const DEFAULT_APP_PK_PATH: &str = "./openvm/app.pk";
 pub const DEFAULT_APP_VK_PATH: &str = "./openvm/app.vk";
 pub const DEFAULT_APP_PROOF_PATH: &str = "./openvm/app.proof";
 pub const DEFAULT_EVM_PROOF_PATH: &str = "./openvm/evm.proof";
+
+pub fn default_app_config() -> AppConfig<SdkVmConfig> {
+    AppConfig {
+        app_fri_params: FriParameters::standard_with_100_bits_conjectured_security(2).into(),
+        app_vm_config: SdkVmConfig::builder()
+            .system(Default::default())
+            .rv32i(Default::default())
+            .rv32m(Default::default())
+            .io(Default::default())
+            .build(),
+        leaf_fri_params: FriParameters::standard_with_100_bits_conjectured_security(2).into(),
+        compiler_options: Default::default(),
+    }
+}
diff --git a/crates/cli/src/util.rs b/crates/cli/src/util.rs
index 4e80152123..a80e9043e3 100644
--- a/crates/cli/src/util.rs
+++ b/crates/cli/src/util.rs
@@ -6,9 +6,14 @@ use std::{
 };
 
 use eyre::Result;
-use openvm_sdk::StdIn;
+use openvm_sdk::{
+    config::{AppConfig, SdkVmConfig},
+    StdIn,
+};
 use serde::de::DeserializeOwned;
 
+use crate::default::default_app_config;
+
 #[allow(dead_code)]
 #[derive(Debug, Clone)]
 pub(crate) enum Input {
@@ -56,6 +61,10 @@ pub(crate) fn write_status(style: &dyn Display, status: &str, msg: &str) {
     println!("{style}{status:>12}{style:#} {msg}");
 }
 
+pub(crate) fn classical_exe_path(elf_path: &Path) -> PathBuf {
+    elf_path.with_extension("vmexe")
+}
+
 pub(crate) fn read_to_struct_toml<T: DeserializeOwned>(path: &PathBuf) -> Result<T> {
     let toml = read_to_string(path.as_ref() as &Path)?;
     let ret = toml::from_str(&toml)?;
@@ -72,3 +81,15 @@ pub(crate) fn read_to_stdin(input: &Option<Input>) -> Result<StdIn> {
         None => Ok(StdIn::default()),
     }
 }
+
+/// Reads the app config from `config`; uses the default config only if the file is absent.
+pub(crate) fn read_config_toml_or_default(config: &PathBuf) -> Result<AppConfig<SdkVmConfig>> {
+    if !config.exists() {
+        println!(
+            "{:?} not found, using default application configuration",
+            config
+        );
+        return Ok(default_app_config());
+    }
+    read_to_struct_toml(config)
+}
diff --git a/crates/cli/tests/app_e2e.rs b/crates/cli/tests/app_e2e.rs
index c68a1b661b..a2af9cb08f 100644
--- a/crates/cli/tests/app_e2e.rs
+++ b/crates/cli/tests/app_e2e.rs
@@ -18,11 +18,10 @@ fn test_cli_app_e2e() -> Result<()> {
             "openvm",
             "build",
             "--manifest-dir",
-            "../sdk/example",
-            "--transpile",
-            "--transpiler-config",
-            "example/app_config.toml",
-            "--transpile-to",
+            "example",
+            "--config",
+            "example/openvm.toml",
+            "--exe-output",
             temp_exe.to_str().unwrap(),
         ],
     )?;
@@ -33,7 +32,7 @@ fn test_cli_app_e2e() -> Result<()> {
             "openvm",
             "keygen",
             "--config",
-            "example/app_config.toml",
+            "example/openvm.toml",
             "--output",
             temp_pk.to_str().unwrap(),
             "--vk-output",
@@ -49,7 +48,7 @@ fn test_cli_app_e2e() -> Result<()> {
             "--exe",
             temp_exe.to_str().unwrap(),
             "--config",
-            "example/app_config.toml",
+            "example/openvm.toml",
         ],
     )?;
 
@@ -86,55 +85,12 @@ fn test_cli_app_e2e() -> Result<()> {
 
 #[test]
 fn test_cli_app_e2e_default_paths() -> Result<()> {
-    let temp_dir = tempdir()?;
     run_cmd("cargo", &["install", "--path", ".", "--force"])?;
-    let temp_exe = temp_dir.path().join("example.vmexe");
-
-    run_cmd(
-        "cargo",
-        &[
-            "openvm",
-            "build",
-            "--manifest-dir",
-            "../sdk/example",
-            "--transpile",
-            "--transpiler-config",
-            "example/app_config.toml",
-            "--transpile-to",
-            temp_exe.to_str().unwrap(),
-        ],
-    )?;
-
-    run_cmd(
-        "cargo",
-        &["openvm", "keygen", "--config", "example/app_config.toml"],
-    )?;
-
-    run_cmd(
-        "cargo",
-        &[
-            "openvm",
-            "run",
-            "--exe",
-            temp_exe.to_str().unwrap(),
-            "--config",
-            "example/app_config.toml",
-        ],
-    )?;
-
-    run_cmd(
-        "cargo",
-        &[
-            "openvm",
-            "prove",
-            "app",
-            "--exe",
-            temp_exe.to_str().unwrap(),
-        ],
-    )?;
-
+    run_cmd("cargo", &["openvm", "build", "--manifest-dir", "example"])?;
+    run_cmd("cargo", &["openvm", "keygen"])?;
+    run_cmd("cargo", &["openvm", "run"])?;
+    run_cmd("cargo", &["openvm", "prove", "app"])?;
     run_cmd("cargo", &["openvm", "verify", "app"])?;
-
     Ok(())
 }
 
diff --git a/crates/sdk/src/config/mod.rs b/crates/sdk/src/config/mod.rs
index aa71384605..76012d66d8 100644
--- a/crates/sdk/src/config/mod.rs
+++ b/crates/sdk/src/config/mod.rs
@@ -6,13 +6,15 @@ use serde::{Deserialize, Serialize};
 mod global;
 pub use global::*;
 
+const DEFAULT_APP_BLOWUP: usize = 2;
 const DEFAULT_LEAF_BLOWUP: usize = 2;
 const DEFAULT_INTERNAL_BLOWUP: usize = 2;
 const DEFAULT_ROOT_BLOWUP: usize = 3;
 
 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct AppConfig<VC> {
-    pub app_fri_params: FriParameters,
+    #[serde(default)]
+    pub app_fri_params: AppFriParams,
     pub app_vm_config: VC,
     #[serde(default)]
     pub leaf_fri_params: LeafFriParams,
@@ -50,7 +52,7 @@ pub struct Halo2Config {
 impl<VC> AppConfig<VC> {
     pub fn new(app_fri_params: FriParameters, app_vm_config: VC) -> Self {
         Self {
-            app_fri_params,
+            app_fri_params: AppFriParams::from(app_fri_params),
             app_vm_config,
             leaf_fri_params: Default::default(),
             compiler_options: Default::default(),
@@ -63,11 +65,9 @@ impl<VC> AppConfig<VC> {
         leaf_fri_params: FriParameters,
     ) -> Self {
         Self {
-            app_fri_params,
+            app_fri_params: AppFriParams::from(app_fri_params),
             app_vm_config,
-            leaf_fri_params: LeafFriParams {
-                fri_params: leaf_fri_params,
-            },
+            leaf_fri_params: LeafFriParams::from(leaf_fri_params),
             compiler_options: Default::default(),
         }
     }
@@ -103,6 +103,27 @@ impl Default for AggConfig {
     }
 }
 
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct AppFriParams {
+    pub fri_params: FriParameters,
+}
+
+impl Default for AppFriParams {
+    fn default() -> Self {
+        Self {
+            fri_params: FriParameters::standard_with_100_bits_conjectured_security(
+                DEFAULT_APP_BLOWUP,
+            ),
+        }
+    }
+}
+
+impl From<FriParameters> for AppFriParams {
+    fn from(fri_params: FriParameters) -> Self {
+        Self { fri_params }
+    }
+}
+
 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct LeafFriParams {
     pub fri_params: FriParameters,
diff --git a/crates/sdk/src/keygen/mod.rs b/crates/sdk/src/keygen/mod.rs
index 62bb8ac168..3bdc8567d7 100644
--- a/crates/sdk/src/keygen/mod.rs
+++ b/crates/sdk/src/keygen/mod.rs
@@ -83,14 +83,17 @@ where
     VC::Periphery: Chip<SC>,
 {
     pub fn keygen(config: AppConfig<VC>) -> Self {
-        let app_engine = BabyBearPoseidon2Engine::new(config.app_fri_params);
+        let app_engine = BabyBearPoseidon2Engine::new(config.app_fri_params.fri_params);
         let app_vm_pk = {
             let vm = VirtualMachine::new(app_engine, config.app_vm_config.clone());
             let vm_pk = vm.keygen();
-            assert!(vm_pk.max_constraint_degree <= config.app_fri_params.max_constraint_degree());
+            assert!(
+                vm_pk.max_constraint_degree
+                    <= config.app_fri_params.fri_params.max_constraint_degree()
+            );
             assert!(config.app_vm_config.system().continuation_enabled);
             VmProvingKey {
-                fri_params: config.app_fri_params,
+                fri_params: config.app_fri_params.fri_params,
                 vm_config: config.app_vm_config.clone(),
                 vm_pk,
             }
@@ -98,7 +101,7 @@ where
         let leaf_committed_exe = {
             let leaf_engine = BabyBearPoseidon2Engine::new(config.leaf_fri_params.fri_params);
             let leaf_program = LeafVmVerifierConfig {
-                app_fri_params: config.app_fri_params,
+                app_fri_params: config.app_fri_params.fri_params,
                 app_system_config: config.app_vm_config.system().clone(),
                 compiler_options: config.compiler_options,
             }
diff --git a/crates/sdk/tests/integration_test.rs b/crates/sdk/tests/integration_test.rs
index 7a57596fcf..c162a61517 100644
--- a/crates/sdk/tests/integration_test.rs
+++ b/crates/sdk/tests/integration_test.rs
@@ -112,7 +112,8 @@ fn agg_stark_config_for_test() -> AggStarkConfig {
 
 fn small_test_app_config(app_log_blowup: usize) -> AppConfig<NativeConfig> {
     AppConfig {
-        app_fri_params: standard_fri_params_with_100_bits_conjectured_security(app_log_blowup),
+        app_fri_params: standard_fri_params_with_100_bits_conjectured_security(app_log_blowup)
+            .into(),
         app_vm_config: NativeConfig::new(
             SystemConfig::default()
                 .with_max_segment_len(200)
diff --git a/crates/vm/Cargo.toml b/crates/vm/Cargo.toml
index 1b0510e1a3..1287a34d82 100644
--- a/crates/vm/Cargo.toml
+++ b/crates/vm/Cargo.toml
@@ -38,6 +38,7 @@ derivative.workspace = true
 static_assertions.workspace = true
 async-trait.workspace = true
 getset.workspace = true
+rayon = { workspace = true, optional = true }
 
 [dev-dependencies]
 p3-dft = { workspace = true }
@@ -70,7 +71,7 @@ hex.workspace = true
 
 [features]
 default = ["parallel", "mimalloc"]
-parallel = ["openvm-stark-backend/parallel"]
+parallel = ["openvm-stark-backend/parallel", "dep:rayon"]
 test-utils = ["openvm-ecc-guest/halo2curves", "dep:openvm-stark-sdk"]
 bench-metrics = [
     "dep:metrics",
diff --git a/crates/vm/src/arch/testing/mod.rs b/crates/vm/src/arch/testing/mod.rs
index c960d5ab18..4ad33640ba 100644
--- a/crates/vm/src/arch/testing/mod.rs
+++ b/crates/vm/src/arch/testing/mod.rs
@@ -1,16 +1,12 @@
 use std::{cell::RefCell, rc::Rc, sync::Arc};
 
-use itertools::izip;
 use openvm_circuit_primitives::var_range::{VariableRangeCheckerBus, VariableRangeCheckerChip};
 use openvm_instructions::instruction::Instruction;
 use openvm_stark_backend::{
     config::{StarkGenericConfig, Val},
     engine::VerificationData,
     p3_field::PrimeField32,
-    p3_matrix::{
-        dense::{DenseMatrix, RowMajorMatrix},
-        Matrix,
-    },
+    p3_matrix::dense::{DenseMatrix, RowMajorMatrix},
     prover::types::AirProofInput,
     verifier::VerificationError,
     Chip,
@@ -267,21 +263,15 @@ where
             let range_checker = memory_controller.borrow().range_checker.clone();
             self = self.load(memory_tester); // dummy memory interactions
             {
-                let memory = memory_controller.borrow();
-                let public_values = memory.generate_public_values_per_air();
-                let airs = memory.airs();
-                drop(memory);
-                let traces = Rc::try_unwrap(memory_controller)
+                let air_proof_inputs = Rc::try_unwrap(memory_controller)
                     .unwrap()
                     .into_inner()
-                    .generate_traces();
-
-                for (pvs, air, trace) in izip!(public_values, airs, traces) {
-                    if trace.height() > 0 {
-                        self.air_proof_inputs
-                            .push(AirProofInput::simple(air, trace, pvs));
-                    }
-                }
+                    .generate_air_proof_inputs();
+                self.air_proof_inputs.extend(
+                    air_proof_inputs
+                        .into_iter()
+                        .filter(|api| api.main_trace_height() > 0),
+                );
             }
             self = self.load(range_checker); // this must be last because other trace generation mutates its state
         }
diff --git a/crates/vm/src/system/memory/adapter/mod.rs b/crates/vm/src/system/memory/adapter/mod.rs
index 652d918cbc..ff015a19c8 100644
--- a/crates/vm/src/system/memory/adapter/mod.rs
+++ b/crates/vm/src/system/memory/adapter/mod.rs
@@ -7,7 +7,7 @@ use openvm_circuit_primitives::{
     is_less_than::IsLtSubAir, utils::next_power_of_two_or_zero,
     var_range::VariableRangeCheckerChip, TraceSubRowGenerator,
 };
-use openvm_circuit_primitives_derive::ChipUsageGetter;
+use openvm_circuit_primitives_derive::{Chip, ChipUsageGetter};
 use openvm_stark_backend::{
     config::{Domain, StarkGenericConfig, Val},
     p3_air::BaseAir,
@@ -31,6 +31,7 @@ mod tests;
 #[derive(Debug, Clone)]
 pub struct AccessAdapterInventory<F> {
     chips: Vec<GenericAccessAdapterChip<F>>,
+    air_names: Vec<String>,
 }
 
 impl<F> AccessAdapterInventory<F> {
@@ -44,19 +45,19 @@ impl<F> AccessAdapterInventory<F> {
         let mb = memory_bus;
         let cmb = clk_max_bits;
         let maan = max_access_adapter_n;
-        Self {
-            chips: [
-                Self::create_access_adapter_chip::<2>(rc.clone(), mb, cmb, maan),
-                Self::create_access_adapter_chip::<4>(rc.clone(), mb, cmb, maan),
-                Self::create_access_adapter_chip::<8>(rc.clone(), mb, cmb, maan),
-                Self::create_access_adapter_chip::<16>(rc.clone(), mb, cmb, maan),
-                Self::create_access_adapter_chip::<32>(rc.clone(), mb, cmb, maan),
-                Self::create_access_adapter_chip::<64>(rc.clone(), mb, cmb, maan),
-            ]
-            .into_iter()
-            .flatten()
-            .collect(),
-        }
+        let chips: Vec<_> = [
+            Self::create_access_adapter_chip::<2>(rc.clone(), mb, cmb, maan),
+            Self::create_access_adapter_chip::<4>(rc.clone(), mb, cmb, maan),
+            Self::create_access_adapter_chip::<8>(rc.clone(), mb, cmb, maan),
+            Self::create_access_adapter_chip::<16>(rc.clone(), mb, cmb, maan),
+            Self::create_access_adapter_chip::<32>(rc.clone(), mb, cmb, maan),
+            Self::create_access_adapter_chip::<64>(rc.clone(), mb, cmb, maan),
+        ]
+        .into_iter()
+        .flatten()
+        .collect();
+        let air_names = (0..chips.len()).map(|i| air_name(1 << (i + 1))).collect();
+        Self { chips, air_names }
     }
     pub fn num_access_adapters(&self) -> usize {
         self.chips.len()
@@ -80,9 +81,16 @@ impl<F> AccessAdapterInventory<F> {
             .map(|chip| chip.current_trace_height())
             .collect()
     }
+    #[allow(dead_code)]
     pub fn get_widths(&self) -> Vec<usize> {
         self.chips.iter().map(|chip| chip.trace_width()).collect()
     }
+    pub fn get_cells(&self) -> Vec<usize> {
+        self.chips
+            .iter()
+            .map(|chip| chip.current_trace_cells())
+            .collect()
+    }
     pub fn airs<SC: StarkGenericConfig>(&self) -> Vec<Arc<dyn AnyRap<SC>>>
     where
         F: PrimeField32,
@@ -90,23 +98,16 @@ impl<F> AccessAdapterInventory<F> {
     {
         self.chips.iter().map(|chip| chip.air()).collect()
     }
-    pub fn generate_traces(self) -> Vec<RowMajorMatrix<F>>
-    where
-        F: PrimeField32,
-    {
-        self.chips
-            .into_par_iter()
-            .map(|chip| chip.generate_trace())
-            .collect()
+    pub fn air_names(&self) -> Vec<String> {
+        self.air_names.clone()
     }
-    #[allow(dead_code)]
-    pub fn generate_air_proof_input<SC: StarkGenericConfig>(self) -> Vec<AirProofInput<SC>>
+    pub fn generate_air_proof_inputs<SC: StarkGenericConfig>(self) -> Vec<AirProofInput<SC>>
     where
         F: PrimeField32,
         Domain<SC>: PolynomialSpace<Val = F>,
     {
         self.chips
-            .into_par_iter()
+            .into_iter()
             .map(|chip| chip.generate_air_proof_input())
             .collect()
     }
@@ -157,8 +158,9 @@ pub trait GenericAccessAdapterChipTrait<F> {
         F: PrimeField32;
 }
 
-#[derive(Debug, Clone, ChipUsageGetter)]
+#[derive(Debug, Clone, Chip, ChipUsageGetter)]
 #[enum_dispatch(GenericAccessAdapterChipTrait<F>)]
+#[chip(where = "F: PrimeField32")]
 enum GenericAccessAdapterChip<F> {
     N2(AccessAdapterChip<F, 2>),
     N4(AccessAdapterChip<F, 4>),
@@ -168,33 +170,6 @@ enum GenericAccessAdapterChip<F> {
     N64(AccessAdapterChip<F, 64>),
 }
 
-impl<SC: StarkGenericConfig> Chip<SC> for GenericAccessAdapterChip<Val<SC>>
-where
-    Val<SC>: PrimeField32,
-{
-    fn air(&self) -> Arc<dyn AnyRap<SC>> {
-        match self {
-            GenericAccessAdapterChip::N2(chip) => chip.air(),
-            GenericAccessAdapterChip::N4(chip) => chip.air(),
-            GenericAccessAdapterChip::N8(chip) => chip.air(),
-            GenericAccessAdapterChip::N16(chip) => chip.air(),
-            GenericAccessAdapterChip::N32(chip) => chip.air(),
-            GenericAccessAdapterChip::N64(chip) => chip.air(),
-        }
-    }
-
-    fn generate_air_proof_input(self) -> AirProofInput<SC> {
-        match self {
-            GenericAccessAdapterChip::N2(chip) => chip.generate_air_proof_input(),
-            GenericAccessAdapterChip::N4(chip) => chip.generate_air_proof_input(),
-            GenericAccessAdapterChip::N8(chip) => chip.generate_air_proof_input(),
-            GenericAccessAdapterChip::N16(chip) => chip.generate_air_proof_input(),
-            GenericAccessAdapterChip::N32(chip) => chip.generate_air_proof_input(),
-            GenericAccessAdapterChip::N64(chip) => chip.generate_air_proof_input(),
-        }
-    }
-}
-
 impl<F> GenericAccessAdapterChip<F> {
     fn new<const N: usize>(
         range_checker: Arc<VariableRangeCheckerChip>,
@@ -313,7 +288,7 @@ where
 
 impl<F, const N: usize> ChipUsageGetter for AccessAdapterChip<F, N> {
     fn air_name(&self) -> String {
-        format!("AccessAdapter<{}>", N)
+        air_name(N)
     }
 
     fn current_trace_height(&self) -> usize {
@@ -324,3 +299,8 @@ impl<F, const N: usize> ChipUsageGetter for AccessAdapterChip<F, N> {
         BaseAir::<F>::width(&self.air)
     }
 }
+
+#[inline]
+fn air_name(n: usize) -> String {
+    format!("AccessAdapter<{}>", n)
+}
diff --git a/crates/vm/src/system/memory/manager/interface.rs b/crates/vm/src/system/memory/manager/interface.rs
index 1ef03726bb..a3a69d8b1a 100644
--- a/crates/vm/src/system/memory/manager/interface.rs
+++ b/crates/vm/src/system/memory/manager/interface.rs
@@ -7,6 +7,7 @@ use crate::system::memory::{
     Equipartition, CHUNK,
 };
 
+#[allow(clippy::large_enum_variant)]
 #[derive(Debug)]
 pub enum MemoryInterface<F> {
     Volatile {
diff --git a/crates/vm/src/system/memory/manager/mod.rs b/crates/vm/src/system/memory/manager/mod.rs
index 33cb989b49..595257a3c4 100644
--- a/crates/vm/src/system/memory/manager/mod.rs
+++ b/crates/vm/src/system/memory/manager/mod.rs
@@ -9,7 +9,6 @@ use std::{
 };
 
 use getset::Getters;
-use itertools::{izip, zip_eq};
 pub use memory::{MemoryReadRecord, MemoryWriteRecord};
 use openvm_circuit_primitives::{
     assert_less_than::{AssertLtSubAir, LessThanAuxCols},
@@ -21,13 +20,13 @@ use openvm_circuit_primitives::{
 use openvm_instructions::exe::MemoryImage;
 use openvm_stark_backend::{
     config::{Domain, StarkGenericConfig},
-    p3_air::BaseAir,
     p3_commit::PolynomialSpace,
     p3_field::PrimeField32,
-    p3_matrix::dense::RowMajorMatrix,
+    p3_maybe_rayon::prelude::{IntoParallelIterator, ParallelIterator},
     p3_util::log2_strict_usize,
     prover::types::AirProofInput,
     rap::AnyRap,
+    Chip, ChipUsageGetter,
 };
 use serde::{Deserialize, Serialize};
 
@@ -66,12 +65,6 @@ pub struct TimestampedValues<T, const N: usize> {
     pub values: [T; N],
 }
 
-#[derive(Clone, Debug)]
-pub struct MemoryControllerResult<F> {
-    traces: Vec<RowMajorMatrix<F>>,
-    public_values: Vec<Vec<F>>,
-}
-
 pub type MemoryControllerRef<F> = Rc<RefCell<MemoryController<F>>>;
 
 /// A equipartition of memory, with timestamps and values.
@@ -106,11 +99,26 @@ pub struct MemoryController<F> {
     memory: Memory<F>,
 
     access_adapters: AccessAdapterInventory<F>,
-    /// If set, the height of the traces will be overridden.
-    overridden_heights: Option<MemoryTraceHeights>,
 
     // Filled during finalization.
-    result: Option<MemoryControllerResult<F>>,
+    final_state: Option<FinalState<F>>,
+}
+
+#[allow(clippy::large_enum_variant)]
+#[derive(Debug)]
+enum FinalState<F> {
+    Volatile(VolatileFinalState<F>),
+    #[allow(dead_code)]
+    Persistent(PersistentFinalState<F>),
+}
+#[derive(Debug, Default)]
+struct VolatileFinalState<F> {
+    _marker: PhantomData<F>,
+}
+#[allow(dead_code)]
+#[derive(Debug)]
+struct PersistentFinalState<F> {
+    final_memory: Equipartition<F, CHUNK>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
@@ -236,8 +244,7 @@ impl<F: PrimeField32> MemoryController<F> {
             ),
             range_checker,
             range_checker_bus,
-            result: None,
-            overridden_heights: None,
+            final_state: None,
         }
     }
 
@@ -279,29 +286,34 @@ impl<F: PrimeField32> MemoryController<F> {
             ),
             range_checker,
             range_checker_bus,
-            result: None,
-            overridden_heights: None,
+            final_state: None,
         }
     }
 
     pub fn set_override_trace_heights(&mut self, overridden_heights: MemoryTraceHeights) {
-        match &self.interface_chip {
-            MemoryInterface::Volatile { .. } => match &overridden_heights {
+        match &mut self.interface_chip {
+            MemoryInterface::Volatile { boundary_chip } => match overridden_heights {
                 MemoryTraceHeights::Volatile(oh) => {
+                    boundary_chip.set_overridden_height(oh.boundary);
                     self.access_adapters
-                        .set_override_trace_heights(oh.access_adapters.clone());
+                        .set_override_trace_heights(oh.access_adapters);
                 }
                 _ => panic!("Expect overridden_heights to be MemoryTraceHeights::Volatile"),
             },
-            MemoryInterface::Persistent { .. } => match &overridden_heights {
+            MemoryInterface::Persistent {
+                boundary_chip,
+                merkle_chip,
+                ..
+            } => match overridden_heights {
                 MemoryTraceHeights::Persistent(oh) => {
+                    boundary_chip.set_overridden_height(oh.boundary);
+                    merkle_chip.set_overridden_height(oh.merkle);
                     self.access_adapters
-                        .set_override_trace_heights(oh.access_adapters.clone());
+                        .set_override_trace_heights(oh.access_adapters);
                 }
                 _ => panic!("Expect overridden_heights to be MemoryTraceHeights::Persistent"),
             },
         }
-        self.overridden_heights = Some(overridden_heights);
     }
 
     pub fn set_initial_memory(&mut self, memory: Equipartition<F, CHUNK>) {
@@ -451,27 +463,15 @@ impl<F: PrimeField32> MemoryController<F> {
         &mut self,
         hasher: Option<&mut impl HasherChip<CHUNK, F>>,
     ) -> Option<Equipartition<F, CHUNK>> {
-        if self.result.is_some() {
+        if self.final_state.is_some() {
             panic!("Cannot finalize more than once");
         }
-        let mut traces = vec![];
-        let mut pvs = vec![];
 
         let (records, final_memory) = match &mut self.interface_chip {
             MemoryInterface::Volatile { boundary_chip } => {
-                let overridden_heights = self.overridden_heights.as_ref().map(|oh| match oh {
-                    MemoryTraceHeights::Volatile(oh) => oh,
-                    _ => unreachable!(),
-                });
                 let (final_memory, records) = self.memory.finalize::<1>();
-                debug_assert_eq!(traces.len(), BOUNDARY_AIR_OFFSET);
-                traces.push(
-                    boundary_chip
-                        .generate_trace(&final_memory, overridden_heights.map(|oh| oh.boundary)),
-                );
-                debug_assert_eq!(pvs.len(), BOUNDARY_AIR_OFFSET);
-                pvs.push(vec![]);
-
+                boundary_chip.finalize(final_memory);
+                self.final_state = Some(FinalState::Volatile(VolatileFinalState::default()));
                 (records, None)
             }
             MemoryInterface::Persistent {
@@ -479,45 +479,24 @@ impl<F: PrimeField32> MemoryController<F> {
                 boundary_chip,
                 initial_memory,
             } => {
-                let overridden_heights = self.overridden_heights.as_ref().map(|oh| match oh {
-                    MemoryTraceHeights::Persistent(oh) => oh,
-                    _ => unreachable!(),
-                });
                 let hasher = hasher.unwrap();
 
-                let (final_partition, records) = self.memory.finalize::<8>();
-                traces.push(boundary_chip.generate_trace(
-                    initial_memory,
-                    &final_partition,
-                    hasher,
-                    overridden_heights.map(|oh| oh.boundary),
-                ));
-                pvs.push(vec![]);
-
+                let (final_partition, records) = self.memory.finalize::<CHUNK>();
+                boundary_chip.finalize(initial_memory, &final_partition, hasher);
                 let final_memory_values = final_partition
-                    .iter()
-                    .map(|(key, value)| (*key, value.values))
+                    .into_par_iter()
+                    .map(|(key, value)| (key, value.values))
                     .collect();
-
                 let initial_node = MemoryNode::tree_from_memory(
                     merkle_chip.air.memory_dimensions,
                     initial_memory,
                     hasher,
                 );
-                let (expand_trace, final_node) = merkle_chip.generate_trace_and_final_tree(
-                    &initial_node,
-                    &final_memory_values,
-                    hasher,
-                    overridden_heights.map(|oh| oh.merkle),
-                );
-
-                debug_assert_eq!(traces.len(), MERKLE_AIR_OFFSET);
-                traces.push(expand_trace);
-                let mut expand_pvs = vec![];
-                expand_pvs.extend(initial_node.hash());
-                expand_pvs.extend(final_node.hash());
-                debug_assert_eq!(pvs.len(), MERKLE_AIR_OFFSET);
-                pvs.push(expand_pvs);
+                merkle_chip.finalize(&initial_node, &final_memory_values, hasher);
+                self.final_state = Some(FinalState::Persistent(PersistentFinalState {
+                    final_memory: final_memory_values.clone(),
+                }));
+                // FIXME: avoid cloning `final_memory_values` above.
                 (records, Some(final_memory_values))
             }
         };
@@ -525,17 +504,6 @@ impl<F: PrimeField32> MemoryController<F> {
             self.access_adapters.add_record(record);
         }
 
-        // FIXME: avoid clone.
-        let aa_traces = self.access_adapters.clone().generate_traces();
-        let aa_pvs = vec![vec![]; aa_traces.len()];
-        traces.extend(aa_traces);
-        pvs.extend(aa_pvs);
-
-        self.result = Some(MemoryControllerResult {
-            traces,
-            public_values: pvs,
-        });
-
         final_memory
     }
 
@@ -543,18 +511,30 @@ impl<F: PrimeField32> MemoryController<F> {
     where
         Domain<SC>: PolynomialSpace<Val = F>,
     {
-        let airs = self.airs();
-        let MemoryControllerResult {
-            traces,
-            public_values,
-        } = self.result.unwrap();
-        izip!(airs, traces, public_values)
-            .map(|(air, trace, pvs)| AirProofInput::simple(air, trace, pvs))
-            .collect()
-    }
+        let mut ret = Vec::new();
 
-    pub fn generate_traces(self) -> Vec<RowMajorMatrix<F>> {
-        self.result.unwrap().traces
+        let Self {
+            interface_chip,
+            access_adapters,
+            ..
+        } = self;
+        match interface_chip {
+            MemoryInterface::Volatile { boundary_chip } => {
+                ret.push(boundary_chip.generate_air_proof_input());
+            }
+            MemoryInterface::Persistent {
+                merkle_chip,
+                boundary_chip,
+                ..
+            } => {
+                debug_assert_eq!(ret.len(), BOUNDARY_AIR_OFFSET);
+                ret.push(boundary_chip.generate_air_proof_input());
+                debug_assert_eq!(ret.len(), MERKLE_AIR_OFFSET);
+                ret.push(merkle_chip.generate_air_proof_input());
+            }
+        }
+        ret.extend(access_adapters.generate_air_proof_inputs());
+        ret
     }
 
     pub fn airs<SC: StarkGenericConfig>(&self) -> Vec<Arc<dyn AnyRap<SC>>>
@@ -566,7 +546,7 @@ impl<F: PrimeField32> MemoryController<F> {
         match &self.interface_chip {
             MemoryInterface::Volatile { boundary_chip } => {
                 debug_assert_eq!(airs.len(), BOUNDARY_AIR_OFFSET);
-                airs.push(Arc::new(boundary_chip.air.clone()))
+                airs.push(boundary_chip.air())
             }
             MemoryInterface::Persistent {
                 boundary_chip,
@@ -574,9 +554,9 @@ impl<F: PrimeField32> MemoryController<F> {
                 ..
             } => {
                 debug_assert_eq!(airs.len(), BOUNDARY_AIR_OFFSET);
-                airs.push(Arc::new(boundary_chip.air.clone()));
+                airs.push(boundary_chip.air());
                 debug_assert_eq!(airs.len(), MERKLE_AIR_OFFSET);
-                airs.push(Arc::new(merkle_chip.air.clone()));
+                airs.push(merkle_chip.air());
             }
         }
         airs.extend(self.access_adapters.airs());
@@ -590,11 +570,7 @@ impl<F: PrimeField32> MemoryController<F> {
         if self.continuation_enabled() {
             num_airs += 1;
         }
-        for n in [2, 4, 8, 16, 32, 64] {
-            if self.mem_config.max_access_adapter_n >= n {
-                num_airs += 1;
-            }
-        }
+        num_airs += self.access_adapters.num_access_adapters();
         num_airs
     }
 
@@ -603,11 +579,7 @@ impl<F: PrimeField32> MemoryController<F> {
         if self.continuation_enabled() {
             air_names.push("Merkle".to_string());
         }
-        for n in [2, 4, 8, 16, 32, 64] {
-            if self.mem_config.max_access_adapter_n >= n {
-                air_names.push(format!("AccessAdapter<{}>", n));
-            }
-        }
+        air_names.extend(self.access_adapters.air_names());
         air_names
     }
 
@@ -620,7 +592,7 @@ impl<F: PrimeField32> MemoryController<F> {
         match &self.interface_chip {
             MemoryInterface::Volatile { boundary_chip } => {
                 MemoryTraceHeights::Volatile(VolatileMemoryTraceHeights {
-                    boundary: boundary_chip.current_height(),
+                    boundary: boundary_chip.current_trace_height(),
                     access_adapters,
                 })
             }
@@ -629,8 +601,8 @@ impl<F: PrimeField32> MemoryController<F> {
                 merkle_chip,
                 ..
             } => MemoryTraceHeights::Persistent(PersistentMemoryTraceHeights {
-                boundary: boundary_chip.current_height(),
-                merkle: merkle_chip.current_height(),
+                boundary: boundary_chip.current_trace_height(),
+                merkle: merkle_chip.current_trace_height(),
                 access_adapters,
             }),
         }
@@ -654,33 +626,23 @@ impl<F: PrimeField32> MemoryController<F> {
         }
     }
 
-    fn trace_widths(&self) -> Vec<usize> {
-        let mut widths = vec![];
+    pub fn current_trace_cells(&self) -> Vec<usize> {
+        let mut ret = Vec::new();
         match &self.interface_chip {
             MemoryInterface::Volatile { boundary_chip } => {
-                widths.push(BaseAir::<F>::width(&boundary_chip.air));
+                ret.push(boundary_chip.current_trace_cells())
             }
             MemoryInterface::Persistent {
                 boundary_chip,
                 merkle_chip,
                 ..
             } => {
-                widths.push(BaseAir::<F>::width(&boundary_chip.air));
-                widths.push(BaseAir::<F>::width(&merkle_chip.air));
+                ret.push(boundary_chip.current_trace_cells());
+                ret.push(merkle_chip.current_trace_cells());
             }
-        };
-        widths.extend(self.access_adapters.get_widths());
-        widths
-    }
-
-    pub fn current_trace_cells(&self) -> Vec<usize> {
-        zip_eq(self.current_trace_heights(), self.trace_widths())
-            .map(|(h, w)| h * w)
-            .collect()
-    }
-
-    pub fn generate_public_values_per_air(&self) -> Vec<Vec<F>> {
-        self.result.as_ref().unwrap().public_values.clone()
+        }
+        ret.extend(self.access_adapters.get_cells());
+        ret
     }
 }
 
diff --git a/crates/vm/src/system/memory/merkle/mod.rs b/crates/vm/src/system/memory/merkle/mod.rs
index 654c86b03b..ad01e25c8b 100644
--- a/crates/vm/src/system/memory/merkle/mod.rs
+++ b/crates/vm/src/system/memory/merkle/mod.rs
@@ -1,5 +1,3 @@
-use std::marker::PhantomData;
-
 use openvm_stark_backend::p3_field::PrimeField32;
 use rustc_hash::FxHashSet;
 
@@ -21,7 +19,14 @@ pub struct MemoryMerkleChip<const CHUNK: usize, F> {
     pub air: MemoryMerkleAir<CHUNK>,
     touched_nodes: FxHashSet<(usize, usize, usize)>,
     num_touched_nonleaves: usize,
-    _marker: PhantomData<F>,
+    final_state: Option<FinalState<CHUNK, F>>,
+    overridden_height: Option<usize>,
+}
+#[derive(Debug)]
+struct FinalState<const CHUNK: usize, F> {
+    rows: Vec<MemoryMerkleCols<F, CHUNK>>,
+    init_root: [F; CHUNK],
+    final_root: [F; CHUNK],
 }
 
 impl<const CHUNK: usize, F: PrimeField32> MemoryMerkleChip<CHUNK, F> {
@@ -43,9 +48,13 @@ impl<const CHUNK: usize, F: PrimeField32> MemoryMerkleChip<CHUNK, F> {
             },
             touched_nodes,
             num_touched_nonleaves: 1,
-            _marker: PhantomData,
+            final_state: None,
+            overridden_height: None,
         }
     }
+    pub fn set_overridden_height(&mut self, override_height: usize) {
+        self.overridden_height = Some(override_height);
+    }
 
     fn touch_node(&mut self, height: usize, as_label: usize, address_label: usize) {
         if self.touched_nodes.insert((height, as_label, address_label)) {
@@ -68,8 +77,4 @@ impl<const CHUNK: usize, F: PrimeField32> MemoryMerkleChip<CHUNK, F> {
             (address.as_canonical_u32() as usize) / CHUNK,
         );
     }
-
-    pub fn current_height(&self) -> usize {
-        2 * self.num_touched_nonleaves
-    }
 }
diff --git a/crates/vm/src/system/memory/merkle/tests/mod.rs b/crates/vm/src/system/memory/merkle/tests/mod.rs
index fc63937a3f..fb800c2d13 100644
--- a/crates/vm/src/system/memory/merkle/tests/mod.rs
+++ b/crates/vm/src/system/memory/merkle/tests/mod.rs
@@ -2,15 +2,18 @@ use std::{
     array,
     borrow::BorrowMut,
     collections::{BTreeMap, BTreeSet, HashSet},
+    sync::Arc,
 };
 
 use openvm_stark_backend::{
     interaction::InteractionType,
     p3_field::{AbstractField, PrimeField32},
     p3_matrix::dense::RowMajorMatrix,
+    prover::types::AirProofInput,
+    Chip, ChipUsageGetter,
 };
 use openvm_stark_sdk::{
-    any_rap_arc_vec, config::baby_bear_poseidon2::BabyBearPoseidon2Engine,
+    config::baby_bear_poseidon2::BabyBearPoseidon2Engine,
     dummy_airs::interaction::dummy_interaction_air::DummyInteractionAir, engine::StarkFriEngine,
     p3_baby_bear::BabyBear, utils::create_seeded_rng,
 };
@@ -80,11 +83,13 @@ fn test<const CHUNK: usize>(
         }
     }
 
-    println!("trace height = {}", chip.current_height());
-    let (trace, final_tree) =
-        chip.generate_trace_and_final_tree(&initial_tree, final_memory, &mut hash_test_chip, None);
-
-    assert_eq!(final_tree, final_tree_check);
+    println!("trace height = {}", chip.current_trace_height());
+    chip.finalize(&initial_tree, final_memory, &mut hash_test_chip);
+    assert_eq!(
+        chip.final_state.as_ref().unwrap().final_root,
+        final_tree_check.hash()
+    );
+    let chip_api = chip.generate_air_proof_input();
 
     let dummy_interaction_air = DummyInteractionAir::new(4 + CHUNK, true, merkle_bus.0);
     let mut dummy_interaction_trace_rows = vec![];
@@ -145,17 +150,14 @@ fn test<const CHUNK: usize>(
         dummy_interaction_trace_rows,
         dummy_interaction_air.field_width() + 1,
     );
-
-    let mut public_values = vec![vec![]; 3];
-    public_values[0].extend(initial_tree.hash());
-    public_values[0].extend(final_tree_check.hash());
-
-    let hash_test_chip_air = hash_test_chip.air();
-    BabyBearPoseidon2Engine::run_simple_test_fast(
-        any_rap_arc_vec![chip.air, dummy_interaction_air, hash_test_chip_air],
-        vec![trace, dummy_interaction_trace, hash_test_chip.trace()],
-        public_values,
-    )
+    let dummy_interaction_api =
+        AirProofInput::simple_no_pis(Arc::new(dummy_interaction_air), dummy_interaction_trace);
+
+    BabyBearPoseidon2Engine::run_test_fast(vec![
+        chip_api,
+        dummy_interaction_api,
+        hash_test_chip.generate_air_proof_input(),
+    ])
     .expect("Verification failed");
 }
 
@@ -251,18 +253,11 @@ fn expand_test_no_accesses() {
         COMPRESSION_BUS,
     );
 
-    let (trace, _) = chip.generate_trace_and_final_tree(&tree, &memory, &mut hash_test_chip, None);
-
-    let mut public_values = vec![vec![]; 2];
-    public_values[0].extend(tree.hash());
-    public_values[0].extend(tree.hash());
-
-    let hash_test_chip_air = hash_test_chip.air();
-    BabyBearPoseidon2Engine::run_simple_test_fast(
-        any_rap_arc_vec![chip.air, hash_test_chip_air],
-        vec![trace, hash_test_chip.trace()],
-        public_values,
-    )
+    chip.finalize(&tree, &memory, &mut hash_test_chip);
+    BabyBearPoseidon2Engine::run_test_fast(vec![
+        chip.generate_air_proof_input(),
+        hash_test_chip.generate_air_proof_input(),
+    ])
     .expect("This should occur");
 }
 
@@ -290,25 +285,22 @@ fn expand_test_negative() {
         COMPRESSION_BUS,
     );
 
-    let (mut trace, _) =
-        chip.generate_trace_and_final_tree(&tree, &memory, &mut hash_test_chip, None);
-    for row in trace.rows_mut() {
-        let row: &mut MemoryMerkleCols<_, DEFAULT_CHUNK> = row.borrow_mut();
-        if row.expand_direction == BabyBear::NEG_ONE {
-            row.left_direction_different = BabyBear::ZERO;
-            row.right_direction_different = BabyBear::ZERO;
+    chip.finalize(&tree, &memory, &mut hash_test_chip);
+    let mut chip_api = chip.generate_air_proof_input();
+    {
+        let trace = chip_api.raw.common_main.as_mut().unwrap();
+        for row in trace.rows_mut() {
+            let row: &mut MemoryMerkleCols<_, DEFAULT_CHUNK> = row.borrow_mut();
+            if row.expand_direction == BabyBear::NEG_ONE {
+                row.left_direction_different = BabyBear::ZERO;
+                row.right_direction_different = BabyBear::ZERO;
+            }
         }
     }
 
-    let mut public_values = vec![vec![]; 2];
-    public_values[0].extend(tree.hash());
-    public_values[0].extend(tree.hash());
-
-    let hash_test_chip_air = hash_test_chip.air();
-    BabyBearPoseidon2Engine::run_simple_test_fast(
-        any_rap_arc_vec![chip.air, hash_test_chip_air],
-        vec![trace, hash_test_chip.trace()],
-        public_values,
-    )
+    BabyBearPoseidon2Engine::run_test_fast(vec![
+        chip_api,
+        hash_test_chip.generate_air_proof_input(),
+    ])
     .expect("This should occur");
 }
diff --git a/crates/vm/src/system/memory/merkle/tests/util.rs b/crates/vm/src/system/memory/merkle/tests/util.rs
index f5104fda98..3bd7a500e3 100644
--- a/crates/vm/src/system/memory/merkle/tests/util.rs
+++ b/crates/vm/src/system/memory/merkle/tests/util.rs
@@ -1,6 +1,13 @@
-use std::array::from_fn;
+use std::{array::from_fn, sync::Arc};
 
-use openvm_stark_backend::{p3_air::BaseAir, p3_field::Field, p3_matrix::dense::RowMajorMatrix};
+use openvm_stark_backend::{
+    config::{Domain, StarkGenericConfig},
+    p3_air::BaseAir,
+    p3_commit::PolynomialSpace,
+    p3_field::Field,
+    p3_matrix::dense::RowMajorMatrix,
+    prover::types::AirProofInput,
+};
 use openvm_stark_sdk::dummy_airs::interaction::dummy_interaction_air::DummyInteractionAir;
 
 use crate::arch::{
@@ -40,6 +47,12 @@ impl<const CHUNK: usize, F: Field> HashTestChip<CHUNK, F> {
         }
         RowMajorMatrix::new(rows, width)
     }
+    pub fn generate_air_proof_input<SC: StarkGenericConfig>(&self) -> AirProofInput<SC>
+    where
+        Domain<SC>: PolynomialSpace<Val = F>,
+    {
+        AirProofInput::simple_no_pis(Arc::new(self.air()), self.trace())
+    }
 }
 
 impl<const CHUNK: usize, F: Field> Hasher<CHUNK, F> for HashTestChip<CHUNK, F> {
diff --git a/crates/vm/src/system/memory/merkle/trace.rs b/crates/vm/src/system/memory/merkle/trace.rs
index 6c9f21fa8d..8045e7fc92 100644
--- a/crates/vm/src/system/memory/merkle/trace.rs
+++ b/crates/vm/src/system/memory/merkle/trace.rs
@@ -1,26 +1,33 @@
 use std::{borrow::BorrowMut, cmp::Reverse, sync::Arc};
 
-use openvm_stark_backend::{p3_field::PrimeField32, p3_matrix::dense::RowMajorMatrix};
+use openvm_stark_backend::{
+    config::{StarkGenericConfig, Val},
+    p3_field::{AbstractField, PrimeField32},
+    p3_matrix::dense::RowMajorMatrix,
+    prover::types::AirProofInput,
+    rap::AnyRap,
+    Chip, ChipUsageGetter,
+};
 use rustc_hash::FxHashSet;
 
 use crate::{
     arch::hasher::HasherChip,
     system::memory::{
         manager::dimensions::MemoryDimensions,
-        merkle::{MemoryMerkleChip, MemoryMerkleCols},
+        merkle::{FinalState, MemoryMerkleChip, MemoryMerkleCols},
         tree::MemoryNode::{self, NonLeaf},
         Equipartition,
     },
 };
 
 impl<const CHUNK: usize, F: PrimeField32> MemoryMerkleChip<CHUNK, F> {
-    pub fn generate_trace_and_final_tree(
+    pub fn finalize(
         &mut self,
         initial_tree: &MemoryNode<CHUNK, F>,
         final_memory: &Equipartition<F, CHUNK>,
         hasher: &mut impl HasherChip<CHUNK, F>,
-        overridden_height: Option<usize>,
-    ) -> (RowMajorMatrix<F>, MemoryNode<CHUNK, F>) {
+    ) {
+        assert!(self.final_state.is_none(), "Merkle chip already finalized");
         // there needs to be a touched node with `height_section` = 0
         // shouldn't be a leaf because
         // trace generation will expect an interaction from MemoryInterfaceChip in that case
@@ -42,13 +49,41 @@ impl<const CHUNK: usize, F: PrimeField32> MemoryMerkleChip<CHUNK, F> {
             0,
             hasher,
         );
+        self.final_state = Some(FinalState {
+            rows,
+            init_root: initial_tree.hash(),
+            final_root: final_tree.hash(),
+        });
+    }
+}
+
+impl<const CHUNK: usize, SC: StarkGenericConfig> Chip<SC> for MemoryMerkleChip<CHUNK, Val<SC>>
+where
+    Val<SC>: PrimeField32,
+{
+    fn air(&self) -> Arc<dyn AnyRap<SC>> {
+        Arc::new(self.air.clone())
+    }
+
+    fn generate_air_proof_input(self) -> AirProofInput<SC> {
+        let air = Arc::new(self.air);
+        assert!(
+            self.final_state.is_some(),
+            "Merkle chip must finalize before trace generation"
+        );
+        let FinalState {
+            mut rows,
+            init_root,
+            final_root,
+        } = self.final_state.unwrap();
         // important that this sort be stable,
         // because we need the initial root to be first and the final root to be second
+        // TODO: do we only need to find all height == 0 instead of sorting?
         rows.sort_by_key(|row| Reverse(row.parent_height));
 
-        let width = MemoryMerkleCols::<F, CHUNK>::width();
+        let width = MemoryMerkleCols::<Val<SC>, CHUNK>::width();
         let mut height = rows.len().next_power_of_two();
-        if let Some(mut oh) = overridden_height {
+        if let Some(mut oh) = self.overridden_height {
             oh = oh.next_power_of_two();
             assert!(
                 oh >= height,
@@ -56,14 +91,28 @@ impl<const CHUNK: usize, F: PrimeField32> MemoryMerkleChip<CHUNK, F> {
             );
             height = oh;
         }
-        let mut trace = F::zero_vec(width * height);
+        let mut trace = Val::<SC>::zero_vec(width * height);
 
         for (trace_row, row) in trace.chunks_exact_mut(width).zip(rows) {
             *trace_row.borrow_mut() = row;
         }
 
         let trace = RowMajorMatrix::new(trace, width);
-        (trace, final_tree)
+        let pvs = init_root.into_iter().chain(final_root).collect();
+        AirProofInput::simple(air, trace, pvs)
+    }
+}
+impl<const CHUNK: usize, F: PrimeField32> ChipUsageGetter for MemoryMerkleChip<CHUNK, F> {
+    fn air_name(&self) -> String {
+        "Merkle".to_string()
+    }
+
+    fn current_trace_height(&self) -> usize {
+        2 * self.num_touched_nonleaves
+    }
+
+    fn trace_width(&self) -> usize {
+        MemoryMerkleCols::<F, CHUNK>::width()
     }
 }
 
diff --git a/crates/vm/src/system/memory/persistent.rs b/crates/vm/src/system/memory/persistent.rs
index e854e591af..57eb71d633 100644
--- a/crates/vm/src/system/memory/persistent.rs
+++ b/crates/vm/src/system/memory/persistent.rs
@@ -1,15 +1,22 @@
 use std::{
     borrow::{Borrow, BorrowMut},
     iter,
+    sync::Arc,
 };
 
 use openvm_circuit_primitives_derive::AlignedBorrow;
+#[allow(unused_imports)]
+use openvm_stark_backend::p3_maybe_rayon::prelude::IndexedParallelIterator;
 use openvm_stark_backend::{
+    config::{StarkGenericConfig, Val},
     interaction::InteractionBuilder,
     p3_air::{Air, BaseAir},
     p3_field::{AbstractField, PrimeField32},
     p3_matrix::{dense::RowMajorMatrix, Matrix},
-    rap::{BaseAirWithPublicValues, PartitionedBaseAir},
+    p3_maybe_rayon::prelude::{IntoParallelIterator, ParallelIterator, ParallelSliceMut},
+    prover::types::AirProofInput,
+    rap::{AnyRap, BaseAirWithPublicValues, PartitionedBaseAir},
+    Chip, ChipUsageGetter,
 };
 use rustc_hash::FxHashSet;
 
@@ -115,7 +122,49 @@ impl<const CHUNK: usize, AB: InteractionBuilder> Air<AB> for PersistentBoundaryA
 #[derive(Debug)]
 pub struct PersistentBoundaryChip<F, const CHUNK: usize> {
     pub air: PersistentBoundaryAir<CHUNK>,
-    touched_labels: FxHashSet<(F, usize)>,
+    touched_labels: TouchedLabels<F, CHUNK>,
+    overridden_height: Option<usize>,
+}
+
+#[derive(Debug)]
+enum TouchedLabels<F, const CHUNK: usize> {
+    Running(FxHashSet<(F, usize)>),
+    Final(Vec<FinalTouchedLabel<F, CHUNK>>),
+}
+
+#[derive(Debug)]
+struct FinalTouchedLabel<F, const CHUNK: usize> {
+    address_space: F,
+    label: usize,
+    init_values: [F; CHUNK],
+    final_values: [F; CHUNK],
+    init_exists: bool,
+    init_hash: [F; CHUNK],
+    final_hash: [F; CHUNK],
+    final_timestamp: u32,
+}
+
+impl<F: PrimeField32, const CHUNK: usize> Default for TouchedLabels<F, CHUNK> {
+    fn default() -> Self {
+        Self::Running(FxHashSet::default())
+    }
+}
+
+impl<F: PrimeField32, const CHUNK: usize> TouchedLabels<F, CHUNK> {
+    fn touch(&mut self, address_space: F, label: usize) {
+        match self {
+            TouchedLabels::Running(touched_labels) => {
+                touched_labels.insert((address_space, label));
+            }
+            TouchedLabels::Final(_) => panic!("Cannot touch after finalization"),
+        }
+    }
+    fn len(&self) -> usize {
+        match self {
+            TouchedLabels::Running(touched_labels) => touched_labels.len(),
+            TouchedLabels::Final(touched_labels) => touched_labels.len(),
+        }
+    }
 }
 
 impl<const CHUNK: usize, F: PrimeField32> PersistentBoundaryChip<F, CHUNK> {
@@ -132,78 +181,133 @@ impl<const CHUNK: usize, F: PrimeField32> PersistentBoundaryChip<F, CHUNK> {
                 merkle_bus,
                 compression_bus,
             },
-            touched_labels: FxHashSet::default(),
+            touched_labels: Default::default(),
+            overridden_height: None,
         }
     }
 
-    pub fn touch_address(&mut self, address_space: F, pointer: F) {
-        let label = pointer.as_canonical_u32() as usize / CHUNK;
-        self.touched_labels.insert((address_space, label));
+    pub fn set_overridden_height(&mut self, overridden_height: usize) {
+        self.overridden_height = Some(overridden_height);
     }
 
-    pub fn current_height(&self) -> usize {
-        2 * self.touched_labels.len()
+    pub fn touch_address(&mut self, address_space: F, pointer: F) {
+        let label = pointer.as_canonical_u32() as usize / CHUNK;
+        self.touched_labels.touch(address_space, label);
     }
 
-    pub fn generate_trace(
-        &self,
+    pub fn finalize(
+        &mut self,
         initial_memory: &Equipartition<F, CHUNK>,
         final_memory: &TimestampedEquipartition<F, CHUNK>,
         hasher: &mut impl HasherChip<CHUNK, F>,
-        overridden_height: Option<usize>,
-    ) -> RowMajorMatrix<F> {
-        let width = PersistentBoundaryCols::<F, CHUNK>::width();
-        // Boundary AIR should always present in order to fix the AIR ID of merkle AIR.
-        let mut height = (2 * self.touched_labels.len()).next_power_of_two();
-        if let Some(mut oh) = overridden_height {
-            oh = oh.next_power_of_two();
-            assert!(
-                oh >= height,
-                "Overridden height is less than the required height"
-            );
-            height = oh;
+    ) {
+        match &mut self.touched_labels {
+            TouchedLabels::Running(touched_labels) => {
+                // TODO: parallelize this.
+                let final_touched_labels = touched_labels
+                    .iter()
+                    .map(|touched_label| {
+                        let (init_exists, initial_hash, init_values) =
+                            match initial_memory.get(touched_label) {
+                                Some(values) => (true, hasher.hash_and_record(values), *values),
+                                None => (
+                                    false, // label absent from initial memory
+                                    hasher.hash_and_record(&[F::ZERO; CHUNK]),
+                                    [F::ZERO; CHUNK],
+                                ),
+                            };
+                        let timestamped_values = final_memory.get(touched_label).unwrap();
+                        let final_hash = hasher.hash_and_record(&timestamped_values.values);
+                        FinalTouchedLabel {
+                            address_space: touched_label.0,
+                            label: touched_label.1,
+                            init_values,
+                            final_values: timestamped_values.values,
+                            init_exists,
+                            init_hash: initial_hash,
+                            final_hash,
+                            final_timestamp: timestamped_values.timestamp,
+                        }
+                    })
+                    .collect();
+                self.touched_labels = TouchedLabels::Final(final_touched_labels);
+            }
+            TouchedLabels::Final(_) => panic!("Cannot finalize after finalization"),
         }
-        let mut rows = F::zero_vec(height * width);
-
-        for (row, &(address_space, label)) in
-            rows.chunks_mut(2 * width).zip(self.touched_labels.iter())
-        {
-            let (initial_row, final_row) = row.split_at_mut(width);
-            *initial_row.borrow_mut() = match initial_memory.get(&(address_space, label)) {
-                Some(values) => {
-                    let initial_hash = hasher.hash_and_record(values);
-                    PersistentBoundaryCols {
-                        expand_direction: F::ONE,
-                        address_space,
-                        leaf_label: F::from_canonical_usize(label),
-                        values: *values,
-                        hash: initial_hash,
-                        timestamp: F::from_canonical_u32(INITIAL_TIMESTAMP),
-                    }
-                }
-                None => {
-                    let initial_hash = hasher.hash_and_record(&[F::ZERO; CHUNK]);
-                    PersistentBoundaryCols {
-                        expand_direction: F::ONE,
-                        address_space,
-                        leaf_label: F::from_canonical_usize(label),
-                        values: [F::ZERO; CHUNK],
-                        hash: initial_hash,
-                        timestamp: F::ZERO,
-                    }
-                }
-            };
-            let timestamped_values = final_memory.get(&(address_space, label)).unwrap();
-            let final_hash = hasher.hash_and_record(&timestamped_values.values);
-            *final_row.borrow_mut() = PersistentBoundaryCols {
-                expand_direction: F::NEG_ONE,
-                address_space,
-                leaf_label: F::from_canonical_usize(label),
-                values: timestamped_values.values,
-                hash: final_hash,
-                timestamp: F::from_canonical_u32(timestamped_values.timestamp),
+    }
+}
+
+impl<const CHUNK: usize, SC: StarkGenericConfig> Chip<SC> for PersistentBoundaryChip<Val<SC>, CHUNK>
+where
+    Val<SC>: PrimeField32,
+{
+    fn air(&self) -> Arc<dyn AnyRap<SC>> {
+        Arc::new(self.air.clone())
+    }
+
+    fn generate_air_proof_input(self) -> AirProofInput<SC> {
+        let air = Arc::new(self.air);
+        let trace = {
+            let width = PersistentBoundaryCols::<Val<SC>, CHUNK>::width();
+            // Boundary AIR should always be present in order to fix the AIR ID of merkle AIR.
+            let mut height = (2 * self.touched_labels.len()).next_power_of_two();
+            if let Some(mut oh) = self.overridden_height {
+                oh = oh.next_power_of_two();
+                assert!(
+                    oh >= height,
+                    "Overridden height is less than the required height"
+                );
+                height = oh;
+            }
+            let mut rows = Val::<SC>::zero_vec(height * width);
+
+            let touched_labels = match self.touched_labels {
+                TouchedLabels::Final(touched_labels) => touched_labels,
+                TouchedLabels::Running(_) => panic!("Cannot generate trace before finalization"),
             };
-        }
-        RowMajorMatrix::new(rows, width)
+
+            rows.par_chunks_mut(2 * width)
+                .zip(touched_labels.into_par_iter())
+                .for_each(|(row, touched_label)| {
+                    let (initial_row, final_row) = row.split_at_mut(width);
+                    *initial_row.borrow_mut() = PersistentBoundaryCols {
+                        expand_direction: Val::<SC>::ONE,
+                        address_space: touched_label.address_space,
+                        leaf_label: Val::<SC>::from_canonical_usize(touched_label.label),
+                        values: touched_label.init_values,
+                        hash: touched_label.init_hash,
+                        timestamp: if touched_label.init_exists {
+                            Val::<SC>::from_canonical_u32(INITIAL_TIMESTAMP)
+                        } else {
+                            Val::<SC>::ZERO
+                        },
+                    };
+
+                    *final_row.borrow_mut() = PersistentBoundaryCols {
+                        expand_direction: Val::<SC>::NEG_ONE,
+                        address_space: touched_label.address_space,
+                        leaf_label: Val::<SC>::from_canonical_usize(touched_label.label),
+                        values: touched_label.final_values,
+                        hash: touched_label.final_hash,
+                        timestamp: Val::<SC>::from_canonical_u32(touched_label.final_timestamp),
+                    };
+                });
+            RowMajorMatrix::new(rows, width)
+        };
+        AirProofInput::simple_no_pis(air, trace)
+    }
+}
+
+impl<const CHUNK: usize, F: PrimeField32> ChipUsageGetter for PersistentBoundaryChip<F, CHUNK> {
+    fn air_name(&self) -> String {
+        "Boundary".to_string()
+    }
+
+    fn current_trace_height(&self) -> usize {
+        2 * self.touched_labels.len()
+    }
+
+    fn trace_width(&self) -> usize {
+        PersistentBoundaryCols::<F, CHUNK>::width()
     }
 }
diff --git a/crates/vm/src/system/memory/volatile/mod.rs b/crates/vm/src/system/memory/volatile/mod.rs
index b29532d975..04e0c68579 100644
--- a/crates/vm/src/system/memory/volatile/mod.rs
+++ b/crates/vm/src/system/memory/volatile/mod.rs
@@ -14,12 +14,15 @@ use openvm_circuit_primitives::{
 };
 use openvm_circuit_primitives_derive::AlignedBorrow;
 use openvm_stark_backend::{
+    config::{StarkGenericConfig, Val},
     interaction::InteractionBuilder,
     p3_air::{Air, AirBuilder, BaseAir},
     p3_field::{AbstractField, Field, PrimeField32},
     p3_matrix::{dense::RowMajorMatrix, Matrix},
     p3_maybe_rayon::prelude::*,
-    rap::{BaseAirWithPublicValues, PartitionedBaseAir},
+    prover::types::AirProofInput,
+    rap::{AnyRap, BaseAirWithPublicValues, PartitionedBaseAir},
+    Chip, ChipUsageGetter,
 };
 
 use super::TimestampedEquipartition;
@@ -132,6 +135,8 @@ pub struct VolatileBoundaryChip<F> {
     pub air: VolatileBoundaryAir,
     touched_addresses: HashSet<(F, F)>,
     range_checker: Arc<VariableRangeCheckerChip>,
+    overridden_height: Option<usize>,
+    final_memory: Option<TimestampedEquipartition<F, 1>>,
 }
 
 impl<F: Field> VolatileBoundaryChip<F> {
@@ -151,6 +156,8 @@ impl<F: Field> VolatileBoundaryChip<F> {
             ),
             touched_addresses: HashSet::new(),
             range_checker,
+            overridden_height: None,
+            final_memory: None,
         }
     }
 
@@ -161,21 +168,36 @@ impl<F: Field> VolatileBoundaryChip<F> {
     pub fn all_addresses(&self) -> Vec<(F, F)> {
         self.touched_addresses.iter().cloned().collect()
     }
-
-    pub fn current_height(&self) -> usize {
-        self.touched_addresses.len()
-    }
 }
 
 impl<F: PrimeField32> VolatileBoundaryChip<F> {
+    pub fn set_overridden_height(&mut self, overridden_height: usize) {
+        self.overridden_height = Some(overridden_height);
+    }
     /// Volatile memory requires the starting and final memory to be in equipartition with block size `1`.
     /// When block size is `1`, then the `label` is the same as the address pointer.
-    pub fn generate_trace(
-        &self,
-        final_memory: &TimestampedEquipartition<F, 1>,
-        overridden_height: Option<usize>,
-    ) -> RowMajorMatrix<F> {
-        let trace_height = if let Some(height) = overridden_height {
+    pub fn finalize(&mut self, final_memory: TimestampedEquipartition<F, 1>) {
+        self.final_memory = Some(final_memory);
+    }
+}
+
+impl<SC: StarkGenericConfig> Chip<SC> for VolatileBoundaryChip<Val<SC>>
+where
+    Val<SC>: PrimeField32,
+{
+    fn air(&self) -> Arc<dyn AnyRap<SC>> {
+        Arc::new(self.air.clone())
+    }
+
+    fn generate_air_proof_input(self) -> AirProofInput<SC> {
+        // Volatile memory requires the starting and final memory to be in equipartition with block size `1`.
+        // When block size is `1`, then the `label` is the same as the address pointer.
+        let width = self.trace_width();
+        let air = Arc::new(self.air);
+        let final_memory = self
+            .final_memory
+            .expect("Trace generation should be called after finalize");
+        let trace_height = if let Some(height) = self.overridden_height {
             assert!(
                 height >= final_memory.len(),
                 "Overridden height is less than the required height"
@@ -184,65 +206,71 @@ impl<F: PrimeField32> VolatileBoundaryChip<F> {
         } else {
             final_memory.len()
         };
-        self.generate_trace_with_height(final_memory, trace_height.next_power_of_two())
-    }
-
-    fn generate_trace_with_height(
-        &self,
-        final_memory: &TimestampedEquipartition<F, 1>,
-        trace_height: usize,
-    ) -> RowMajorMatrix<F> {
-        assert!(trace_height.is_power_of_two());
-        let width = BaseAir::<F>::width(&self.air);
+        let trace_height = trace_height.next_power_of_two();
 
         // Collect into Vec to sort from BTreeMap and also so we can look at adjacent entries
-        let sorted_final_memory: Vec<_> = final_memory.iter().collect();
-        assert!(sorted_final_memory.len() <= trace_height);
+        let sorted_final_memory: Vec<_> = final_memory.into_par_iter().collect();
+        let memory_len = sorted_final_memory.len();
 
-        let mut rows = F::zero_vec(trace_height * width);
+        let mut rows = Val::<SC>::zero_vec(trace_height * width);
         rows.par_chunks_mut(width)
-            .zip(&sorted_final_memory)
+            .zip(sorted_final_memory.par_iter())
             .enumerate()
             .for_each(|(i, (row, ((addr_space, ptr), timestamped_values)))| {
                 // `pointer` is the same as `label` since the equipartition has block size 1
                 let [data] = timestamped_values.values;
                 let row: &mut VolatileBoundaryCols<_> = row.borrow_mut();
                 row.addr_space = *addr_space;
-                row.pointer = F::from_canonical_usize(*ptr);
-                row.initial_data = F::ZERO;
+                row.pointer = Val::<SC>::from_canonical_usize(*ptr);
+                row.initial_data = Val::<SC>::ZERO;
                 row.final_data = data;
-                row.final_timestamp = F::from_canonical_u32(timestamped_values.timestamp);
-                row.is_valid = F::ONE;
+                row.final_timestamp = Val::<SC>::from_canonical_u32(timestamped_values.timestamp);
+                row.is_valid = Val::<SC>::ONE;
 
                 // If next.is_valid == 1:
-                if i != sorted_final_memory.len() - 1 {
-                    let (next_addr_space, next_ptr) = *sorted_final_memory[i + 1].0;
-                    let mut out = F::ZERO;
-                    self.air.addr_lt_air.0.generate_subrow(
+                if i != memory_len - 1 {
+                    let (next_addr_space, next_ptr) = sorted_final_memory[i + 1].0;
+                    let mut out = Val::<SC>::ZERO;
+                    air.addr_lt_air.0.generate_subrow(
                         (
                             &self.range_checker,
                             &[row.addr_space, row.pointer],
-                            &[next_addr_space, F::from_canonical_usize(next_ptr)],
+                            &[next_addr_space, Val::<SC>::from_canonical_usize(next_ptr)],
                         ),
                         ((&mut row.addr_lt_aux).into(), &mut out),
                     );
-                    debug_assert_eq!(out, F::ONE, "Addresses are not sorted");
+                    debug_assert_eq!(out, Val::<SC>::ONE, "Addresses are not sorted");
                 }
             });
         // Always do a dummy range check on the last row due to wraparound
-        if !sorted_final_memory.is_empty() {
-            let mut out = F::ZERO;
+        if memory_len > 0 {
+            let mut out = Val::<SC>::ZERO;
             let row: &mut VolatileBoundaryCols<_> = rows[width * (trace_height - 1)..].borrow_mut();
-            self.air.addr_lt_air.0.generate_subrow(
+            air.addr_lt_air.0.generate_subrow(
                 (
                     &self.range_checker,
-                    &[F::ZERO, F::ZERO],
-                    &[F::ZERO, F::ZERO],
+                    &[Val::<SC>::ZERO, Val::<SC>::ZERO],
+                    &[Val::<SC>::ZERO, Val::<SC>::ZERO],
                 ),
                 ((&mut row.addr_lt_aux).into(), &mut out),
             );
         }
 
-        RowMajorMatrix::new(rows, width)
+        let trace = RowMajorMatrix::new(rows, width);
+        AirProofInput::simple_no_pis(air, trace)
+    }
+}
+
+impl<F: PrimeField32> ChipUsageGetter for VolatileBoundaryChip<F> {
+    fn air_name(&self) -> String {
+        "Boundary".to_string()
+    }
+
+    fn current_trace_height(&self) -> usize {
+        self.touched_addresses.len()
+    }
+
+    fn trace_width(&self) -> usize {
+        VolatileBoundaryCols::<F>::width()
     }
 }
diff --git a/crates/vm/src/system/memory/volatile/tests.rs b/crates/vm/src/system/memory/volatile/tests.rs
index 6b00e01a9b..eaaf3bb674 100644
--- a/crates/vm/src/system/memory/volatile/tests.rs
+++ b/crates/vm/src/system/memory/volatile/tests.rs
@@ -3,12 +3,16 @@ use std::{collections::HashSet, iter, sync::Arc};
 use openvm_circuit_primitives::var_range::{VariableRangeCheckerBus, VariableRangeCheckerChip};
 use openvm_stark_backend::{
     p3_field::{AbstractField, PrimeField32},
-    p3_matrix::{dense::RowMajorMatrix, Matrix},
+    p3_matrix::dense::RowMajorMatrix,
+    prover::types::AirProofInput,
+    Chip,
 };
 use openvm_stark_sdk::{
-    any_rap_arc_vec, config::baby_bear_poseidon2::BabyBearPoseidon2Engine,
-    dummy_airs::interaction::dummy_interaction_air::DummyInteractionAir, engine::StarkFriEngine,
-    p3_baby_bear::BabyBear, utils::create_seeded_rng,
+    config::baby_bear_poseidon2::{BabyBearPoseidon2Config, BabyBearPoseidon2Engine},
+    dummy_airs::interaction::dummy_interaction_air::DummyInteractionAir,
+    engine::StarkFriEngine,
+    p3_baby_bear::BabyBear,
+    utils::create_seeded_rng,
 };
 use rand::Rng;
 use test_log::test;
@@ -42,7 +46,8 @@ fn boundary_air_test() {
 
     let range_bus = VariableRangeCheckerBus::new(RANGE_CHECKER_BUS, DECOMP);
     let range_checker = Arc::new(VariableRangeCheckerChip::new(range_bus));
-    let boundary_chip = VolatileBoundaryChip::new(memory_bus, 2, LIMB_BITS, range_checker.clone());
+    let mut boundary_chip =
+        VolatileBoundaryChip::new(memory_bus, 2, LIMB_BITS, range_checker.clone());
 
     let mut final_memory = TimestampedEquipartition::new();
 
@@ -104,35 +109,30 @@ fn boundary_air_test() {
         6,
     );
 
-    let boundary_trace = boundary_chip.generate_trace(&final_memory, None);
+    boundary_chip.finalize(final_memory.clone());
+    let boundary_api: AirProofInput<BabyBearPoseidon2Config> =
+        boundary_chip.generate_air_proof_input();
     // test trace height override
     {
-        let overridden_height = boundary_trace.height() * 2;
+        let overridden_height = boundary_api.main_trace_height() * 2;
         let range_checker = Arc::new(VariableRangeCheckerChip::new(range_bus));
-        let boundary_chip =
+        let mut boundary_chip =
             VolatileBoundaryChip::new(memory_bus, 2, LIMB_BITS, range_checker.clone());
-        let boundary_trace = boundary_chip.generate_trace(&final_memory, Some(overridden_height));
+        boundary_chip.set_overridden_height(overridden_height);
+        boundary_chip.finalize(final_memory.clone());
+        let boundary_api: AirProofInput<BabyBearPoseidon2Config> =
+            boundary_chip.generate_air_proof_input();
         assert_eq!(
-            boundary_trace.height(),
+            boundary_api.main_trace_height(),
             overridden_height.next_power_of_two()
         );
     }
 
-    let range_checker_trace = range_checker.generate_trace();
-
-    BabyBearPoseidon2Engine::run_simple_test_no_pis_fast(
-        any_rap_arc_vec![
-            boundary_chip.air,
-            range_checker.air,
-            init_memory_dummy_air,
-            final_memory_dummy_air
-        ],
-        vec![
-            boundary_trace,
-            range_checker_trace,
-            init_memory_trace,
-            final_memory_trace,
-        ],
-    )
+    BabyBearPoseidon2Engine::run_test_fast(vec![
+        boundary_api,
+        range_checker.generate_air_proof_input(),
+        AirProofInput::simple_no_pis(Arc::new(init_memory_dummy_air), init_memory_trace),
+        AirProofInput::simple_no_pis(Arc::new(final_memory_dummy_air), final_memory_trace),
+    ])
     .expect("Verification failed");
 }
diff --git a/crates/vm/src/system/poseidon2/trace.rs b/crates/vm/src/system/poseidon2/trace.rs
index b866ac95f9..0e6ad93ccd 100644
--- a/crates/vm/src/system/poseidon2/trace.rs
+++ b/crates/vm/src/system/poseidon2/trace.rs
@@ -6,10 +6,13 @@ use openvm_stark_backend::{
     p3_air::BaseAir,
     p3_field::PrimeField32,
     p3_matrix::dense::RowMajorMatrix,
+    p3_maybe_rayon::prelude::*,
     prover::types::AirProofInput,
     rap::{get_air_name, AnyRap},
     Chip, ChipUsageGetter,
 };
+#[cfg(feature = "parallel")]
+use rayon::iter::ParallelExtend;
 
 use super::{columns::*, Poseidon2Chip};
 
@@ -35,12 +38,21 @@ where
 
         let aux_cols_factory = memory_controller.borrow().aux_cols_factory();
         let mut flat_rows: Vec<_> = records
-            .into_iter()
+            .into_par_iter()
             .flat_map(|record| Self::record_to_cols(&aux_cols_factory, record).flatten())
             .collect();
-        for _ in 0..diff {
-            flat_rows.extend(Poseidon2VmCols::<Val<SC>>::blank_row(&air).flatten());
-        }
+        #[cfg(feature = "parallel")]
+        flat_rows.par_extend(
+            vec![Poseidon2VmCols::<Val<SC>>::blank_row(&air).flatten(); diff]
+                .into_par_iter()
+                .flatten(),
+        );
+        #[cfg(not(feature = "parallel"))]
+        flat_rows.extend(
+            vec![Poseidon2VmCols::<Val<SC>>::blank_row(&air).flatten(); diff]
+                .into_iter()
+                .flatten(),
+        );
 
         AirProofInput::simple_no_pis(
             Arc::new(air.clone()),
diff --git a/docs/crates/README.md b/docs/crates/README.md
index 5306cb62b6..1ea30bfdb1 100644
--- a/docs/crates/README.md
+++ b/docs/crates/README.md
@@ -2,7 +2,8 @@
 
 Code-level guides to the crates in the repository.
 
-- [`openvm-stark-backend`](./stark.md): Proof system backend
 - `openvm-circuit`
   - [VM Architecture and Chips](./vm.md)
   - [VM Extensions](./vm-extensions.md)
+- `openvm-benchmarks`
+  - [Running Benchmarks](./benchmarks.md)
diff --git a/docs/crates/benchmarks.md b/docs/crates/benchmarks.md
index 8c1a4d8a8a..150bf00002 100644
--- a/docs/crates/benchmarks.md
+++ b/docs/crates/benchmarks.md
@@ -3,19 +3,15 @@
 To run benchmarks, install python3 and run (from root of repo):
 
 ```bash
-python ci/scripts/bench.py <name>
+python ci/scripts/bench.py <name> --instance_type <string> --memory_allocator <mimalloc | jemalloc>
 ```
 
-where `<name>` is a benchmark implemented as a rust binary (located in `src/bin` in a crate). Current benchmark options are:
-
-- `verify_fibair`
-- `fibonacci`
-- `regex`
-  in the `benchmarks` crate.
-  The benchmark outputs a JSON of metrics. You can process this into markdown with:
+where `<name>` is a benchmark implemented as a rust binary (located in `src/bin` in the `openvm-benchmarks` crate).
+For local benchmarking, the `--instance_type` flag can take an arbitrary string.
+The benchmark outputs a JSON of metrics. You can process this into markdown with:
 
 ```bash
-python ci/scripts/metric_unify/main.py <path to json>
+python ci/scripts/metric_unify/main.py <path to json> --aggregation-json ci/scripts/metric_unify/aggregation.json <path to metric json>
 ```
 
 Currently the processing is done automatically at the end of `bench.py`. The script automatically detects if you have a previously saved metric file for the same benchmark and includes the diff report in the output.
diff --git a/docs/crates/stark.md b/docs/crates/stark.md
deleted file mode 100644
index 20287a0956..0000000000
--- a/docs/crates/stark.md
+++ /dev/null
@@ -1,145 +0,0 @@
-# STARK Backend
-
-### Traits for Constraints
-
-An AIR in our system represents the set of constraints and metadata necessary to generate and verify a STARK proof. This is implemened through the following set of traits, which are split between core plonky3 and our `stark-backend` crate, which provides:
-
-- the ability to handle logUp / interactions
-- the ability to handle separate cached traces
-
-#### From plonky3
-
-```rust
-pub trait BaseAir<F: Field> {
-    fn width(&self) -> usize;
-}
-
-pub trait Air<AB: AirBuilder>: BaseAir<AB::F> {
-    fn eval(&self, builder: &mut AB);
-}
-
-pub trait AirBuilder {
-    type F: Field; // use for constants
-    type Var: Into<Self::Expr> + Copy + // .. concrete type of row values
-    type Expr: AbstractField + // .. most general expression for a constraint
-}
-```
-
-The way `Air` works is that you always implement `Air<AB>` with respect to "**some** `AirBuilder` with some properties (additional trait bounds)". However in practice we implement `Air` for "**all** `AirBuilder`s with some properties".
-
-The struct implementing `Air` should be **stateless**. The struct should only contain configuration parameters necessary to determine the AIR constraints.
-
-```rust
-pub trait BaseAirWithPublicValues<F>: BaseAir<F> {
-    fn num_public_values(&self) -> usize {
-        0
-    }
-}
-
-// to use default impl:
-impl<F> BaseAirWithPublicValues<F> for MyAir {}
-```
-
-#### From `openvm-stark-backend`
-
-For cached trace support:
-
-```rust
-/// An AIR with 1 or more main trace partitions.
-pub trait PartitionedBaseAir<F>: BaseAir<F> {
-    /// By default, an AIR has no cached main trace.
-    fn cached_main_widths(&self) -> Vec<usize> {
-        vec![]
-    }
-    /// By default, an AIR has only one private main trace.
-    fn common_main_width(&self) -> usize {
-        self.width()
-    }
-}
-
-// to use default impl:
-impl<F> PartitionedBaseAir<F> for MyAir {}
-```
-
-The common main trace is the "usual" main trace. All common main trace across all AIRs are committed into one commitment. Cached main are additional sections of main trace that are committed individually. Cached trace is not used in VM **except** by ProgramAir, where the OpenVM `Program` is committed into a dedicated commitment.
-
-```rust
-pub trait Rap<AB>: Sync
-where
-    AB: PermutationAirBuilder,
-{
-    fn eval(&self, builder: &mut AB);
-}
-```
-
-We auto-implement `Rap<AB>` for any `Air<AB> where AB: InteractionBuilder`. The `Rap` adds in the extension field columns specified by interactions; note that these columns are not specified explicitly in plonky3 trace generation.
-
-![image](../../assets/rap.png)
-
-So when you implement `Air<AB: InteractionBuilder>` you automatically implement `Rap<AB>` **for some** AirBuilder.
-
-The stark-backend uses three different concrete `AirBuilder` implementations:
-
-- `SymbolicRapBuilder<Val<SC>>`
-- `ProverConstraintFolder<'a, SC>`
-- `DebugConstraintBuilder<'a, SC>`
-
-that depend on a `SC: StarkGenericConfig`. The `SC` specifies FRI proof system configuration parameters.
-
-```rust
-pub trait AnyRap<SC: StarkGenericConfig>:
-    Rap<SymbolicRapBuilder<Val<SC>>> // for keygen to extract fixed data about the RAP
-    + for<'a> Rap<ProverConstraintFolder<'a, SC>> // for prover quotient polynomial calculation
-    + for<'a> Rap<DebugConstraintBuilder<'a, SC>> // for debugging
-    + BaseAirWithPublicValues<Val<SC>>
-    + PartitionedBaseAir<Val<SC>> {
-    // ..
-}
-```
-
-This is an **auto-implemented** trait on any struct that implements `Air` for all AirBuilders the backend cares about above, for a **specific** `SC`.
-
-The backend wants to be able to prove multiple different AIRs together. So it must take a bunch of different `dyn AnyRap<SC>`. For some sizing reasons, instead it must take `Arc<dyn AnyRap<SC>>` where `Arc` is a smart pointer to get around lifetimes and cloning issues. It is best to always use `Arc`, don't mix `Arc, Rc, Box` for the above purpose.
-
-### Traits for Trace Generation
-
-To generate a proof, we pair an AIR (represented by `Arc<dyn AnyRap<SC>>`) with a set of methods to generate input traces in the `Chip` trait:
-
-```rust
-pub trait Chip<SC: StarkGenericConfig> {
-    fn air(&self) -> Arc<dyn AnyRap<SC>>;
-
-    /// Generate all necessary input for proving a single AIR.
-    fn generate_air_proof_input(self) -> AirProofInput<SC>;
-    fn generate_air_proof_input_with_id(self, air_id: usize) -> (usize, AirProofInput<SC>) {
-        (air_id, self.generate_air_proof_input())
-    }
-}
-```
-
-The struct implementing `Chip<SC>` is stateful and stores **records**, which are the minimal amount of data necessary to generate the values in the trace matrix. A chip owns exactly one AIR.
-
-- We must have `Chip` generic in `SC` to avoid many issues with returning `Arc<dyn AnyRap<SC>>`.
-- If you have an enum of `Chip`s, you can derive `Chip` on the enum using proc-macro `#[derive(Chip)]` from `afs_derive`. The macro expects the enum to be generic in `<F>`.
-
-#### `StarkGenericConfig`
-
-`StarkGenericConfig` is a complicated trait with deeply nested associated types. There are various typedefs to get associated types out of it. The most important is `Val<SC>`; this is the field `F` you want. Import `Val` from `afs_stark_backend::config::Val`, which is a re-export of `p3_uni_stark::Val`.
-
-Usual way to implement:
-
-```rust
-impl<SC: StarkGenericConfig> Chip<SC> for MyChip<Val<SC>>
-where Val<SC>: PrimeField32 {
-    // ..
-}
-```
-
-If you need `F` for some reason and the above doesn't work, another way is:
-
-```rust
-impl<F, SC: StarkGenericConfig> Chip<SC> for MyChip<F>
-where Domain<SC>: PolynomialSpace<Val = F> {
-    // ..
-}
-```
diff --git a/docs/crates/vm-extensions.md b/docs/crates/vm-extensions.md
index 5ce4ce54a1..ea15304ba7 100644
--- a/docs/crates/vm-extensions.md
+++ b/docs/crates/vm-extensions.md
@@ -12,7 +12,8 @@ pub trait VmExtension<F: PrimeField32> {
 }
 ```
 
-The `VmExtensionTrait` is a way to specify how to construct a collection of chips and all assign opcodes to be handled by them. This data is collected into a `VmInventory` struct, which is returned.
+The `VmExtension` trait is a way to specify how to construct a collection of chips and assign the opcodes to be handled
+by them. This data is collected into a `VmInventory` struct, which is returned.
 
 To handle previous chip dependencies necessary for chip construction and also automatic bus index management, we provide a `VmInventoryBuilder` api.
 
@@ -123,7 +124,7 @@ The macro will also make two big enums: one that is an enum of the `Ext*::Execut
 
 The macro will then generate a `create_chip_complex` function.
 
-For that we need to understand what `VmChipComplex` is: it replaces the role of the previous `VmChipSet` and consists of:
+For that we need to understand what `VmChipComplex` consists of:
 
 - System chips
 - `VmInventory`
@@ -152,19 +153,9 @@ function. What this does in words:
 
 For each extension's inventory generation, the `VmInventoryBuilder` is provided with a view of all current chips already inside the running chip complex. This means the inventory generation process is sequential in the order the extensions are specified, and each extension has borrow access to all chips constructed by any extension before it.
 
-### `VirtualMachine`
-
-The top level structs of `VirtualMachine`, `VmExecutor`, `SegmentExecutor` remain almost entirely the same, but now has `VmConfig` as a generic:
-
-```rust
-pub struct VirtualMachine<SC: StarkGenericConfig, E, VC>;
-```
-
-TODO: discuss usage
-
 ## Examples
 
-The `extensions/` folder contains extensions implementing all non-system functionality via several extensions. For example, the `Rv32I`, `Rv32M`, and `Rv32Io` extensions implement `VmExtension<F>` in [`openvm-rv32im-circuit`](../../extensions/rv32im/circuit/) and correspond to the RISC-V 32-bit base and multiplication instruction sets and an extension for IO, respectively.
+The [`extensions/`](../../extensions/) folder contains extensions implementing all non-system functionality via custom extensions. For example, the `Rv32I`, `Rv32M`, and `Rv32Io` extensions implement `VmExtension<F>` in [`openvm-rv32im-circuit`](../../extensions/rv32im/circuit/) and correspond to the RISC-V 32-bit base and multiplication instruction sets and an extension for IO, respectively.
 
 # Design Choices
 
diff --git a/docs/crates/vm.md b/docs/crates/vm.md
index d60049d8ea..ce873f37bf 100644
--- a/docs/crates/vm.md
+++ b/docs/crates/vm.md
@@ -2,36 +2,40 @@
 
 ### `InstructionExecutor` Trait
 
-We define an **instruction** to be a VM **opcode** combined with the **operands** to the opcode. Running the instrumented runtime for an opcode is encapsulated in the following trait:
+We define an **instruction** to be an **opcode** combined with the **operands** for the opcode. Running the instrumented
+runtime for an opcode is encapsulated in the following trait:
 
 ```rust
 pub trait InstructionExecutor<F> {
-    /// Runtime execution of the instruction, if the instruction is
-    /// owned by the current instance. May internally store records of
-    /// this call for later trace generation.
+    /// Runtime execution of the instruction, if the instruction is owned by the
+    /// current instance. May internally store records of this call for later trace generation.
     fn execute(
         &mut self,
         instruction: Instruction<F>,
-        from_state: ExecutionState<usize>,
-    ) -> Result<ExecutionState<usize>, ExecutionError>;
+        from_state: ExecutionState<u32>,
+    ) -> Result<ExecutionState<u32>>;
 }
 ```
 
+There is a `struct VmOpcode(usize)` to protect the global opcode `usize`, which must be globally unique for each opcode
+supported in a given VM.
+
 ### Chips for Opcode Groups
 
-We divide all opcodes in the VM into groups, each of which is handled by a single **chip**. A chip should be a struct of type `C` and associated Air of type `A` which satisfy the following trait bounds:
+Opcodes are partitioned into groups, each of which is handled by a single **chip**. A chip should be a struct of
+type `C` and associated Air of type `A` which satisfy the following trait bounds:
 
 ```rust
 C: Chip<SC> + InstructionExecutor<F>
 A: Air<AB> + BaseAir<F> + BaseAirWithPublicValues<F>
 ```
 
-Together, these perform the following functionalities:
-
-- **Keygen:** This is done via the `.eval()` function from `Air<AB>`
-- **Trace Generation:** This is done by calling `.execute()` from `InstructionExecutor<F>` which stores execution records and then `generate_air_proof_input()` from `Chip<SC>` which generates the trace using the corresponding records.
+Together, these provide the following functionalities:
 
-There is a `struct VmOpcode(usize)` to protect the global opcode usize.
+- **Keygen:** Performed via the `Air::<AB>::eval()` function.
+- **Trace Generation:** This is done by calling `InstructionExecutor::<F>::execute()` which computes and stores
+  execution records and then `Chip::<SC>::generate_air_proof_input()` which generates the trace using the corresponding
+  records.
 
 ### Phantom Sub-Instructions
 
@@ -53,103 +57,80 @@ pub trait PhantomSubExecutor<F> {
 pub struct PhantomDiscriminant(pub u16);
 ```
 
-The `PhantomChip<F>` maintains a map `FxHashMap<PhantomDiscriminant, Box<dyn PhantomSubExecutor<F>>>` to handle different phantom sub-instructions.
+The `PhantomChip<F>` internally maintains a mapping from `PhantomDiscriminant` to `Box<dyn PhantomSubExecutor<F>>` to
+handle different phantom sub-instructions.
 
 ### VM Configuration
 
-**This section needs to be updated for extensions.**
-
-Each specific instantiation of a modular VM is defined in the following structs which handle VMs with/without continuations:
+Each specific instantiation of a modular VM is defined by the following struct:
 
 ```rust
-pub struct VirtualMachine<F: PrimeField32> {
-    pub config: VC,
-    /// Streams are shared between `ExecutionSegment`s and within each
-    /// segment shared with any chip(s) that handle hint opcodes
-    streams: Arc<Mutex<Streams<F>>>,
-    initial_memory: Option<Equipartition<F, CHUNK>>,
-}
-
-pub struct SingleSegmentVM<F: PrimeField32> {
-    pub config: VC,
-    _marker: PhantomData<F>,
+pub struct VirtualMachine<SC: StarkGenericConfig, E, VC> {
+  pub engine: E,
+  pub executor: VmExecutor<Val<SC>, VC>,
 }
 ```
 
-The `Streams<F>` holds an `input_stream` and `hint_stream`:
+The engine type `E` should implement `openvm_stark_backend::engine::StarkEngine<SC>` and the VM config type `VC` should
+implement `openvm_circuit::arch::config::VmConfig<Val<SC>>`, shown below.
 
 ```rust
-pub struct Streams<F> {
-    pub input_stream: VecDeque<Vec<F>>,
-    pub hint_stream: VecDeque<F>,
-}
-```
-
-Configuration of opcodes and memory is handled by:
+pub trait VmConfig<F: PrimeField32>: Clone + Serialize + DeserializeOwned {
+  type Executor: InstructionExecutor<F> + AnyEnum + ChipUsageGetter;
+  type Periphery: AnyEnum + ChipUsageGetter;
 
-```rust
-pub struct VC {
-    /// List of all executors except modular executors.
-    pub executors: Vec<ExecutorName>,
-    /// List of all supported modulus
-    pub supported_modulus: Vec<BigUint>,
-
-    pub poseidon2_max_constraint_degree: usize,
-    pub memory_config: MemoryConfig,
-    pub num_public_values: usize,
-    pub max_segment_len: usize,
-    pub collect_metrics: bool,
-}
+  /// Must contain system config
+  fn system(&self) -> &SystemConfig;
+  fn system_mut(&mut self) -> &mut SystemConfig;
 
-pub struct MemoryConfig {
-    pub addr_space_max_bits: usize,
-    pub pointer_max_bits: usize,
-    pub clk_max_bits: usize,
-    pub decomp: usize,
-    pub persistence_type: PersistenceType,
+  fn create_chip_complex(
+    &self,
+  ) -> Result<VmChipComplex<F, Self::Executor, Self::Periphery>, VmInventoryError>;
 }
 ```
 
+A `VmConfig` has two associated types: `Executor` and `Periphery`. The `Executor` is typically an enum over chips that
+are instruction executors, while `Periphery` is an enum for the chips that are not.
+See [VM Extensions](./vm-extensions.md) for more details.
+
 ### ZK Operations for the VM
 
 #### Keygen
 
-TODO: Update for `VmChipComplex`.
+Key generation is computed from the `VmConfig` describing the VM. The `VmConfig` is used to create the `VmChipComplex`,
+which in turn provides the list of AIRs that are used in the proving and verification process.
 
 #### Trace Generation
 
 Trace generation proceeds from:
 
-> `VirtualMachine.execute_and_generate_with_cached_program()`
+> `VirtualMachine::execute_and_generate_with_cached_program()`
 
-with subsets of functionality offered by `.execute()` and `execute_and_generate()`. The following struct tracks each continuation segment:
+with subsets of functionality offered by `VirtualMachine::execute()` and `VirtualMachine::execute_and_generate()`. The
+following struct tracks each continuation segment:
 
 ```rust
-pub struct ExecutionSegment<F: PrimeField32> {
-    pub config: VC,
-    pub chip_set: VmChipSet<F>,
-
-    // The streams should be mutated in serial without thread-safety,
-    // but the `VmCoreChip` trait requires thread-safety.
-    pub streams: Arc<Mutex<Streams<F>>>,
-
-    pub final_memory: Option<Equipartition<F, CHUNK>>,
-
-    pub cycle_tracker: CycleTracker,
-    /// Collected metrics for this segment alone.
-    /// Only collected when `config.collect_metrics` is true.
-    pub(crate) collected_metrics: VmMetrics,
+pub struct ExecutionSegment<F: PrimeField32, VC: VmConfig<F>> {
+  pub chip_complex: VmChipComplex<F, VC::Executor, VC::Periphery>,
+  pub final_memory: Option<Equipartition<F, CHUNK>>,
+  pub air_names: Vec<String>,
+  pub since_last_segment_check: usize,
 }
 ```
 
 This will:
 
-- Split the execution into `ExecutionSegment`s using `ExecutionSegment.execute_from_pc()`, which calls `ExecutionSegment.should_segment()` to segment online. Note that this creates a `VmChipSet` for each segment from `VmConfig.create_chip_set()`, where **each segment contains each chip**. It also passes all streams to all segments and runs the generation in serial.
-- Generate traces for each segment by calling `VmChipSet.generate_proof_input()`, which iterates through all chips in order and calls `generate_proof_input()`.
+- Split the execution into `ExecutionSegment`s using `ExecutionSegment.execute_from_pc()`, which calls
+  `ExecutionSegment.should_segment()` to segment online. Note that this creates a `VmChipComplex` for each segment from
+  `VmConfig.create_chip_complex()`, where **each segment contains each chip**. It also passes all streams to all
+  segments and runs the generation in serial.
+- Generate traces for each segment by calling `VmChipComplex.generate_proof_input()`, which iterates through all chips
+  in order and calls `generate_air_proof_input()`.
 
 #### Proof Generation
 
-This is done by calling `StarkEngine.prove()` on `ProofInput<SC>` created from each segment in `generate_proof_input()`. There is no SDK-level API for this in `VirtualMachine` at present.
+Proof generation is performed by calling `StarkEngine.prove()` on `ProofInput<SC>` created from each segment in
+`generate_proof_input()`. There is no SDK-level API for this in `VirtualMachine` at present.
 
 ## VM Integration API
 
@@ -168,14 +149,21 @@ Most chips in the VM satisfy this, with notable exceptions being Keccak and Pose
 - `VmCoreChip<F, I: VmAdapterInterface<F>>`
 - `VmCoreAir<AB, I: VmAdapterInterface<AB::Expr>>`
 
-[!WARNING]
-The word **core** will be banned from usage outside of this context.
+> [!WARNING]
+> The word **core** will be banned from usage outside of this context.
 
-Main idea: each VM chip will be created from an AdapterChip and a CoreChip. Analogously, the VM AIR is created from an AdapterAir and CoreAir so that the columns of the VM AIR are formed by concatenating the columns from the AdapterAir followed by the CoreAir.
+Main idea: each VM chip is created from an `AdapterChip` and a `CoreChip`. Analogously, the VM AIR is created from an
+`AdapterAir` and `CoreAir` so that the columns of the VM AIR are formed by concatenating the columns from the
+`AdapterAir` followed by the `CoreAir`.
 
-The AdapterChip is responsible for all interactions with the VM system: it owns interactions with the memory bus, program bus, execution bus. It will read data from memory and expose the data (but not intermediate pointers, address spaces, etc.) to the CoreChip and then write data provided by the CoreChip back to memory.
+The `AdapterChip` is responsible for all interactions with the VM system: it owns interactions with the memory bus,
+program bus, execution bus. It will read data from memory and expose the data (but not intermediate pointers, address
+spaces, etc.) to the `CoreChip` and then write data provided by the `CoreChip` back to memory.
 
-The AdapterAir does not see the CoreAir, but the CoreAir is able to see the AdapterAir, meaning that the same AdapterAir can be used with several CoreAir's. The AdapterInterface provides a way for CoreAir to provide expressions to be included in AdapterAir constraints -- in particular AdapterAir interactions can still involve CoreAir expressions.
+The `AdapterAir` does not see the `CoreAir`, but the `CoreAir` is able to see the `AdapterAir`, meaning that the same
+`AdapterAir` can be used with several `CoreAir`s. The `AdapterInterface` provides a way for `CoreAir` to provide
+expressions to be included in `AdapterAir` constraints -- in particular `AdapterAir` interactions can still involve
+`CoreAir` expressions.
 
 Traits with their associated types and functions:
 
@@ -192,24 +180,24 @@ pub trait VmAdapterChip<F: Field> {
     type ReadRecord: Send;
     /// Records generated by adapter after main instruction execution
     type WriteRecord: Send;
-    /// AdapterAir should not have public values
+  /// `AdapterAir` should not have public values
     type Air: BaseAir<F> + Clone;
-    type Interface<T: AbstractField>: VmAdapterInterface<T>;
+  type Interface: VmAdapterInterface<F>;
 
     fn preprocess(
         &mut self,
         memory: &mut MemoryChip<F>,
         instruction: &Instruction<F>,
-    ) -> Result<(Reads<F, Self::Interface<F>>, Self::ReadRecord)>;
+    ) -> Result<(<Self::Interface as VmAdapterInterface<F>>::Reads, Self::ReadRecord)>;
 
     fn postprocess(
         &mut self,
         memory: &mut MemoryChip<F>,
         instruction: &Instruction<F>,
-        from_state: ExecutionState<usize>,
+        from_state: ExecutionState<u32>,
         ctx: AdapterRuntimeContext<F, Self::Interface<F>>,
         read_record: &Self::ReadRecord,
-    ) -> Result<(ExecutionState<usize>, Self::WriteRecord)>;
+    ) -> Result<(ExecutionState<u32>, Self::WriteRecord)>;
 
     /// Populates `row_slice` with values corresponding to `record`.
     /// The provided `row_slice` will have length equal to `self.air().width()`.
@@ -220,7 +208,10 @@ pub trait VmAdapterChip<F: Field> {
         row_slice: &mut [F],
         read_record: Self::ReadRecord,
         write_record: Self::WriteRecord,
+        aux_cols_factory: &MemoryAuxColsFactory<F>,
     );
+
+  fn air(&self) -> &Self::Air;
 }
 
 pub trait VmAdapterAir<AB: AirBuilder>: BaseAir<AB::F> {
@@ -272,7 +263,7 @@ pub struct AdapterRuntimeContext<T, I: VmAdapterInterface<T>> {
     pub writes: I::Writes,
 }
 
-// For passing from CoreAir to AdapterAir with T = AB::Expr
+// For passing from `CoreAir` to `AdapterAir` with T = AB::Expr
 pub struct AdapterAirContext<T, I: VmAdapterInterface<T>> {
     /// Leave as `None` to allow the adapter to decide the `to_pc` automatically.
     pub to_pc: Option<T>,
@@ -282,8 +273,8 @@ pub struct AdapterAirContext<T, I: VmAdapterInterface<T>> {
 }
 ```
 
-[!WARNING]
-You do not need to implement `Air` on the struct you implement `VmAdapterAir` or `VmCoreAir` on.
+> [!WARNING]
+> You do not need to implement `Air` on the struct you implement `VmAdapterAir` or `VmCoreAir` on.
 
 ### Creating a Chip from Adapter and Core
 
@@ -366,4 +357,4 @@ pub struct ImmInstruction<T> {
     pub opcode: T,
     pub imm: T
 }
-```
\ No newline at end of file
+```
diff --git a/docs/specs/aggregation.md b/docs/specs/aggregation.md
deleted file mode 100644
index 4c2c1f83cc..0000000000
--- a/docs/specs/aggregation.md
+++ /dev/null
@@ -1,103 +0,0 @@
-# Aggregation
-
-We describe our strategy for aggregating STARK proofs at a high-level.
-
-<!--Some details are subject to change-->
-
-## Static Aggregation
-
-Assume that we have a static (i.e., known ahead of time) list `allowed_vks` of STARK verifying keys (unique identifiers for STARK circuits).
-
-Suppose we have a variable-length list of proofs `proofs` where `proofs.len()` is independent of `allowed_vks.len()`. The goal is to produce a single STARK proof that asserts that `proofs[i]` verifies with respect verifying key `vk[i]` where `allowed_vks` contains `vk[i]`, for all `i`. Additionally, there should be the optionality to store a commitment to the ordered list of `(hash(vk[i]), public_values[i])` where `public_values[i]` are the public values of proof `i`.
-
-We aggregate `proofs` using a tree-structure. The arity of the tree can be adjusted for performance;
-by default it is 2. The height of the tree is variable and equal to $\lceil \log{n} \rceil$ where $n$ is the number of proofs and the base of logarithm is the arity.
-
-We distinguish between three types of nodes in the tree:
-
-- Leaf
-- Internal
-- Root
-
-Each node of the tree will be a STARK VM circuit, _without continuations_, proving a VM program that runs STARK verification on an `arity` number of proofs. We make the distinction that each type of node in the tree may be a **different** VM circuit, meaning with different chip configurations. All VM circuits must support the opcodes necessary to do STARK verification.
-
-For each node type, a different program is run in the VM circuit:
-
-- Leaf: the program verifies `<=leaf_arity` proofs, where each proof is verified with respect to one of the verification keys in `allowed_vks`. The leaf program will have the proof, public values, and verifying keys of each proof in program memory, and the program can be augmented with additional checks (for example, state transitions checks are necessary for continuations).
-- Internal: the program verifies `<= internal_arity` proofs, where all proofs are verified with respect to the same verifying key. This verifying key is either that of a leaf circuit or that of an internal circuit (the present circuit itself). The circuit cannot know the verifying key of itself, so to avoid a circular dependency, the hash of the verifying key is made a public value.
-- Root: this program _may_ just be the same as the Internal program, but for the purposes of optimizing [on-chain aggregation](#on-chain-aggregation), there is the possiblity for it to be different. The root program verifies `<= root_arity` proofs, where all proofs are of the internal circuit. Note that `root_arity` may be `1`.
-
-### STARK Configurations
-
-Before proceeding, we must discuss the topic of STARK configurations: any STARK proof depends on at least three configuration parameters:
-
-- `F` the base field of the AIRs
-- `EF` the extension field of the AIRs used for challenge values
-- the hash function used for the FRI PCS. This hash function must be able to hash `F` and `EF` elements, where elements can be packed before hashing.
-
-For all Leaf and Internal circuits [above](#static-aggregation), we use an **Inner Config**. Example Inner Configs are:
-
-- `F` is BabyBear, `EF` is quartic extension of BabyBear, hash is BabyBearPoseidon2
-- `F` is BabyBear, `EF` is quartic extension of BabyBear, hash is SHA256
-- `F` is Mersenne31, `EF` is quartic extension of Mersenne31, hash is Mersenne31Poseidon2
-- `F` is Mersenne31, `EF` is quartic extension of Mersenne31, hash is SHA256
-
-We discuss considerations for choice of hash below.
-
-On the other hand, the Root circuit will use an **Outer Config**. Example Outer Configs are:
-
-- `F` is BabyBear, `EF` is quartic extension of BabyBear, hash is BN254FrPoseidon2 (or BN254FrPoseidon1)
-- ~~`F` is BabyBear, `EF` is quartic extension of BabyBear, hash is SHA256~~
-- `F` is BN254Fr, `EF` is BN254Fr, hash is BN254FrPoseidon2 (or BN254FrPoseidon1)
-- ~~`F` is BN254Fr, `EF` is BN254Fr, hash is SHA256~~
-- Analogous configurations with BabyBear replaced with Mersenne31.
-
-To explain, since `31 * 8 < 254`, eight BabyBear field elements can be packed together and embedded (non-algebraically) into a BN254Fr field element. In this way BN254FrPoseidon2 can be used to hash BabyBear elements.
-
-The choice of hash function in the Outer Config only affects what hash must be verified in the Halo2 circuit for on-chain aggregation (see [below](#on-chain-aggregation)). For performance, it is therefore always better to use BN254FrPoseidon2 for the Outer Config.
-
-### On-chain Aggregation
-
-The Root circuit above is the last STARK circuit, whose single proof will in turn verify all initial `proofs`. Due to the size of STARK proofs, for on-chain verification we must wrap this proof inside an elliptic curve based SNARK proof so that the final SNARK proof can be verified on-chain (where on-chain currently means within an Ethereum Virtual Machine).
-
-We create a Halo2 circuit that verifies any proof of the Root STARK circuit. This is a non-universal circuit whose verifying key depends on the specific STARK circuit to be verified. The majority of the verification logic can be code-generated into the `halo2-lib` eDSL which uses a special vertical custom gate specialized for cheap on-chain verification cost. There are two main performance considerations:
-
-#### 1. Hash
-
-To perform FRI verification in the Halo2 circuit, the circuit must constrain calculations of STARK Outer Config hashes. As mentioned above, this hash will be BN254FrPoseidon2. The constraints for this hash can either be implemented directly using the `halo2-base` vertical gate, or with a custom gate. The custom gate will be faster but with higher verification cost. There are two approaches to consider:
-
-Approach A
-
-- Use a single Halo2 circuit with only thinnest `halo2-base` vertical gate to verify the Root STARK circuit proof.
-
-Approach B
-
-- Use a first Halo2 circuit with custom gate for BN254FrPoseidon2 to verify the Root STARK circuit proof.
-- Use a second Halo2 circuit with only the thinnest `halo2-base` vertical gate to verify the previous Halo2 circuit.
-
-Approach B is likely better, provided that the time to generate both proofs is faster than the time to generate the single proof in Approach A.
-
-#### 2. Outer Config Base Field
-
-The Outer Config base field `F` can be either a 31-bit field or BN254Fr.
-
-When `F` is 31-bit field:
-
-- For FRI folding and other arithmetic in STARK verification, the Halo2 circuit must perform BabyBear prime field arithmetic and extension field arithmetic inside the halo2 circuit. These are non-native arithmetic operations.
-
-When `F` is BN254Fr and `EF` is BN254Fr:
-
-- Halo2 circuit only needs to perform native field arithmetic inside the halo2 circuit.
-- The Root STARK circuit must now perform non-native BabyBear field arithmetic and extension field arithmetic inside the STARK to support the verification of the STARKs with the Inner Config. This non-native arithmetic is still expected to be much faster in the STARK than in Halo2, but the added chip complexity may also increase verifier cost in the Halo2 circuit.
-- If the Inner Config hash is BabyBearPoseidon2, now the Root STARK circuit must constrain BabyBearPoseidon2 inside a circuit with base field BN254Fr. This is definitely not efficient. **Therefore it is not possible for the Outer Config base field to be BN254Fr if the Inner Config hash is BabyBearPoseidon2.**
-- This Outer Config is only possible if the Inner Config hash is a hash that does not depend on the native field (e.g., SHA256 or Blake2b or Blake3).
-  - **Observation:** even if the hash used for the Internal circuit is SHA256, the Leaf circuit can still be proven using BabyBearPoseidon2. Likewise, it is even possible to have the Internal circuits use BabyBearPoseidon2 at higher depths in the tree (away from the root). The only requirement is that the last Internal circuit proof, which will be verified by the Root circuit, needs to be proven with SHA256 as the hash.
-
-TODO: to determine which Outer Config is best, we will:
-
-- Instrument the cost of non-native small field arithmetic in the Halo2 circuit.
-- Benchmark an aggregation VM with Inner Config hash BabyBearPoseidon2 proven over BabyBearPoseidon2 versus one with Inner Config hash SHA256 proven over SHA256.
-
-## Dynamic Aggregation
-
-TODO
diff --git a/docs/specs/continuations.md b/docs/specs/continuations.md
index f45c847eaf..65f518fe65 100644
--- a/docs/specs/continuations.md
+++ b/docs/specs/continuations.md
@@ -1,3 +1,201 @@
+# Aggregation
+
+Given the execution segments of a program, each segment will be proven in parallel within an **Application VM** (App VM).
+These proofs are subsequently aggregated into an aggregation tree by a **leaf aggregation
+program**. This segment aggregation program runs inside _a different VM_, referred to as the **Aggregation VM** (Agg
+VM), which operates without continuations enabled.
+
+The aggregation program takes a variable number of consecutive segment proofs and consolidates them into a single proof
+that captures the entire range of segments.
+
+![Aggregation example](../../assets/agg.png)
+
+The following figure shows that the shape of the aggregation tree is not fixed.
+
+![Another aggregation example](../../assets/agg-2.png)
+
+We will now give an overview of the steps of the overall aggregation, starting from the final smart contract verifier
+and going down to the application proof.
+
+## Smart Contract
+
+A smart contract is deployed on-chain, which provides a function to verify a Halo2 proof.
+
+## Static Verifier Wrapper
+
+The **Static Verifier Wrapper** is a Halo2 SNARK verifier circuit generated by OpenVM. The static verifier
+wrapper is determined by the following parameters:
+
+* Number of public values
+* The Aggregation VM chip constraints (but **not** the App VM chips)
+
+## Continuation Verifier
+
+The continuation verifier is a Halo2 circuit (static verifier) together with some single segment VM circuits (Agg VM).
+The continuation verifier depends on the specific circuit design of the static verifier and Aggregation VM, as well as
+the number of user public values, but it does not depend on the App VM's circuit.
+
+The continuation verifier ensures that a set of ordered App VM segment proofs collectively validates the execution of a
+specific `VmExe` on a specific App VM, with given inputs.
+
+### Static Verifier
+
+The Static Verifier is a Halo2 verifier circuit that validates a Root VM Verifier proof and exposes its public values.
+
+Static Verifier Requirements:
+
+* The height of each trace is fixed.
+* Trace heights are in descending order.
+
+Public Values Exposed:
+
+* Exe commit encoded in Bn254
+* Leaf commit encoded in Bn254
+* User public values in BabyBear
+
+Parameters (which could result in a different circuit):
+
+* Number of public values (from upstream)
+* k in Halo2
+  * Determines the number of columns of the circuit.
+
+* Number of public values (from upstream)
+* k in Halo2 (determines the number of columns in the circuit)
+* Root VM verifier
+  * VK (including the heights of all traces)
+  * Root verifier program commitment
+
+### Aggregation VM
+
+The Aggregation VM organizes proofs into an aggregation tree, where nodes include:
+
+* Root VM Verifier
+* Internal VM Verifier
+* Leaf VM Verifier
+
+Each node can have an arbitrary number of children, enabling flexible tree structures to optimize for cost reduction
+(more children) or latency reduction (fewer children) during proving.
+
+### Root VM Verifier
+
+The Root VM Verifier is proven in RootConfig, using commitments via Bn254Poseidon2. All traces are padded to a constant
+height for verification.
+
+The Root VM Verifier verifies 1 or more proofs of:
+
+- Leaf VM Verifier
+- Internal VM Verifier
+
+In practice, the Root VM Verifier verifies only one proof, to guarantee constant heights.
+
+Logical Input:
+
+* Root input
+
+Cached Trace Commit:
+
+* `ProgramAir`: commits the root verifier program
+
+Public values:
+
+* `RootVmVerifierPvs`
+  * Note: exe_commit is the commitment of the executable. The way to compute it can be found here.
+
+Parameters:
+
+* For circuit:
+  * Root VM Config
+* For root verifier program:
+  * Root FRI parameters to compute its commitment
+  * Internal verifier circuit \+ program commitment
+  * Leaf verifier circuit \+ program commitment
+
+### Internal VM Verifier
+
+The Internal VM Verifier validates one or more proofs of:
+
+* Leaf VM Verifier
+* Internal VM Verifier
+
+Logical Input:
+
+* `InternalVmVerifierInput`
+
+Cached Trace Commit:
+
+* `ProgramAir`: commits the internal verifier program. `agg_vm_pk` contains it.
+
+Public values:
+
+* `InternalVmVerifierPvs`
+
+Parameters:
+
+* For circuit:
+  * Internal VM Config
+* For internal verifier program:
+  * Internal FRI parameters to compute its commitment
+  * Internal verifier circuit \+ program commitment
+  * Leaf verifier circuit \+ program commitment
+
+### Leaf VM Verifier
+
+Verify 1 or more proofs of:
+
+* segment circuits
+
+Logical Input:
+
+* `LeafVmVerifierInput`
+
+Cached Trace Commit:
+
+* `ProgramAir`: commits the leaf verifier program. The leaf verifier program depends on the VK of the App VM.
+
+Public values:
+
+* `VmVerifierPvs`
+
+Parameters:
+
+* For circuit:
+  * Leaf VM Config
+* For leaf verifier program:
+  * It’s not a part of the Continuation Verifier because it depends on the VK of the App VM and it doesn’t affect the VK
+    of the static verifier.
+
+### App VM
+
+The App VM executes an executable with inputs and returns a list of segment proofs.
+
+## Segment
+
+Logical Input:
+
+* App VM input stream
+
+Cached Trace Commit:
+
+* ProgramAir: commits the program the App VM executed.
+
+Public values:
+
+* `VmConnectorPvs`
+* `MemoryMerklePvs`
+
+User Public Values:
+
+* Up to `num_public_values` public values in a dedicated memory space. These public values are not exposed as public
+  values of segment circuits, but will be exposed by the final proof.
+
+Parameters:
+
+* Number of public values (from upstream)
+* For circuit:
+  * App VM Config
+* For App program:
+  * App FRI parameters to compute its commitment.
+
 # Continuations
 
 Our high-level continuations framework follows previous standard designs (Starkware, Risc0), but uses a novel persistent
@@ -82,9 +280,4 @@ and has the following interactions on the <span style="color:green">MERKLE_BUS</
 - Receive <span style="color:green">**(-1, 0, (as - AS_OFFSET) \* 2^L, node_label, hash_final)**</span>
 
 It receives `values` from the `MEMORY_BUS` and constrains `hash = compress(values, 0)` via the `POSEIDON2_DIRECT_BUS`.
-
-## Aggregation
-
-Given the execution segments of a program, we will prove each segment in a VM segment circuit in parallel. These proofs will then be aggregated in an [aggregation tree](../aggregation.md) by a segment aggregation program. This segment aggregation program will be run inside **a different VM** which **does not** have continuations turned on. The latter VM is called an **Aggregation VM**.
-
-See [Aggregation](../aggregation.md) for more details.
+The aggregation program takes a variable number of consecutive segment proofs and consolidates them into a single proof that captures the entire range of segments.
diff --git a/extensions/algebra/circuit/src/fp2_extension.rs b/extensions/algebra/circuit/src/fp2_extension.rs
index f6276f0c4f..8942c47867 100644
--- a/extensions/algebra/circuit/src/fp2_extension.rs
+++ b/extensions/algebra/circuit/src/fp2_extension.rs
@@ -19,10 +19,14 @@ use openvm_stark_backend::p3_field::PrimeField32;
 use serde::{Deserialize, Serialize};
 use strum::EnumCount;
 
-use crate::fp2_chip::{Fp2AddSubChip, Fp2MulDivChip};
+use crate::{
+    fp2_chip::{Fp2AddSubChip, Fp2MulDivChip},
+    util::deserialize_vec_biguint_from_str,
+};
 
 #[derive(Clone, Debug, derive_new::new, Serialize, Deserialize)]
 pub struct Fp2Extension {
+    #[serde(deserialize_with = "deserialize_vec_biguint_from_str")]
     pub supported_modulus: Vec<BigUint>,
 }
 
diff --git a/extensions/algebra/circuit/src/lib.rs b/extensions/algebra/circuit/src/lib.rs
index ffddacc61a..7018513dcf 100644
--- a/extensions/algebra/circuit/src/lib.rs
+++ b/extensions/algebra/circuit/src/lib.rs
@@ -1,6 +1,8 @@
 pub mod fp2_chip;
 pub mod modular_chip;
 
+mod util;
+
 mod fp2;
 pub use fp2::*;
 mod modular_extension;
diff --git a/extensions/algebra/circuit/src/modular_extension.rs b/extensions/algebra/circuit/src/modular_extension.rs
index 604a1ea8e2..ccce43ada0 100644
--- a/extensions/algebra/circuit/src/modular_extension.rs
+++ b/extensions/algebra/circuit/src/modular_extension.rs
@@ -20,13 +20,17 @@ use openvm_stark_backend::p3_field::PrimeField32;
 use serde::{Deserialize, Serialize};
 use strum::EnumCount;
 
-use crate::modular_chip::{
-    ModularAddSubChip, ModularAddSubCoreChip, ModularIsEqualChip, ModularIsEqualCoreChip,
-    ModularMulDivChip, ModularMulDivCoreChip,
+use crate::{
+    modular_chip::{
+        ModularAddSubChip, ModularAddSubCoreChip, ModularIsEqualChip, ModularIsEqualCoreChip,
+        ModularMulDivChip, ModularMulDivCoreChip,
+    },
+    util::deserialize_vec_biguint_from_str,
 };
 
 #[derive(Clone, Debug, derive_new::new, Serialize, Deserialize)]
 pub struct ModularExtension {
+    #[serde(deserialize_with = "deserialize_vec_biguint_from_str")]
     pub supported_modulus: Vec<BigUint>,
 }
 
diff --git a/extensions/algebra/circuit/src/util.rs b/extensions/algebra/circuit/src/util.rs
new file mode 100644
index 0000000000..54bdba4348
--- /dev/null
+++ b/extensions/algebra/circuit/src/util.rs
@@ -0,0 +1,16 @@
+use num_bigint_dig::BigUint;
+use serde::Deserialize;
+
+pub(crate) fn deserialize_vec_biguint_from_str<'de, D>(
+    deserializer: D,
+) -> Result<Vec<BigUint>, D::Error>
+where
+    D: serde::Deserializer<'de>,
+{
+    let v: Vec<String> = Deserialize::deserialize(deserializer)?;
+    let res = v.into_iter().map(|s| s.parse()).collect::<Vec<_>>();
+    if res.iter().any(|x| x.is_err()) {
+        return Err(serde::de::Error::custom("Failed to parse BigUint"));
+    }
+    Ok(res.into_iter().map(|x| x.unwrap()).collect())
+}
diff --git a/extensions/bigint/circuit/src/extension.rs b/extensions/bigint/circuit/src/extension.rs
index b4d3faf304..be112e7ac3 100644
--- a/extensions/bigint/circuit/src/extension.rs
+++ b/extensions/bigint/circuit/src/extension.rs
@@ -56,17 +56,22 @@ impl Default for Int256Rv32Config {
 
 #[derive(Clone, Copy, Debug, Serialize, Deserialize)]
 pub struct Int256 {
+    #[serde(default = "default_range_tuple_checker_sizes")]
     pub range_tuple_checker_sizes: [u32; 2],
 }
 
 impl Default for Int256 {
     fn default() -> Self {
         Self {
-            range_tuple_checker_sizes: [1 << 8, 32 * (1 << 8)],
+            range_tuple_checker_sizes: default_range_tuple_checker_sizes(),
         }
     }
 }
 
+fn default_range_tuple_checker_sizes() -> [u32; 2] {
+    [1 << 8, 32 * (1 << 8)]
+}
+
 #[derive(ChipUsageGetter, Chip, InstructionExecutor, From, AnyEnum)]
 pub enum Int256Executor<F: PrimeField32> {
     BaseAlu256(Rv32BaseAlu256Chip<F>),
diff --git a/extensions/rv32im/circuit/src/extension.rs b/extensions/rv32im/circuit/src/extension.rs
index 499b61579c..f3e24f7f82 100644
--- a/extensions/rv32im/circuit/src/extension.rs
+++ b/extensions/rv32im/circuit/src/extension.rs
@@ -133,17 +133,22 @@ pub struct Rv32Io;
 /// RISC-V 32-bit Multiplication Extension (RV32M) Extension
 #[derive(Clone, Copy, Debug, Serialize, Deserialize)]
 pub struct Rv32M {
+    #[serde(default = "default_range_tuple_checker_sizes")]
     pub range_tuple_checker_sizes: [u32; 2],
 }
 
 impl Default for Rv32M {
     fn default() -> Self {
         Self {
-            range_tuple_checker_sizes: [1 << 8, 8 * (1 << 8)],
+            range_tuple_checker_sizes: default_range_tuple_checker_sizes(),
         }
     }
 }
 
+fn default_range_tuple_checker_sizes() -> [u32; 2] {
+    [1 << 8, 8 * (1 << 8)]
+}
+
 // ============ Executor and Periphery Enums for Extension ============
 
 /// RISC-V 32-bit Base (RV32I) Instruction Executors