diff --git a/hashes/zkevm/src/keccak/co_circuit/circuit.rs b/hashes/zkevm/src/keccak/coprocessor/circuit.rs similarity index 96% rename from hashes/zkevm/src/keccak/co_circuit/circuit.rs rename to hashes/zkevm/src/keccak/coprocessor/circuit.rs index f5b3b04e..ea2ff58c 100644 --- a/hashes/zkevm/src/keccak/co_circuit/circuit.rs +++ b/hashes/zkevm/src/keccak/coprocessor/circuit.rs @@ -5,13 +5,13 @@ use super::{ param::*, }; use crate::{ - keccak::{ - keccak_packed_multi::get_num_keccak_f, multi_keccak, param::*, KeccakAssignedRow, + keccak::native::{ + keccak_packed_multi::get_num_keccak_f, param::*, witness::multi_keccak, KeccakAssignedRow, KeccakCircuitConfig, KeccakConfigParams, }, util::eth_types::Field, }; -use getset::Getters; +use getset::{CopyGetters, Getters}; use halo2_base::{ gates::{ circuit::{builder::BaseCircuitBuilder, BaseCircuitParams, BaseConfig}, @@ -33,6 +33,7 @@ pub struct KeccakCoprocessorCircuit { inputs: Vec>, /// Parameters of this circuit. The same parameters always construct the same circuit. + #[getset(get = "pub")] params: KeccakCoprocessorCircuitParams, base_circuit_builder: RefCell>, @@ -40,23 +41,23 @@ pub struct KeccakCoprocessorCircuit { } /// Parameters of KeccakCoprocessorCircuit. -#[derive(Default, Clone, Getters)] +#[derive(Default, Clone, CopyGetters)] pub struct KeccakCoprocessorCircuitParams { /// This circuit has 2^k rows. - #[getset(get = "pub")] + #[getset(get_copy = "pub")] k: usize, // Number of unusable rows withheld by Halo2. - #[getset(get = "pub")] + #[getset(get_copy = "pub")] num_unusable_row: usize, /// The bits of lookup table for RangeChip. - #[getset(get = "pub")] + #[getset(get_copy = "pub")] lookup_bits: usize, /// Max keccak_f this circuit can accept. The circuit can at most process `capacity` of inputs with < NUM_BYTES_TO_ABSORB bytes or an input with `capacity` * NUM_BYTES_TO_ABSORB - 1 bytes. - #[getset(get = "pub")] + #[getset(get_copy = "pub")] capacity: usize, // If true, publish raw outputs. 
Otherwise, publish Poseidon commitment of raw outputs. - #[getset(get = "pub")] + #[getset(get_copy = "pub")] publish_raw_outputs: bool, // Derived parameters of sub-circuits. @@ -74,7 +75,7 @@ impl KeccakCoprocessorCircuitParams { ) -> Self { assert!(1 << k > num_unusable_row, "Number of unusable rows must be less than 2^k"); let max_rows = (1 << k) - num_unusable_row; - // Derived from [crate::keccak::keccak_packed_multi::get_keccak_capacity]. + // Derived from [crate::keccak::native::keccak_packed_multi::get_keccak_capacity]. let rows_per_round = max_rows / (capacity * (NUM_ROUNDS + 1) + 1 + NUM_WORDS_TO_ABSORB); assert!(rows_per_round > 0, "Not enough rows for the specified capacity"); let keccak_circuit_params = KeccakConfigParams { k: k as u32, rows_per_round }; @@ -157,7 +158,7 @@ impl Circuit for KeccakCoprocessorCircuit { } /// Witnesses to be exposed as circuit outputs. -#[derive(Clone)] +#[derive(Clone, Copy, PartialEq, Debug)] pub struct KeccakCircuitOutput { /// Key for App circuits to lookup keccak hash. pub key: E, @@ -341,7 +342,6 @@ impl KeccakCoprocessorCircuit { dummy_keccak_hi_witness, loaded_keccak_f.is_final, ); - println!("In circuit: {:?}", key.value()); circuit_final_outputs.push(KeccakCircuitOutput { key, hash_lo, hash_hi }); } circuit_final_outputs @@ -388,13 +388,13 @@ impl KeccakCoprocessorCircuit { /// Return circuit outputs of the specified Keccak coprocessor circuit for a specified input. 
pub fn multi_inputs_to_circuit_outputs( inputs: &[Vec], - params: &KeccakCoprocessorCircuitParams, + capacity: usize, ) -> Vec> { assert!(u128::BITS <= F::CAPACITY); let mut outputs = inputs.iter().flat_map(|input| input_to_circuit_outputs::(input)).collect_vec(); - assert!(outputs.len() <= params.capacity); - outputs.resize(params.capacity, dummy_circuit_output()); + assert!(outputs.len() <= capacity); + outputs.resize(capacity, dummy_circuit_output()); outputs } diff --git a/hashes/zkevm/src/keccak/co_circuit/encode.rs b/hashes/zkevm/src/keccak/coprocessor/encode.rs similarity index 99% rename from hashes/zkevm/src/keccak/co_circuit/encode.rs rename to hashes/zkevm/src/keccak/coprocessor/encode.rs index bac2617c..eff9eef3 100644 --- a/hashes/zkevm/src/keccak/co_circuit/encode.rs +++ b/hashes/zkevm/src/keccak/coprocessor/encode.rs @@ -6,7 +6,7 @@ use halo2_base::{ }; use itertools::Itertools; -use crate::{keccak::param::*, util::eth_types::Field}; +use crate::{keccak::native::param::*, util::eth_types::Field}; use super::{circuit::LoadedKeccakF, param::*}; diff --git a/hashes/zkevm/src/keccak/co_circuit/mod.rs b/hashes/zkevm/src/keccak/coprocessor/mod.rs similarity index 100% rename from hashes/zkevm/src/keccak/co_circuit/mod.rs rename to hashes/zkevm/src/keccak/coprocessor/mod.rs diff --git a/hashes/zkevm/src/keccak/co_circuit/param.rs b/hashes/zkevm/src/keccak/coprocessor/param.rs similarity index 100% rename from hashes/zkevm/src/keccak/co_circuit/param.rs rename to hashes/zkevm/src/keccak/coprocessor/param.rs diff --git a/hashes/zkevm/src/keccak/coprocessor/tests/circuit.rs b/hashes/zkevm/src/keccak/coprocessor/tests/circuit.rs new file mode 100644 index 00000000..c93b2078 --- /dev/null +++ b/hashes/zkevm/src/keccak/coprocessor/tests/circuit.rs @@ -0,0 +1,131 @@ +use crate::keccak::coprocessor::circuit::{ + dummy_circuit_output, input_to_circuit_outputs, multi_inputs_to_circuit_outputs, + KeccakCircuitOutput, +}; +use 
halo2_base::halo2_proofs::halo2curves::{bn256::Fr, ff::PrimeField}; +use itertools::Itertools; +use lazy_static::lazy_static; + +lazy_static! { + static ref OUTPUT_EMPTY: KeccakCircuitOutput = KeccakCircuitOutput { + key: Fr::from_raw([ + 0x54595a1525d3534a, + 0xf90e160f1b4648ef, + 0x34d557ddfb89da5d, + 0x04ffe3d4b8885928, + ]), + hash_lo: Fr::from_u128(0xe500b653ca82273b7bfad8045d85a470), + hash_hi: Fr::from_u128(0xc5d2460186f7233c927e7db2dcc703c0), + }; + static ref OUTPUT_0: KeccakCircuitOutput = KeccakCircuitOutput { + key: Fr::from_raw([ + 0xc009f26a12e2f494, + 0xb4a9d43c17609251, + 0x68068b5344cba120, + 0x1531327ea92d38ba, + ]), + hash_lo: Fr::from_u128(0x6612f7b477d66591ff96a9e064bcc98a), + hash_hi: Fr::from_u128(0xbc36789e7a1e281436464229828f817d), + }; + static ref OUTPUT_0_135: KeccakCircuitOutput = KeccakCircuitOutput { + key: Fr::from_raw([ + 0x9a88287adab4da1c, + 0xe9ff61b507cfd8c2, + 0xdbf697a6a3ad66a1, + 0x1eb1d5cc8cdd1532, + ]), + hash_lo: Fr::from_u128(0x290b0e1706f6a82e5a595b9ce9faca62), + hash_hi: Fr::from_u128(0xcbdfd9dee5faad3818d6b06f95a219fd), + }; + static ref OUTPUT_0_136: KeccakCircuitOutput = KeccakCircuitOutput { + key: Fr::from_raw([ + 0x39c1a578acb62676, + 0x0dc19a75e610c062, + 0x3f158e809150a14a, + 0x2367059ac8c80538, + ]), + hash_lo: Fr::from_u128(0xff11fe3e38e17df89cf5d29c7d7f807e), + hash_hi: Fr::from_u128(0x7ce759f1ab7f9ce437719970c26b0a66), + }; + static ref OUTPUT_0_200: KeccakCircuitOutput = KeccakCircuitOutput { + key: Fr::from_raw([ + 0x379bfca638552583, + 0x1bf7bd603adec30e, + 0x05efe90ad5dbd814, + 0x053c729cb8908ccb, + ]), + hash_lo: Fr::from_u128(0xb4543f3d2703c0923c6901c2af57b890), + hash_hi: Fr::from_u128(0xbfb0aa97863e797943cf7c33bb7e880b), + }; +} + +#[test] +fn test_dummy_circuit_output() { + let KeccakCircuitOutput { key, hash_lo, hash_hi } = dummy_circuit_output::(); + assert_eq!(key, OUTPUT_EMPTY.key); + assert_eq!(hash_lo, OUTPUT_EMPTY.hash_lo); + assert_eq!(hash_hi, OUTPUT_EMPTY.hash_hi); +} + +#[test] +fn 
test_input_to_circuit_outputs_empty() { + let result = input_to_circuit_outputs::(&[]); + assert_eq!(result, vec![*OUTPUT_EMPTY]); +} + +#[test] +fn test_input_to_circuit_outputs_1_keccak_f() { + let result = input_to_circuit_outputs::(&[0]); + assert_eq!(result, vec![*OUTPUT_0]); +} + +#[test] +fn test_input_to_circuit_outputs_1_keccak_f_full() { + let result = input_to_circuit_outputs::(&(0..135).collect_vec()); + assert_eq!(result, vec![*OUTPUT_0_135]); +} + +#[test] +fn test_input_to_circuit_outputs_2_keccak_f_2nd_empty() { + let result = input_to_circuit_outputs::(&(0..136).collect_vec()); + assert_eq!(result, vec![*OUTPUT_EMPTY, *OUTPUT_0_136]); +} + +#[test] +fn test_input_to_circuit_outputs_2_keccak_f() { + let result = input_to_circuit_outputs::(&(0..200).collect_vec()); + assert_eq!(result, vec![*OUTPUT_EMPTY, *OUTPUT_0_200]); +} + +#[test] +fn test_multi_input_to_circuit_outputs() { + let results = multi_inputs_to_circuit_outputs::( + &[(0..135).collect_vec(), (0..200).collect_vec(), vec![], vec![0], (0..136).collect_vec()], + 10, + ); + assert_eq!( + results, + vec![ + *OUTPUT_0_135, + *OUTPUT_EMPTY, + *OUTPUT_0_200, + *OUTPUT_EMPTY, + *OUTPUT_0, + *OUTPUT_EMPTY, + *OUTPUT_0_136, + // Padding + *OUTPUT_EMPTY, + *OUTPUT_EMPTY, + *OUTPUT_EMPTY, + ] + ); +} + +#[test] +#[should_panic] +fn test_multi_input_to_circuit_outputs_exceed_capacity() { + let _ = multi_inputs_to_circuit_outputs::( + &[(0..135).collect_vec(), (0..200).collect_vec(), vec![], vec![0], (0..136).collect_vec()], + 2, + ); +} diff --git a/hashes/zkevm/src/keccak/co_circuit/tests/mod.rs b/hashes/zkevm/src/keccak/coprocessor/tests/mod.rs similarity index 93% rename from hashes/zkevm/src/keccak/co_circuit/tests/mod.rs rename to hashes/zkevm/src/keccak/coprocessor/tests/mod.rs index 8e355dd4..0bfb7cc2 100644 --- a/hashes/zkevm/src/keccak/co_circuit/tests/mod.rs +++ b/hashes/zkevm/src/keccak/coprocessor/tests/mod.rs @@ -5,8 +5,11 @@ use super::circuit::{ use 
halo2_base::halo2_proofs::{dev::MockProver, halo2curves::bn256::Fr}; use itertools::Itertools; +#[cfg(test)] +mod circuit; + #[test] -fn test() { +fn test_mock_leaf_circuit() { let k: usize = 18; let num_unusable_row: usize = 109; let lookup_bits: usize = 4; @@ -30,7 +33,7 @@ fn test() { publish_raw_outputs, ); let circuit = KeccakCoprocessorCircuit::::new(inputs.clone(), params.clone()); - let circuit_outputs = multi_inputs_to_circuit_outputs::(&inputs, ¶ms); + let circuit_outputs = multi_inputs_to_circuit_outputs::(&inputs, params.capacity()); let instances = vec![ circuit_outputs.iter().map(|o| o.key).collect_vec(), diff --git a/hashes/zkevm/src/keccak/mod.rs b/hashes/zkevm/src/keccak/mod.rs index 468516af..7431a839 100644 --- a/hashes/zkevm/src/keccak/mod.rs +++ b/hashes/zkevm/src/keccak/mod.rs @@ -1,1301 +1,4 @@ -use self::{cell_manager::*, keccak_packed_multi::*, param::*, table::*, util::*}; -use super::util::{ - constraint_builder::BaseConstraintBuilder, - eth_types::{self, Field}, - expression::{and, not, select, Expr}, -}; -use crate::{ - halo2_proofs::{ - circuit::{Layouter, Region, Value}, - halo2curves::ff::PrimeField, - plonk::{Column, ConstraintSystem, Error, Expression, Fixed, TableColumn, VirtualCells}, - poly::Rotation, - }, - util::{ - expression::{from_bytes, sum}, - word::{self, Word, WordExpr}, - }, -}; -use halo2_base::utils::halo2::{raw_assign_advice, raw_assign_fixed}; -use itertools::Itertools; -use log::{debug, info}; -use rayon::prelude::{IntoParallelRefIterator, ParallelIterator}; -use std::marker::PhantomData; - -pub mod cell_manager; /// Module for coprocessor circuits. 
-pub mod co_circuit; -pub mod keccak_packed_multi; -pub mod param; -pub mod table; -#[cfg(test)] -mod tests; -pub mod util; - -/// Configuration parameters to define [`KeccakCircuitConfig`] -#[derive(Copy, Clone, Debug, Default)] -pub struct KeccakConfigParams { - /// The circuit degree, i.e., circuit has 2k rows - pub k: u32, - /// The number of rows to use for each round in the keccak_f permutation - pub rows_per_round: usize, -} - -/// KeccakConfig -#[derive(Clone, Debug)] -pub struct KeccakCircuitConfig { - // Bool. True on 1st row of each round. - q_enable: Column, - // Bool. True on 1st row. - q_first: Column, - // Bool. True on 1st row of all rounds except last rounds. - q_round: Column, - // Bool. True on 1st row of last rounds. - q_absorb: Column, - // Bool. True on 1st row of last rounds. - q_round_last: Column, - // Bool. True on 1st row of rounds which might contain inputs. - // Note: first NUM_WORDS_TO_ABSORB rounds of each chunk might contain inputs. - // It "might" contain inputs because it's possible that a round only have paddings. - q_input: Column, - // Bool. True on 1st row of all last input round. 
- q_input_last: Column, - - pub keccak_table: KeccakTable, - - cell_manager: CellManager, - round_cst: Column, - normalize_3: [TableColumn; 2], - normalize_4: [TableColumn; 2], - normalize_6: [TableColumn; 2], - chi_base_table: [TableColumn; 2], - pack_table: [TableColumn; 2], - - // config parameters for convenience - pub parameters: KeccakConfigParams, - - _marker: PhantomData, -} - -impl KeccakCircuitConfig { - /// Return a new KeccakCircuitConfig - pub fn new(meta: &mut ConstraintSystem, parameters: KeccakConfigParams) -> Self { - let k = parameters.k; - let num_rows_per_round = parameters.rows_per_round; - - let q_enable = meta.fixed_column(); - let q_first = meta.fixed_column(); - let q_round = meta.fixed_column(); - let q_absorb = meta.fixed_column(); - let q_round_last = meta.fixed_column(); - let q_input = meta.fixed_column(); - let q_input_last = meta.fixed_column(); - let round_cst = meta.fixed_column(); - let keccak_table = KeccakTable::construct(meta); - - let is_final = keccak_table.is_enabled; - let hash_word = keccak_table.output; - - let normalize_3 = array_init::array_init(|_| meta.lookup_table_column()); - let normalize_4 = array_init::array_init(|_| meta.lookup_table_column()); - let normalize_6 = array_init::array_init(|_| meta.lookup_table_column()); - let chi_base_table = array_init::array_init(|_| meta.lookup_table_column()); - let pack_table = array_init::array_init(|_| meta.lookup_table_column()); - - let mut cell_manager = CellManager::new(num_rows_per_round); - let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); - let mut total_lookup_counter = 0; - - let start_new_hash = |meta: &mut VirtualCells, rot| { - // A new hash is started when the previous hash is done or on the first row - meta.query_fixed(q_first, rot) + meta.query_advice(is_final, rot) - }; - - // Round constant - let mut round_cst_expr = 0.expr(); - meta.create_gate("Query round cst", |meta| { - round_cst_expr = meta.query_fixed(round_cst, Rotation::cur()); - 
vec![0u64.expr()] - }); - // State data - let mut s = vec![vec![0u64.expr(); 5]; 5]; - let mut s_next = vec![vec![0u64.expr(); 5]; 5]; - for i in 0..5 { - for j in 0..5 { - let cell = cell_manager.query_cell(meta); - s[i][j] = cell.expr(); - s_next[i][j] = cell.at_offset(meta, num_rows_per_round as i32).expr(); - } - } - // Absorb data - let absorb_from = cell_manager.query_cell(meta); - let absorb_data = cell_manager.query_cell(meta); - let absorb_result = cell_manager.query_cell(meta); - let mut absorb_from_next = vec![0u64.expr(); NUM_WORDS_TO_ABSORB]; - let mut absorb_data_next = vec![0u64.expr(); NUM_WORDS_TO_ABSORB]; - let mut absorb_result_next = vec![0u64.expr(); NUM_WORDS_TO_ABSORB]; - for i in 0..NUM_WORDS_TO_ABSORB { - let rot = ((i + 1) * num_rows_per_round) as i32; - absorb_from_next[i] = absorb_from.at_offset(meta, rot).expr(); - absorb_data_next[i] = absorb_data.at_offset(meta, rot).expr(); - absorb_result_next[i] = absorb_result.at_offset(meta, rot).expr(); - } - - // Store the pre-state - let pre_s = s.clone(); - - // Absorb - // The absorption happening at the start of the 24 rounds is done spread out - // over those 24 rounds. In a single round (in 17 of the 24 rounds) a - // single word is absorbed so the work is spread out. The absorption is - // done simply by doing state + data and then normalizing the result to [0,1]. - // We also need to convert the input data into bytes to calculate the input data - // rlc. 
- cell_manager.start_region(); - let mut lookup_counter = 0; - let part_size = get_num_bits_per_absorb_lookup(k); - let input = absorb_from.expr() + absorb_data.expr(); - let absorb_fat = - split::expr(meta, &mut cell_manager, &mut cb, input, 0, part_size, false, None); - cell_manager.start_region(); - let absorb_res = transform::expr( - "absorb", - meta, - &mut cell_manager, - &mut lookup_counter, - absorb_fat, - normalize_3, - true, - ); - cb.require_equal("absorb result", decode::expr(absorb_res), absorb_result.expr()); - info!("- Post absorb:"); - info!("Lookups: {}", lookup_counter); - info!("Columns: {}", cell_manager.get_width()); - total_lookup_counter += lookup_counter; - - // Squeeze - // The squeezing happening at the end of the 24 rounds is done spread out - // over those 24 rounds. In a single round (in 4 of the 24 rounds) a - // single word is converted to bytes. - cell_manager.start_region(); - let mut lookup_counter = 0; - // Potential optimization: could do multiple bytes per lookup - let packed_parts = - split::expr(meta, &mut cell_manager, &mut cb, absorb_data.expr(), 0, 8, false, None); - cell_manager.start_region(); - // input_bytes.len() = packed_parts.len() = 64 / 8 = 8 = NUM_BYTES_PER_WORD - let input_bytes = transform::expr( - "squeeze unpack", - meta, - &mut cell_manager, - &mut lookup_counter, - packed_parts, - pack_table.into_iter().rev().collect::>().try_into().unwrap(), - true, - ); - debug_assert_eq!(input_bytes.len(), NUM_BYTES_PER_WORD); - - // Padding data - cell_manager.start_region(); - let is_paddings = input_bytes.iter().map(|_| cell_manager.query_cell(meta)).collect_vec(); - info!("- Post padding:"); - info!("Lookups: {}", lookup_counter); - info!("Columns: {}", cell_manager.get_width()); - total_lookup_counter += lookup_counter; - - // Theta - // Calculate - // - `c[i] = s[i][0] + s[i][1] + s[i][2] + s[i][3] + s[i][4]` - // - `bc[i] = normalize(c)`. 
- // - `t[i] = bc[(i + 4) % 5] + rot(bc[(i + 1)% 5], 1)` - // This is done by splitting the bc values in parts in a way - // that allows us to also calculate the rotated value "for free". - cell_manager.start_region(); - let mut lookup_counter = 0; - let part_size_c = get_num_bits_per_theta_c_lookup(k); - let mut c_parts = Vec::new(); - for s in s.iter() { - // Calculate c and split into parts - let c = s[0].clone() + s[1].clone() + s[2].clone() + s[3].clone() + s[4].clone(); - c_parts.push(split::expr( - meta, - &mut cell_manager, - &mut cb, - c, - 1, - part_size_c, - false, - None, - )); - } - // Now calculate `bc` by normalizing `c` - cell_manager.start_region(); - let mut bc = Vec::new(); - for c in c_parts { - // Normalize c - bc.push(transform::expr( - "theta c", - meta, - &mut cell_manager, - &mut lookup_counter, - c, - normalize_6, - true, - )); - } - // Now do `bc[(i + 4) % 5] + rot(bc[(i + 1) % 5], 1)` using just expressions. - // We don't normalize the result here. We do it as part of the rho/pi step, even - // though we would only have to normalize 5 values instead of 25, because of the - // way the rho/pi and chi steps can be combined it's more efficient to - // do it there (the max value for chi is 4 already so that's the - // limiting factor). - let mut os = vec![vec![0u64.expr(); 5]; 5]; - for i in 0..5 { - let t = decode::expr(bc[(i + 4) % 5].clone()) - + decode::expr(rotate(bc[(i + 1) % 5].clone(), 1, part_size_c)); - for j in 0..5 { - os[i][j] = s[i][j].clone() + t.clone(); - } - } - s = os.clone(); - info!("- Post theta:"); - info!("Lookups: {}", lookup_counter); - info!("Columns: {}", cell_manager.get_width()); - total_lookup_counter += lookup_counter; - - // Rho/Pi - // For the rotation of rho/pi we split up the words like expected, but in a way - // that allows reusing the same parts in an optimal way for the chi step. 
- // We can save quite a few columns by not recombining the parts after rho/pi and - // re-splitting the words again before chi. Instead we do chi directly - // on the output parts of rho/pi. For rho/pi specically we do - // `s[j][2 * i + 3 * j) % 5] = normalize(rot(s[i][j], RHOM[i][j]))`. - cell_manager.start_region(); - let mut lookup_counter = 0; - let part_size = get_num_bits_per_base_chi_lookup(k); - // To combine the rho/pi/chi steps we have to ensure a specific layout so - // query those cells here first. - // For chi we have to do `s[i][j] ^ ((~s[(i+1)%5][j]) & s[(i+2)%5][j])`. `j` - // remains static but `i` is accessed in a wrap around manner. To do this using - // multiple rows with lookups in a way that doesn't require any - // extra additional cells or selectors we have to put all `s[i]`'s on the same - // row. This isn't that strong of a requirement actually because we the - // words are split into multipe parts, and so only the parts at the same - // position of those words need to be on the same row. - let target_word_sizes = target_part_sizes(part_size); - let num_word_parts = target_word_sizes.len(); - let mut rho_pi_chi_cells: [[[Vec>; 5]; 5]; 3] = array_init::array_init(|_| { - array_init::array_init(|_| array_init::array_init(|_| Vec::new())) - }); - let mut num_columns = 0; - let mut column_starts = [0usize; 3]; - for p in 0..3 { - column_starts[p] = cell_manager.start_region(); - let mut row_idx = 0; - num_columns = 0; - for j in 0..5 { - for _ in 0..num_word_parts { - for i in 0..5 { - rho_pi_chi_cells[p][i][j] - .push(cell_manager.query_cell_at_row(meta, row_idx)); - } - if row_idx == 0 { - num_columns += 1; - } - row_idx = (((row_idx as usize) + 1) % num_rows_per_round) as i32; - } - } - } - // Do the transformation, resulting in the word parts also being normalized. 
- let pi_region_start = cell_manager.start_region(); - let mut os_parts = vec![vec![Vec::new(); 5]; 5]; - for (j, os_part) in os_parts.iter_mut().enumerate() { - for i in 0..5 { - // Split s into parts - let s_parts = split_uniform::expr( - meta, - &rho_pi_chi_cells[0][j][(2 * i + 3 * j) % 5], - &mut cell_manager, - &mut cb, - s[i][j].clone(), - RHO_MATRIX[i][j], - part_size, - true, - ); - // Normalize the data to the target cells - let s_parts = transform_to::expr( - "rho/pi", - meta, - &rho_pi_chi_cells[1][j][(2 * i + 3 * j) % 5], - &mut lookup_counter, - s_parts.clone(), - normalize_4, - true, - ); - os_part[(2 * i + 3 * j) % 5] = s_parts.clone(); - } - } - let pi_region_end = cell_manager.start_region(); - // Pi parts range checks - // To make the uniform stuff work we had to combine some parts together - // in new cells (see split_uniform). Here we make sure those parts are range - // checked. Potential improvement: Could combine multiple smaller parts - // in a single lookup but doesn't save that much. - for c in pi_region_start..pi_region_end { - meta.lookup("pi part range check", |_| { - vec![(cell_manager.columns()[c].expr.clone(), normalize_4[0])] - }); - lookup_counter += 1; - } - info!("- Post rho/pi:"); - info!("Lookups: {}", lookup_counter); - info!("Columns: {}", cell_manager.get_width()); - total_lookup_counter += lookup_counter; - - // Chi - // In groups of 5 columns, we have to do `s[i][j] ^ ((~s[(i+1)%5][j]) & - // s[(i+2)%5][j])` five times, on each row (no selector needed). - // This is calculated by making use of `CHI_BASE_LOOKUP_TABLE`. 
- let mut lookup_counter = 0; - let part_size_base = get_num_bits_per_base_chi_lookup(k); - for idx in 0..num_columns { - // First fetch the cells we wan to use - let mut input: [Expression; 5] = array_init::array_init(|_| 0.expr()); - let mut output: [Expression; 5] = array_init::array_init(|_| 0.expr()); - for c in 0..5 { - input[c] = cell_manager.columns()[column_starts[1] + idx * 5 + c].expr.clone(); - output[c] = cell_manager.columns()[column_starts[2] + idx * 5 + c].expr.clone(); - } - // Now calculate `a ^ ((~b) & c)` by doing `lookup[3 - 2*a + b - c]` - for i in 0..5 { - let input = scatter::expr(3, part_size_base) - 2.expr() * input[i].clone() - + input[(i + 1) % 5].clone() - - input[(i + 2) % 5].clone(); - let output = output[i].clone(); - meta.lookup("chi base", |_| { - vec![(input.clone(), chi_base_table[0]), (output.clone(), chi_base_table[1])] - }); - lookup_counter += 1; - } - } - // Now just decode the parts after the chi transformation done with the lookups - // above. - let mut os = vec![vec![0u64.expr(); 5]; 5]; - for (i, os) in os.iter_mut().enumerate() { - for (j, os) in os.iter_mut().enumerate() { - let mut parts = Vec::new(); - for idx in 0..num_word_parts { - parts.push(Part { - num_bits: part_size_base, - cell: rho_pi_chi_cells[2][i][j][idx].clone(), - expr: rho_pi_chi_cells[2][i][j][idx].expr(), - }); - } - *os = decode::expr(parts); - } - } - s = os.clone(); - - // iota - // Simply do the single xor on state [0][0]. - cell_manager.start_region(); - let part_size = get_num_bits_per_absorb_lookup(k); - let input = s[0][0].clone() + round_cst_expr.clone(); - let iota_parts = - split::expr(meta, &mut cell_manager, &mut cb, input, 0, part_size, false, None); - cell_manager.start_region(); - // Could share columns with absorb which may end up using 1 lookup/column - // fewer... 
- s[0][0] = decode::expr(transform::expr( - "iota", - meta, - &mut cell_manager, - &mut lookup_counter, - iota_parts, - normalize_3, - true, - )); - // Final results stored in the next row - for i in 0..5 { - for j in 0..5 { - cb.require_equal("next row check", s[i][j].clone(), s_next[i][j].clone()); - } - } - info!("- Post chi:"); - info!("Lookups: {}", lookup_counter); - info!("Columns: {}", cell_manager.get_width()); - total_lookup_counter += lookup_counter; - - let mut lookup_counter = 0; - cell_manager.start_region(); - - // Squeeze data - let squeeze_from = cell_manager.query_cell(meta); - let mut squeeze_from_prev = vec![0u64.expr(); NUM_WORDS_TO_SQUEEZE]; - for (idx, squeeze_from_prev) in squeeze_from_prev.iter_mut().enumerate() { - let rot = (-(idx as i32) - 1) * num_rows_per_round as i32; - *squeeze_from_prev = squeeze_from.at_offset(meta, rot).expr(); - } - // Squeeze - // The squeeze happening at the end of the 24 rounds is done spread out - // over those 24 rounds. In a single round (in 4 of the 24 rounds) a - // single word is converted to bytes. 
- // Potential optimization: could do multiple bytes per lookup - cell_manager.start_region(); - // Unpack a single word into bytes (for the squeeze) - // Potential optimization: could do multiple bytes per lookup - let squeeze_from_parts = - split::expr(meta, &mut cell_manager, &mut cb, squeeze_from.expr(), 0, 8, false, None); - cell_manager.start_region(); - let squeeze_bytes = transform::expr( - "squeeze unpack", - meta, - &mut cell_manager, - &mut lookup_counter, - squeeze_from_parts, - pack_table.into_iter().rev().collect::>().try_into().unwrap(), - true, - ); - info!("- Post squeeze:"); - info!("Lookups: {}", lookup_counter); - info!("Columns: {}", cell_manager.get_width()); - total_lookup_counter += lookup_counter; - - // The round constraints that we've been building up till now - meta.create_gate("round", |meta| cb.gate(meta.query_fixed(q_round, Rotation::cur()))); - - // Absorb - meta.create_gate("absorb", |meta| { - let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); - let continue_hash = not::expr(start_new_hash(meta, Rotation::cur())); - let absorb_positions = get_absorb_positions(); - let mut a_slice = 0; - for j in 0..5 { - for i in 0..5 { - if absorb_positions.contains(&(i, j)) { - cb.condition(continue_hash.clone(), |cb| { - cb.require_equal( - "absorb verify input", - absorb_from_next[a_slice].clone(), - pre_s[i][j].clone(), - ); - }); - cb.require_equal( - "absorb result copy", - select::expr( - continue_hash.clone(), - absorb_result_next[a_slice].clone(), - absorb_data_next[a_slice].clone(), - ), - s_next[i][j].clone(), - ); - a_slice += 1; - } else { - cb.require_equal( - "absorb state copy", - pre_s[i][j].clone() * continue_hash.clone(), - s_next[i][j].clone(), - ); - } - } - } - cb.gate(meta.query_fixed(q_absorb, Rotation::cur())) - }); - - // Collect the bytes that are spread out over previous rows - let mut hash_bytes = Vec::new(); - for i in 0..NUM_WORDS_TO_SQUEEZE { - for byte in squeeze_bytes.iter() { - let rot = (-(i as i32) - 1) * 
num_rows_per_round as i32; - hash_bytes.push(byte.cell.at_offset(meta, rot).expr()); - } - } - - // Squeeze - meta.create_gate("squeeze", |meta| { - let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); - let start_new_hash = start_new_hash(meta, Rotation::cur()); - // The words to squeeze - let hash_words: Vec<_> = - pre_s.into_iter().take(4).map(|a| a[0].clone()).take(4).collect(); - // Verify if we converted the correct words to bytes on previous rows - for (idx, word) in hash_words.iter().enumerate() { - cb.condition(start_new_hash.clone(), |cb| { - cb.require_equal( - "squeeze verify packed", - word.clone(), - squeeze_from_prev[idx].clone(), - ); - }); - } - - let hash_bytes_le = hash_bytes.into_iter().rev().collect::>(); - cb.condition(start_new_hash, |cb| { - cb.require_equal_word( - "output check", - word::Word32::new(hash_bytes_le.try_into().expect("32 limbs")).to_word(), - hash_word.map(|col| meta.query_advice(col, Rotation::cur())), - ); - }); - cb.gate(meta.query_fixed(q_round_last, Rotation::cur())) - }); - - // Some general input checks - meta.create_gate("input checks", |meta| { - let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); - cb.require_boolean("boolean is_final", meta.query_advice(is_final, Rotation::cur())); - cb.gate(meta.query_fixed(q_enable, Rotation::cur())) - }); - - // Enforce fixed values on the first row - meta.create_gate("first row", |meta| { - let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); - cb.require_zero( - "is_final needs to be disabled on the first row", - meta.query_advice(is_final, Rotation::cur()), - ); - cb.gate(meta.query_fixed(q_first, Rotation::cur())) - }); - - // some utility query functions - let q = |col: Column, meta: &mut VirtualCells<'_, F>| { - meta.query_fixed(col, Rotation::cur()) - }; - /* - eg: - data: - get_num_rows_per_round: 18 - input: "12345678abc" - table: - Note[1]: be careful: is_paddings is not column here! It is [Cell; 8] and it will be constrained later. 
- Note[2]: only first row of each round has constraints on bytes_left. This example just shows how witnesses are filled. - offset word_value bytes_left is_paddings q_enable q_input_last - 18 0x87654321 11 0 1 0 // 1st round begin - 19 0 10 0 0 0 - 20 0 9 0 0 0 - 21 0 8 0 0 0 - 22 0 7 0 0 0 - 23 0 6 0 0 0 - 24 0 5 0 0 0 - 25 0 4 0 0 0 - 26 0 4 NA 0 0 - ... - 35 0 4 NA 0 0 // 1st round end - 36 0xcba 3 0 1 1 // 2nd round begin - 37 0 2 0 0 0 - 38 0 1 0 0 0 - 39 0 0 1 0 0 - 40 0 0 1 0 0 - 41 0 0 1 0 0 - 42 0 0 1 0 0 - 43 0 0 1 0 0 - */ - - meta.create_gate("word_value", |meta| { - let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); - let masked_input_bytes = input_bytes - .iter() - .zip(is_paddings.clone()) - .map(|(input_byte, is_padding)| { - input_byte.expr.clone() * not::expr(is_padding.expr().clone()) - }) - .collect_vec(); - let input_word = from_bytes::expr(&masked_input_bytes); - cb.require_equal( - "word value", - input_word, - meta.query_advice(keccak_table.word_value, Rotation::cur()), - ); - cb.gate(q(q_input, meta)) - }); - meta.create_gate("bytes_left", |meta| { - let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); - let bytes_left_expr = meta.query_advice(keccak_table.bytes_left, Rotation::cur()); - - // bytes_left is 0 in the absolute first `rows_per_round` of the entire circuit, i.e., the first dummy round. - cb.condition(q(q_first, meta), |cb| { - cb.require_zero( - "bytes_left needs to be zero on the absolute first dummy round", - meta.query_advice(keccak_table.bytes_left, Rotation::cur()), - ); - }); - // is_final ==> bytes_left == 0. - // Note: is_final = true only in the last round, which doesn't have any data to absorb. 
- cb.condition(meta.query_advice(is_final, Rotation::cur()), |cb| { - cb.require_zero("bytes_left should be 0 when is_final", bytes_left_expr.clone()); - }); - //q_input[cur] ==> bytes_left[cur + num_rows_per_round] + word_len == bytes_left[cur] - cb.condition(q(q_input, meta), |cb| { - // word_len = NUM_BYTES_PER_WORD - sum(is_paddings) - let word_len = NUM_BYTES_PER_WORD.expr() - sum::expr(is_paddings.clone()); - let bytes_left_next_expr = - meta.query_advice(keccak_table.bytes_left, Rotation(num_rows_per_round as i32)); - cb.require_equal( - "if there is a word in this round, bytes_left[curr + num_rows_per_round] + word_len == bytes_left[curr]", - bytes_left_expr.clone(), - bytes_left_next_expr + word_len, - ); - }); - // !q_input[cur] && !start_new_hash(cur) ==> bytes_left[cur + num_rows_per_round] == bytes_left[cur] - // !q_input[cur] && !start_new_hash(cur) === !(q_input[cur] || start_new_hash(cur)) - // Because q_input[cur] and start_new_hash(cur) are never both true at the same time, we use + instead of or in order to save a degree. 
- cb.condition(not::expr(q(q_input, meta) + start_new_hash(meta, Rotation::cur())), |cb| { - let bytes_left_next_expr = - meta.query_advice(keccak_table.bytes_left, Rotation(num_rows_per_round as i32)); - cb.require_equal( - "if no input and not starting new hash, bytes_left should keep the same", - bytes_left_expr, - bytes_left_next_expr, - ); - }); - - cb.gate(q(q_enable, meta)) - }); - - // Enforce logic for when this block is the last block for a hash - let last_is_padding_in_block = is_paddings.last().unwrap().at_offset( - meta, - -(((NUM_ROUNDS + 1 - NUM_WORDS_TO_ABSORB) * num_rows_per_round) as i32), - ); - meta.create_gate("is final", |meta| { - let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); - // All absorb rows except the first row - cb.condition( - meta.query_fixed(q_absorb, Rotation::cur()) - - meta.query_fixed(q_first, Rotation::cur()), - |cb| { - cb.require_equal( - "is_final needs to be the same as the last is_padding in the block", - meta.query_advice(is_final, Rotation::cur()), - last_is_padding_in_block.expr(), - ); - }, - ); - // For all the rows of a round, only the first row can have `is_final == 1`. - cb.condition( - (1..num_rows_per_round as i32) - .map(|i| meta.query_fixed(q_enable, Rotation(-i))) - .fold(0.expr(), |acc, elem| acc + elem), - |cb| { - cb.require_zero( - "is_final only when q_enable", - meta.query_advice(is_final, Rotation::cur()), - ); - }, - ); - cb.gate(1.expr()) - }); - - // Padding - // May be cleaner to do this padding logic in the byte conversion lookup but - // currently easier to do it like this. 
- let prev_is_padding = - is_paddings.last().unwrap().at_offset(meta, -(num_rows_per_round as i32)); - meta.create_gate("padding", |meta| { - let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); - let q_input = meta.query_fixed(q_input, Rotation::cur()); - let q_input_last = meta.query_fixed(q_input_last, Rotation::cur()); - - // All padding selectors need to be boolean - for is_padding in is_paddings.iter() { - cb.condition(meta.query_fixed(q_enable, Rotation::cur()), |cb| { - cb.require_boolean("is_padding boolean", is_padding.expr()); - }); - } - // This last padding selector will be used on the first round row so needs to be - // zero - cb.condition(meta.query_fixed(q_absorb, Rotation::cur()), |cb| { - cb.require_zero( - "last is_padding should be zero on absorb rows", - is_paddings.last().unwrap().expr(), - ); - }); - // Now for each padding selector - for idx in 0..is_paddings.len() { - // Previous padding selector can be on the previous row - let is_padding_prev = - if idx == 0 { prev_is_padding.expr() } else { is_paddings[idx - 1].expr() }; - let is_first_padding = is_paddings[idx].expr() - is_padding_prev.clone(); - - // Check padding transition 0 -> 1 done only once - cb.condition(q_input.expr(), |cb| { - cb.require_boolean("padding step boolean", is_first_padding.clone()); - }); - - // Padding start/intermediate/end byte checks - if idx == is_paddings.len() - 1 { - // These can be combined in the future, but currently this would increase the - // degree by one Padding start/intermediate byte, all - // padding rows except the last one - cb.condition( - and::expr([q_input.expr() - q_input_last.expr(), is_paddings[idx].expr()]), - |cb| { - // Input bytes need to be zero, or one if this is the first padding byte - cb.require_equal( - "padding start/intermediate byte last byte", - input_bytes[idx].expr.clone(), - is_first_padding.expr(), - ); - }, - ); - // Padding start/end byte, only on the last padding row - cb.condition(and::expr([q_input_last.expr(), 
is_paddings[idx].expr()]), |cb| { - // The input byte needs to be 128, unless it's also the first padding - // byte then it's 129 - cb.require_equal( - "padding start/end byte", - input_bytes[idx].expr.clone(), - is_first_padding.expr() + 128.expr(), - ); - }); - } else { - // Padding start/intermediate byte - cb.condition(and::expr([q_input.expr(), is_paddings[idx].expr()]), |cb| { - // Input bytes need to be zero, or one if this is the first padding byte - cb.require_equal( - "padding start/intermediate byte", - input_bytes[idx].expr.clone(), - is_first_padding.expr(), - ); - }); - } - } - cb.gate(1.expr()) - }); - - info!("Degree: {}", meta.degree()); - info!("Minimum rows: {}", meta.minimum_rows()); - info!("Total Lookups: {}", total_lookup_counter); - #[cfg(feature = "display")] - { - println!("Total Keccak Columns: {}", cell_manager.get_width()); - std::env::set_var("KECCAK_ADVICE_COLUMNS", cell_manager.get_width().to_string()); - } - #[cfg(not(feature = "display"))] - info!("Total Keccak Columns: {}", cell_manager.get_width()); - info!("num unused cells: {}", cell_manager.get_num_unused_cells()); - info!("part_size absorb: {}", get_num_bits_per_absorb_lookup(k)); - info!("part_size theta: {}", get_num_bits_per_theta_c_lookup(k)); - info!("part_size theta c: {}", get_num_bits_per_lookup(THETA_C_LOOKUP_RANGE, k)); - info!("part_size theta t: {}", get_num_bits_per_lookup(4, k)); - info!("part_size rho/pi: {}", get_num_bits_per_rho_pi_lookup(k)); - info!("part_size chi base: {}", get_num_bits_per_base_chi_lookup(k)); - info!("uniform part sizes: {:?}", target_part_sizes(get_num_bits_per_theta_c_lookup(k))); - - KeccakCircuitConfig { - q_enable, - q_first, - q_round, - q_absorb, - q_round_last, - q_input, - q_input_last, - keccak_table, - cell_manager, - round_cst, - normalize_3, - normalize_4, - normalize_6, - chi_base_table, - pack_table, - parameters, - _marker: PhantomData, - } - } -} - -#[derive(Clone)] -pub struct KeccakAssignedRow<'v, F: Field> { - pub 
is_final: KeccakAssignedValue<'v, F>, - pub hash_lo: KeccakAssignedValue<'v, F>, - pub hash_hi: KeccakAssignedValue<'v, F>, - pub bytes_left: KeccakAssignedValue<'v, F>, - pub word_value: KeccakAssignedValue<'v, F>, -} - -impl KeccakCircuitConfig { - /// Returns vector of `is_final`, `length`, `hash.lo`, `hash.hi` for assigned rows - pub fn assign<'v>( - &self, - region: &mut Region, - witness: &[KeccakRow], - ) -> Vec> { - witness - .iter() - .enumerate() - .map(|(offset, keccak_row)| self.set_row(region, offset, keccak_row)) - .collect() - } - - /// Output is `is_final`, `length`, `hash.lo`, `hash.hi` at that row - pub fn set_row<'v>( - &self, - region: &mut Region, - offset: usize, - row: &KeccakRow, - ) -> KeccakAssignedRow<'v, F> { - // Fixed selectors - for (_, column, value) in &[ - ("q_enable", self.q_enable, F::from(row.q_enable)), - ("q_first", self.q_first, F::from(offset == 0)), - ("q_round", self.q_round, F::from(row.q_round)), - ("q_round_last", self.q_round_last, F::from(row.q_round_last)), - ("q_absorb", self.q_absorb, F::from(row.q_absorb)), - ("q_input", self.q_input, F::from(row.q_input)), - ("q_input_last", self.q_input_last, F::from(row.q_input_last)), - ] { - raw_assign_fixed(region, *column, offset, *value); - } - - // Keccak data - let [is_final, hash_lo, hash_hi, bytes_left, word_value] = [ - ("is_final", self.keccak_table.is_enabled, Value::known(F::from(row.is_final))), - ("hash_lo", self.keccak_table.output.lo(), row.hash.lo()), - ("hash_hi", self.keccak_table.output.hi(), row.hash.hi()), - ("bytes_left", self.keccak_table.bytes_left, Value::known(row.bytes_left)), - ("word_value", self.keccak_table.word_value, Value::known(row.word_value)), - ] - .map(|(_name, column, value)| raw_assign_advice(region, column, offset, value)); - - // Cell values - row.cell_values.iter().zip(self.cell_manager.columns()).for_each(|(bit, column)| { - raw_assign_advice(region, column.advice, offset, Value::known(*bit)); - }); - - // Round constant - 
raw_assign_fixed(region, self.round_cst, offset, row.round_cst); - - KeccakAssignedRow { is_final, hash_lo, hash_hi, bytes_left, word_value } - } - - pub fn load_aux_tables(&self, layouter: &mut impl Layouter, k: u32) -> Result<(), Error> { - load_normalize_table(layouter, "normalize_6", &self.normalize_6, 6u64, k)?; - load_normalize_table(layouter, "normalize_4", &self.normalize_4, 4u64, k)?; - load_normalize_table(layouter, "normalize_3", &self.normalize_3, 3u64, k)?; - load_lookup_table( - layouter, - "chi base", - &self.chi_base_table, - get_num_bits_per_base_chi_lookup(k), - &CHI_BASE_LOOKUP_TABLE, - )?; - load_pack_table(layouter, &self.pack_table) - } -} - -/// Witness generation for keccak hash of little-endian `bytes`. -fn keccak( - rows: &mut Vec>, - squeeze_digests: &mut Vec<[F; NUM_WORDS_TO_SQUEEZE]>, - bytes: &[u8], - parameters: KeccakConfigParams, -) { - let k = parameters.k; - let num_rows_per_round = parameters.rows_per_round; - - let mut bits = into_bits(bytes); - let mut s = [[F::ZERO; 5]; 5]; - let absorb_positions = get_absorb_positions(); - let num_bytes_in_last_block = bytes.len() % RATE; - let two = F::from(2u64); - - // Padding - bits.push(1); - while (bits.len() + 1) % RATE_IN_BITS != 0 { - bits.push(0); - } - bits.push(1); - - // running length of absorbed input in bytes - let mut length = 0; - let chunks = bits.chunks(RATE_IN_BITS); - let num_chunks = chunks.len(); - - let mut cell_managers = Vec::with_capacity(NUM_ROUNDS + 1); - let mut regions = Vec::with_capacity(NUM_ROUNDS + 1); - // keeps track of running lengths over all rounds in an absorb step - let mut round_lengths = Vec::with_capacity(NUM_ROUNDS + 1); - let mut hash_words = [F::ZERO; NUM_WORDS_TO_SQUEEZE]; - let mut hash = Word::default(); - - for (idx, chunk) in chunks.enumerate() { - let is_final_block = idx == num_chunks - 1; - - let mut absorb_rows = Vec::new(); - // Absorb - for (idx, &(i, j)) in absorb_positions.iter().enumerate() { - let absorb = pack(&chunk[idx * 
64..(idx + 1) * 64]); - let from = s[i][j]; - s[i][j] = field_xor(s[i][j], absorb); - absorb_rows.push(AbsorbData { from, absorb, result: s[i][j] }); - } - - // better memory management to clear already allocated Vecs - cell_managers.clear(); - regions.clear(); - round_lengths.clear(); - - for round in 0..NUM_ROUNDS + 1 { - let mut cell_manager = CellManager::new(num_rows_per_round); - let mut region = KeccakRegion::new(); - - let mut absorb_row = AbsorbData::default(); - if round < NUM_WORDS_TO_ABSORB { - absorb_row = absorb_rows[round].clone(); - } - - // State data - for s in &s { - for s in s { - let cell = cell_manager.query_cell_value(); - cell.assign(&mut region, 0, *s); - } - } - - // Absorb data - let absorb_from = cell_manager.query_cell_value(); - let absorb_data = cell_manager.query_cell_value(); - let absorb_result = cell_manager.query_cell_value(); - absorb_from.assign(&mut region, 0, absorb_row.from); - absorb_data.assign(&mut region, 0, absorb_row.absorb); - absorb_result.assign(&mut region, 0, absorb_row.result); - - // Absorb - cell_manager.start_region(); - let part_size = get_num_bits_per_absorb_lookup(k); - let input = absorb_row.from + absorb_row.absorb; - let absorb_fat = - split::value(&mut cell_manager, &mut region, input, 0, part_size, false, None); - cell_manager.start_region(); - let _absorb_result = transform::value( - &mut cell_manager, - &mut region, - absorb_fat.clone(), - true, - |v| v & 1, - true, - ); - - // Padding - cell_manager.start_region(); - // Unpack a single word into bytes (for the absorption) - // Potential optimization: could do multiple bytes per lookup - let packed = - split::value(&mut cell_manager, &mut region, absorb_row.absorb, 0, 8, false, None); - cell_manager.start_region(); - let input_bytes = - transform::value(&mut cell_manager, &mut region, packed, false, |v| *v, true); - cell_manager.start_region(); - let is_paddings = - input_bytes.iter().map(|_| cell_manager.query_cell_value()).collect::>(); - 
debug_assert_eq!(is_paddings.len(), NUM_BYTES_PER_WORD); - if round < NUM_WORDS_TO_ABSORB { - for (padding_idx, is_padding) in is_paddings.iter().enumerate() { - let byte_idx = round * NUM_BYTES_PER_WORD + padding_idx; - let padding = if is_final_block && byte_idx >= num_bytes_in_last_block { - true - } else { - length += 1; - false - }; - is_padding.assign(&mut region, 0, F::from(padding)); - } - } - cell_manager.start_region(); - - if round != NUM_ROUNDS { - // Theta - let part_size = get_num_bits_per_theta_c_lookup(k); - let mut bcf = Vec::new(); - for s in &s { - let c = s[0] + s[1] + s[2] + s[3] + s[4]; - let bc_fat = - split::value(&mut cell_manager, &mut region, c, 1, part_size, false, None); - bcf.push(bc_fat); - } - cell_manager.start_region(); - let mut bc = Vec::new(); - for bc_fat in bcf { - let bc_norm = transform::value( - &mut cell_manager, - &mut region, - bc_fat.clone(), - true, - |v| v & 1, - true, - ); - bc.push(bc_norm); - } - cell_manager.start_region(); - let mut os = [[F::ZERO; 5]; 5]; - for i in 0..5 { - let t = decode::value(bc[(i + 4) % 5].clone()) - + decode::value(rotate(bc[(i + 1) % 5].clone(), 1, part_size)); - for j in 0..5 { - os[i][j] = s[i][j] + t; - } - } - s = os; - cell_manager.start_region(); - - // Rho/Pi - let part_size = get_num_bits_per_base_chi_lookup(k); - let target_word_sizes = target_part_sizes(part_size); - let num_word_parts = target_word_sizes.len(); - let mut rho_pi_chi_cells: [[[Vec>; 5]; 5]; 3] = - array_init::array_init(|_| { - array_init::array_init(|_| array_init::array_init(|_| Vec::new())) - }); - let mut column_starts = [0usize; 3]; - for p in 0..3 { - column_starts[p] = cell_manager.start_region(); - let mut row_idx = 0; - for j in 0..5 { - for _ in 0..num_word_parts { - for i in 0..5 { - rho_pi_chi_cells[p][i][j] - .push(cell_manager.query_cell_value_at_row(row_idx as i32)); - } - row_idx = (row_idx + 1) % num_rows_per_round; - } - } - } - cell_manager.start_region(); - let mut os_parts: [[Vec>; 5]; 5] = 
- array_init::array_init(|_| array_init::array_init(|_| Vec::new())); - for (j, os_part) in os_parts.iter_mut().enumerate() { - for i in 0..5 { - let s_parts = split_uniform::value( - &rho_pi_chi_cells[0][j][(2 * i + 3 * j) % 5], - &mut cell_manager, - &mut region, - s[i][j], - RHO_MATRIX[i][j], - part_size, - true, - ); - - let s_parts = transform_to::value( - &rho_pi_chi_cells[1][j][(2 * i + 3 * j) % 5], - &mut region, - s_parts.clone(), - true, - |v| v & 1, - ); - os_part[(2 * i + 3 * j) % 5] = s_parts.clone(); - } - } - cell_manager.start_region(); - - // Chi - let part_size_base = get_num_bits_per_base_chi_lookup(k); - let three_packed = pack::(&vec![3u8; part_size_base]); - let mut os = [[F::ZERO; 5]; 5]; - for j in 0..5 { - for i in 0..5 { - let mut s_parts = Vec::new(); - for ((part_a, part_b), part_c) in os_parts[i][j] - .iter() - .zip(os_parts[(i + 1) % 5][j].iter()) - .zip(os_parts[(i + 2) % 5][j].iter()) - { - let value = - three_packed - two * part_a.value + part_b.value - part_c.value; - s_parts.push(PartValue { - num_bits: part_size_base, - rot: j as i32, - value, - }); - } - os[i][j] = decode::value(transform_to::value( - &rho_pi_chi_cells[2][i][j], - &mut region, - s_parts.clone(), - true, - |v| CHI_BASE_LOOKUP_TABLE[*v as usize], - )); - } - } - s = os; - cell_manager.start_region(); - - // iota - let part_size = get_num_bits_per_absorb_lookup(k); - let input = s[0][0] + pack_u64::(ROUND_CST[round]); - let iota_parts = split::value::( - &mut cell_manager, - &mut region, - input, - 0, - part_size, - false, - None, - ); - cell_manager.start_region(); - s[0][0] = decode::value(transform::value( - &mut cell_manager, - &mut region, - iota_parts.clone(), - true, - |v| v & 1, - true, - )); - } - - // Assign the hash result - let is_final = is_final_block && round == NUM_ROUNDS; - hash = if is_final { - let hash_bytes_le = s - .into_iter() - .take(4) - .flat_map(|a| to_bytes::value(&unpack(a[0]))) - .rev() - .collect::>(); - - let word: Word> = - 
Word::from(eth_types::Word::from_little_endian(hash_bytes_le.as_slice())) - .map(Value::known); - word - } else { - Word::default().into_value() - }; - - // The words to squeeze out: this is the hash digest as words with - // NUM_BYTES_PER_WORD (=8) bytes each - for (hash_word, a) in hash_words.iter_mut().zip(s.iter()) { - *hash_word = a[0]; - } - - round_lengths.push(length); - - cell_managers.push(cell_manager); - regions.push(region); - } - - // Now that we know the state at the end of the rounds, set the squeeze data - let num_rounds = cell_managers.len(); - for (idx, word) in hash_words.iter().enumerate() { - let cell_manager = &mut cell_managers[num_rounds - 2 - idx]; - let region = &mut regions[num_rounds - 2 - idx]; - - cell_manager.start_region(); - let squeeze_packed = cell_manager.query_cell_value(); - squeeze_packed.assign(region, 0, *word); - - cell_manager.start_region(); - let packed = split::value(cell_manager, region, *word, 0, 8, false, None); - cell_manager.start_region(); - transform::value(cell_manager, region, packed, false, |v| *v, true); - } - squeeze_digests.push(hash_words); - - for round in 0..NUM_ROUNDS + 1 { - let round_cst = pack_u64(ROUND_CST[round]); - - for row_idx in 0..num_rows_per_round { - let word_value = if round < NUM_WORDS_TO_ABSORB && row_idx == 0 { - let byte_idx = (idx * NUM_WORDS_TO_ABSORB + round) * NUM_BYTES_PER_WORD; - if byte_idx >= bytes.len() { - 0 - } else { - let end = std::cmp::min(byte_idx + NUM_BYTES_PER_WORD, bytes.len()); - let mut word_bytes = bytes[byte_idx..end].to_vec().clone(); - word_bytes.resize(NUM_BYTES_PER_WORD, 0); - u64::from_le_bytes(word_bytes.try_into().unwrap()) - } - } else { - 0 - }; - let byte_idx = if round < NUM_WORDS_TO_ABSORB { - round * NUM_BYTES_PER_WORD + std::cmp::min(row_idx, NUM_BYTES_PER_WORD - 1) - } else { - NUM_WORDS_TO_ABSORB * NUM_BYTES_PER_WORD - } + idx * NUM_WORDS_TO_ABSORB * NUM_BYTES_PER_WORD; - let bytes_left = if byte_idx >= bytes.len() { 0 } else { bytes.len() - 
byte_idx }; - rows.push(KeccakRow { - q_enable: row_idx == 0, - q_round: row_idx == 0 && round < NUM_ROUNDS, - q_absorb: row_idx == 0 && round == NUM_ROUNDS, - q_round_last: row_idx == 0 && round == NUM_ROUNDS, - q_input: row_idx == 0 && round < NUM_WORDS_TO_ABSORB, - q_input_last: row_idx == 0 && round == NUM_WORDS_TO_ABSORB - 1, - round_cst, - is_final: is_final_block && round == NUM_ROUNDS && row_idx == 0, - cell_values: regions[round].rows.get(row_idx).unwrap_or(&vec![]).clone(), - hash, - bytes_left: F::from_u128(bytes_left as u128), - word_value: F::from_u128(word_value as u128), - }); - #[cfg(debug_assertions)] - { - let mut r = rows.last().unwrap().clone(); - r.cell_values.clear(); - log::trace!("offset {:?} row idx {} row {:?}", rows.len() - 1, row_idx, r); - } - } - log::trace!(" = = = = = = round {} end", round); - } - log::trace!(" ====================== chunk {} end", idx); - } - - #[cfg(debug_assertions)] - { - let hash_bytes = s - .into_iter() - .take(4) - .map(|a| { - pack_with_base::(&unpack(a[0]), 2) - .to_bytes_le() - .into_iter() - .take(8) - .collect::>() - .to_vec() - }) - .collect::>(); - debug!("hash: {:x?}", &(hash_bytes[0..4].concat())); - assert_eq!(length, bytes.len()); - } -} - -/// Witness generation for multiple keccak hashes of little-endian `bytes`. -pub fn multi_keccak( - bytes: &[Vec], - capacity: Option, - parameters: KeccakConfigParams, -) -> (Vec>, Vec<[F; NUM_WORDS_TO_SQUEEZE]>) { - let num_rows_per_round = parameters.rows_per_round; - let mut rows = - Vec::with_capacity((1 + capacity.unwrap_or(0) * (NUM_ROUNDS + 1)) * num_rows_per_round); - // Dummy first row so that the initial data is absorbed - // The initial data doesn't really matter, `is_final` just needs to be disabled. 
- rows.append(&mut KeccakRow::dummy_rows(num_rows_per_round)); - // Actual keccaks - let artifacts = bytes - .par_iter() - .map(|bytes| { - let num_keccak_f = get_num_keccak_f(bytes.len()); - let mut squeeze_digests = Vec::with_capacity(num_keccak_f); - let mut rows = Vec::with_capacity(num_keccak_f * (NUM_ROUNDS + 1) * num_rows_per_round); - keccak(&mut rows, &mut squeeze_digests, bytes, parameters); - (rows, squeeze_digests) - }) - .collect::>(); - - let mut squeeze_digests = Vec::with_capacity(capacity.unwrap_or(0)); - for (rows_part, squeezes) in artifacts { - rows.extend(rows_part); - squeeze_digests.extend(squeezes); - } - - if let Some(capacity) = capacity { - // Pad with no data hashes to the expected capacity - while rows.len() < (1 + capacity * (NUM_ROUNDS + 1)) * num_rows_per_round { - keccak(&mut rows, &mut squeeze_digests, &[], parameters); - } - // Check that we are not over capacity - if rows.len() > (1 + capacity * (NUM_ROUNDS + 1)) * num_rows_per_round { - panic!("{:?}", Error::BoundsFailure); - } - } - (rows, squeeze_digests) -} +pub mod coprocessor; +/// Module for native Keccak circuits in vanilla halo2. 
+pub mod native; diff --git a/hashes/zkevm/src/keccak/cell_manager.rs b/hashes/zkevm/src/keccak/native/cell_manager.rs similarity index 100% rename from hashes/zkevm/src/keccak/cell_manager.rs rename to hashes/zkevm/src/keccak/native/cell_manager.rs diff --git a/hashes/zkevm/src/keccak/keccak_packed_multi.rs b/hashes/zkevm/src/keccak/native/keccak_packed_multi.rs similarity index 98% rename from hashes/zkevm/src/keccak/keccak_packed_multi.rs rename to hashes/zkevm/src/keccak/native/keccak_packed_multi.rs index b8af147b..5ad66641 100644 --- a/hashes/zkevm/src/keccak/keccak_packed_multi.rs +++ b/hashes/zkevm/src/keccak/native/keccak_packed_multi.rs @@ -167,7 +167,7 @@ pub(crate) type KeccakAssignedValue<'v, F> = Halo2AssignedCell<'v, F>; /// Recombines parts back together pub(crate) mod decode { use super::{Expr, Part, PartValue, PrimeField}; - use crate::{halo2_proofs::plonk::Expression, keccak::param::*}; + use crate::{halo2_proofs::plonk::Expression, keccak::native::param::*}; pub(crate) fn expr(parts: Vec>) -> Expression { parts.iter().rev().fold(0.expr(), |acc, part| { @@ -190,7 +190,7 @@ pub(crate) mod split { }; use crate::{ halo2_proofs::plonk::{ConstraintSystem, Expression}, - keccak::util::{pack, pack_part, unpack, WordParts}, + keccak::native::util::{pack, pack_part, unpack, WordParts}, }; #[allow(clippy::too_many_arguments)] @@ -261,7 +261,7 @@ pub(crate) mod split_uniform { use super::decode; use crate::{ halo2_proofs::plonk::{ConstraintSystem, Expression}, - keccak::{ + keccak::native::{ param::*, target_part_sizes, util::{pack, pack_part, rotate, rotate_rev, unpack, WordParts}, @@ -493,9 +493,9 @@ pub(crate) mod transform { pub(crate) mod transform_to { use crate::{ halo2_proofs::plonk::{ConstraintSystem, TableColumn}, - keccak::{ + keccak::native::{ util::{pack, to_bytes, unpack}, - {Cell, Expr, Field, KeccakRegion, Part, PartValue, PrimeField}, + Cell, Expr, Field, KeccakRegion, Part, PartValue, PrimeField, }, }; diff --git 
a/hashes/zkevm/src/keccak/native/mod.rs b/hashes/zkevm/src/keccak/native/mod.rs new file mode 100644 index 00000000..79c1216c --- /dev/null +++ b/hashes/zkevm/src/keccak/native/mod.rs @@ -0,0 +1,881 @@ +use self::{cell_manager::*, keccak_packed_multi::*, param::*, table::*, util::*}; +use crate::{ + halo2_proofs::{ + circuit::{Layouter, Region, Value}, + halo2curves::ff::PrimeField, + plonk::{Column, ConstraintSystem, Error, Expression, Fixed, TableColumn, VirtualCells}, + poly::Rotation, + }, + util::{ + constraint_builder::BaseConstraintBuilder, + eth_types::{self, Field}, + expression::{and, from_bytes, not, select, sum, Expr}, + word::{self, Word, WordExpr}, + }, +}; +use halo2_base::utils::halo2::{raw_assign_advice, raw_assign_fixed}; +use itertools::Itertools; +use log::{debug, info}; +use rayon::prelude::{IntoParallelRefIterator, ParallelIterator}; +use std::marker::PhantomData; + +pub mod cell_manager; +pub mod keccak_packed_multi; +pub mod param; +pub mod table; +#[cfg(test)] +mod tests; +pub mod util; +/// Module for witness generation. +pub mod witness; + +/// Configuration parameters to define [`KeccakCircuitConfig`] +#[derive(Copy, Clone, Debug, Default)] +pub struct KeccakConfigParams { + /// The circuit degree, i.e., circuit has 2k rows + pub k: u32, + /// The number of rows to use for each round in the keccak_f permutation + pub rows_per_round: usize, +} + +/// KeccakConfig +#[derive(Clone, Debug)] +pub struct KeccakCircuitConfig { + // Bool. True on 1st row of each round. + q_enable: Column, + // Bool. True on 1st row. + q_first: Column, + // Bool. True on 1st row of all rounds except last rounds. + q_round: Column, + // Bool. True on 1st row of last rounds. + q_absorb: Column, + // Bool. True on 1st row of last rounds. + q_round_last: Column, + // Bool. True on 1st row of rounds which might contain inputs. + // Note: first NUM_WORDS_TO_ABSORB rounds of each chunk might contain inputs. 
+ // It "might" contain inputs because it's possible that a round only have paddings. + q_input: Column, + // Bool. True on 1st row of all last input round. + q_input_last: Column, + + pub keccak_table: KeccakTable, + + cell_manager: CellManager, + round_cst: Column, + normalize_3: [TableColumn; 2], + normalize_4: [TableColumn; 2], + normalize_6: [TableColumn; 2], + chi_base_table: [TableColumn; 2], + pack_table: [TableColumn; 2], + + // config parameters for convenience + pub parameters: KeccakConfigParams, + + _marker: PhantomData, +} + +impl KeccakCircuitConfig { + /// Return a new KeccakCircuitConfig + pub fn new(meta: &mut ConstraintSystem, parameters: KeccakConfigParams) -> Self { + let k = parameters.k; + let num_rows_per_round = parameters.rows_per_round; + + let q_enable = meta.fixed_column(); + let q_first = meta.fixed_column(); + let q_round = meta.fixed_column(); + let q_absorb = meta.fixed_column(); + let q_round_last = meta.fixed_column(); + let q_input = meta.fixed_column(); + let q_input_last = meta.fixed_column(); + let round_cst = meta.fixed_column(); + let keccak_table = KeccakTable::construct(meta); + + let is_final = keccak_table.is_enabled; + let hash_word = keccak_table.output; + + let normalize_3 = array_init::array_init(|_| meta.lookup_table_column()); + let normalize_4 = array_init::array_init(|_| meta.lookup_table_column()); + let normalize_6 = array_init::array_init(|_| meta.lookup_table_column()); + let chi_base_table = array_init::array_init(|_| meta.lookup_table_column()); + let pack_table = array_init::array_init(|_| meta.lookup_table_column()); + + let mut cell_manager = CellManager::new(num_rows_per_round); + let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); + let mut total_lookup_counter = 0; + + let start_new_hash = |meta: &mut VirtualCells, rot| { + // A new hash is started when the previous hash is done or on the first row + meta.query_fixed(q_first, rot) + meta.query_advice(is_final, rot) + }; + + // Round constant + let 
mut round_cst_expr = 0.expr(); + meta.create_gate("Query round cst", |meta| { + round_cst_expr = meta.query_fixed(round_cst, Rotation::cur()); + vec![0u64.expr()] + }); + // State data + let mut s = vec![vec![0u64.expr(); 5]; 5]; + let mut s_next = vec![vec![0u64.expr(); 5]; 5]; + for i in 0..5 { + for j in 0..5 { + let cell = cell_manager.query_cell(meta); + s[i][j] = cell.expr(); + s_next[i][j] = cell.at_offset(meta, num_rows_per_round as i32).expr(); + } + } + // Absorb data + let absorb_from = cell_manager.query_cell(meta); + let absorb_data = cell_manager.query_cell(meta); + let absorb_result = cell_manager.query_cell(meta); + let mut absorb_from_next = vec![0u64.expr(); NUM_WORDS_TO_ABSORB]; + let mut absorb_data_next = vec![0u64.expr(); NUM_WORDS_TO_ABSORB]; + let mut absorb_result_next = vec![0u64.expr(); NUM_WORDS_TO_ABSORB]; + for i in 0..NUM_WORDS_TO_ABSORB { + let rot = ((i + 1) * num_rows_per_round) as i32; + absorb_from_next[i] = absorb_from.at_offset(meta, rot).expr(); + absorb_data_next[i] = absorb_data.at_offset(meta, rot).expr(); + absorb_result_next[i] = absorb_result.at_offset(meta, rot).expr(); + } + + // Store the pre-state + let pre_s = s.clone(); + + // Absorb + // The absorption happening at the start of the 24 rounds is done spread out + // over those 24 rounds. In a single round (in 17 of the 24 rounds) a + // single word is absorbed so the work is spread out. The absorption is + // done simply by doing state + data and then normalizing the result to [0,1]. + // We also need to convert the input data into bytes to calculate the input data + // rlc. 
+ cell_manager.start_region(); + let mut lookup_counter = 0; + let part_size = get_num_bits_per_absorb_lookup(k); + let input = absorb_from.expr() + absorb_data.expr(); + let absorb_fat = + split::expr(meta, &mut cell_manager, &mut cb, input, 0, part_size, false, None); + cell_manager.start_region(); + let absorb_res = transform::expr( + "absorb", + meta, + &mut cell_manager, + &mut lookup_counter, + absorb_fat, + normalize_3, + true, + ); + cb.require_equal("absorb result", decode::expr(absorb_res), absorb_result.expr()); + info!("- Post absorb:"); + info!("Lookups: {}", lookup_counter); + info!("Columns: {}", cell_manager.get_width()); + total_lookup_counter += lookup_counter; + + // Squeeze + // The squeezing happening at the end of the 24 rounds is done spread out + // over those 24 rounds. In a single round (in 4 of the 24 rounds) a + // single word is converted to bytes. + cell_manager.start_region(); + let mut lookup_counter = 0; + // Potential optimization: could do multiple bytes per lookup + let packed_parts = + split::expr(meta, &mut cell_manager, &mut cb, absorb_data.expr(), 0, 8, false, None); + cell_manager.start_region(); + // input_bytes.len() = packed_parts.len() = 64 / 8 = 8 = NUM_BYTES_PER_WORD + let input_bytes = transform::expr( + "squeeze unpack", + meta, + &mut cell_manager, + &mut lookup_counter, + packed_parts, + pack_table.into_iter().rev().collect::>().try_into().unwrap(), + true, + ); + debug_assert_eq!(input_bytes.len(), NUM_BYTES_PER_WORD); + + // Padding data + cell_manager.start_region(); + let is_paddings = input_bytes.iter().map(|_| cell_manager.query_cell(meta)).collect_vec(); + info!("- Post padding:"); + info!("Lookups: {}", lookup_counter); + info!("Columns: {}", cell_manager.get_width()); + total_lookup_counter += lookup_counter; + + // Theta + // Calculate + // - `c[i] = s[i][0] + s[i][1] + s[i][2] + s[i][3] + s[i][4]` + // - `bc[i] = normalize(c)`. 
+ // - `t[i] = bc[(i + 4) % 5] + rot(bc[(i + 1)% 5], 1)` + // This is done by splitting the bc values in parts in a way + // that allows us to also calculate the rotated value "for free". + cell_manager.start_region(); + let mut lookup_counter = 0; + let part_size_c = get_num_bits_per_theta_c_lookup(k); + let mut c_parts = Vec::new(); + for s in s.iter() { + // Calculate c and split into parts + let c = s[0].clone() + s[1].clone() + s[2].clone() + s[3].clone() + s[4].clone(); + c_parts.push(split::expr( + meta, + &mut cell_manager, + &mut cb, + c, + 1, + part_size_c, + false, + None, + )); + } + // Now calculate `bc` by normalizing `c` + cell_manager.start_region(); + let mut bc = Vec::new(); + for c in c_parts { + // Normalize c + bc.push(transform::expr( + "theta c", + meta, + &mut cell_manager, + &mut lookup_counter, + c, + normalize_6, + true, + )); + } + // Now do `bc[(i + 4) % 5] + rot(bc[(i + 1) % 5], 1)` using just expressions. + // We don't normalize the result here. We do it as part of the rho/pi step, even + // though we would only have to normalize 5 values instead of 25, because of the + // way the rho/pi and chi steps can be combined it's more efficient to + // do it there (the max value for chi is 4 already so that's the + // limiting factor). + let mut os = vec![vec![0u64.expr(); 5]; 5]; + for i in 0..5 { + let t = decode::expr(bc[(i + 4) % 5].clone()) + + decode::expr(rotate(bc[(i + 1) % 5].clone(), 1, part_size_c)); + for j in 0..5 { + os[i][j] = s[i][j].clone() + t.clone(); + } + } + s = os.clone(); + info!("- Post theta:"); + info!("Lookups: {}", lookup_counter); + info!("Columns: {}", cell_manager.get_width()); + total_lookup_counter += lookup_counter; + + // Rho/Pi + // For the rotation of rho/pi we split up the words like expected, but in a way + // that allows reusing the same parts in an optimal way for the chi step. 
+ // We can save quite a few columns by not recombining the parts after rho/pi and + // re-splitting the words again before chi. Instead we do chi directly + // on the output parts of rho/pi. For rho/pi specifically we do + // `s[j][(2 * i + 3 * j) % 5] = normalize(rot(s[i][j], RHOM[i][j]))`. + cell_manager.start_region(); + let mut lookup_counter = 0; + let part_size = get_num_bits_per_base_chi_lookup(k); + // To combine the rho/pi/chi steps we have to ensure a specific layout so + // query those cells here first. + // For chi we have to do `s[i][j] ^ ((~s[(i+1)%5][j]) & s[(i+2)%5][j])`. `j` + // remains static but `i` is accessed in a wrap around manner. To do this using + // multiple rows with lookups in a way that doesn't require any + // extra additional cells or selectors we have to put all `s[i]`'s on the same + // row. This isn't that strong of a requirement actually because the + // words are split into multiple parts, and so only the parts at the same + // position of those words need to be on the same row. + let target_word_sizes = target_part_sizes(part_size); + let num_word_parts = target_word_sizes.len(); + let mut rho_pi_chi_cells: [[[Vec>; 5]; 5]; 3] = array_init::array_init(|_| { + array_init::array_init(|_| array_init::array_init(|_| Vec::new())) + }); + let mut num_columns = 0; + let mut column_starts = [0usize; 3]; + for p in 0..3 { + column_starts[p] = cell_manager.start_region(); + let mut row_idx = 0; + num_columns = 0; + for j in 0..5 { + for _ in 0..num_word_parts { + for i in 0..5 { + rho_pi_chi_cells[p][i][j] + .push(cell_manager.query_cell_at_row(meta, row_idx)); + } + if row_idx == 0 { + num_columns += 1; + } + row_idx = (((row_idx as usize) + 1) % num_rows_per_round) as i32; + } + } + } + // Do the transformation, resulting in the word parts also being normalized.
+ let pi_region_start = cell_manager.start_region(); + let mut os_parts = vec![vec![Vec::new(); 5]; 5]; + for (j, os_part) in os_parts.iter_mut().enumerate() { + for i in 0..5 { + // Split s into parts + let s_parts = split_uniform::expr( + meta, + &rho_pi_chi_cells[0][j][(2 * i + 3 * j) % 5], + &mut cell_manager, + &mut cb, + s[i][j].clone(), + RHO_MATRIX[i][j], + part_size, + true, + ); + // Normalize the data to the target cells + let s_parts = transform_to::expr( + "rho/pi", + meta, + &rho_pi_chi_cells[1][j][(2 * i + 3 * j) % 5], + &mut lookup_counter, + s_parts.clone(), + normalize_4, + true, + ); + os_part[(2 * i + 3 * j) % 5] = s_parts.clone(); + } + } + let pi_region_end = cell_manager.start_region(); + // Pi parts range checks + // To make the uniform stuff work we had to combine some parts together + // in new cells (see split_uniform). Here we make sure those parts are range + // checked. Potential improvement: Could combine multiple smaller parts + // in a single lookup but doesn't save that much. + for c in pi_region_start..pi_region_end { + meta.lookup("pi part range check", |_| { + vec![(cell_manager.columns()[c].expr.clone(), normalize_4[0])] + }); + lookup_counter += 1; + } + info!("- Post rho/pi:"); + info!("Lookups: {}", lookup_counter); + info!("Columns: {}", cell_manager.get_width()); + total_lookup_counter += lookup_counter; + + // Chi + // In groups of 5 columns, we have to do `s[i][j] ^ ((~s[(i+1)%5][j]) & + // s[(i+2)%5][j])` five times, on each row (no selector needed). + // This is calculated by making use of `CHI_BASE_LOOKUP_TABLE`. 
+ let mut lookup_counter = 0;
+ let part_size_base = get_num_bits_per_base_chi_lookup(k);
+ for idx in 0..num_columns {
+ // First fetch the cells we want to use
+ let mut input: [Expression; 5] = array_init::array_init(|_| 0.expr());
+ let mut output: [Expression; 5] = array_init::array_init(|_| 0.expr());
+ for c in 0..5 {
+ input[c] = cell_manager.columns()[column_starts[1] + idx * 5 + c].expr.clone();
+ output[c] = cell_manager.columns()[column_starts[2] + idx * 5 + c].expr.clone();
+ }
+ // Now calculate `a ^ ((~b) & c)` by doing `lookup[3 - 2*a + b - c]`
+ for i in 0..5 {
+ let input = scatter::expr(3, part_size_base) - 2.expr() * input[i].clone()
+ + input[(i + 1) % 5].clone()
+ - input[(i + 2) % 5].clone();
+ let output = output[i].clone();
+ meta.lookup("chi base", |_| {
+ vec![(input.clone(), chi_base_table[0]), (output.clone(), chi_base_table[1])]
+ });
+ lookup_counter += 1;
+ }
+ }
+ // Now just decode the parts after the chi transformation done with the lookups
+ // above.
+ let mut os = vec![vec![0u64.expr(); 5]; 5];
+ for (i, os) in os.iter_mut().enumerate() {
+ for (j, os) in os.iter_mut().enumerate() {
+ let mut parts = Vec::new();
+ for idx in 0..num_word_parts {
+ parts.push(Part {
+ num_bits: part_size_base,
+ cell: rho_pi_chi_cells[2][i][j][idx].clone(),
+ expr: rho_pi_chi_cells[2][i][j][idx].expr(),
+ });
+ }
+ *os = decode::expr(parts);
+ }
+ }
+ s = os.clone();
+
+ // iota
+ // Simply do the single xor on state [0][0].
+ cell_manager.start_region();
+ let part_size = get_num_bits_per_absorb_lookup(k);
+ let input = s[0][0].clone() + round_cst_expr.clone();
+ let iota_parts =
+ split::expr(meta, &mut cell_manager, &mut cb, input, 0, part_size, false, None);
+ cell_manager.start_region();
+ // Could share columns with absorb which may end up using 1 lookup/column
+ // fewer...
+ s[0][0] = decode::expr(transform::expr( + "iota", + meta, + &mut cell_manager, + &mut lookup_counter, + iota_parts, + normalize_3, + true, + )); + // Final results stored in the next row + for i in 0..5 { + for j in 0..5 { + cb.require_equal("next row check", s[i][j].clone(), s_next[i][j].clone()); + } + } + info!("- Post chi:"); + info!("Lookups: {}", lookup_counter); + info!("Columns: {}", cell_manager.get_width()); + total_lookup_counter += lookup_counter; + + let mut lookup_counter = 0; + cell_manager.start_region(); + + // Squeeze data + let squeeze_from = cell_manager.query_cell(meta); + let mut squeeze_from_prev = vec![0u64.expr(); NUM_WORDS_TO_SQUEEZE]; + for (idx, squeeze_from_prev) in squeeze_from_prev.iter_mut().enumerate() { + let rot = (-(idx as i32) - 1) * num_rows_per_round as i32; + *squeeze_from_prev = squeeze_from.at_offset(meta, rot).expr(); + } + // Squeeze + // The squeeze happening at the end of the 24 rounds is done spread out + // over those 24 rounds. In a single round (in 4 of the 24 rounds) a + // single word is converted to bytes. 
+ // Potential optimization: could do multiple bytes per lookup + cell_manager.start_region(); + // Unpack a single word into bytes (for the squeeze) + // Potential optimization: could do multiple bytes per lookup + let squeeze_from_parts = + split::expr(meta, &mut cell_manager, &mut cb, squeeze_from.expr(), 0, 8, false, None); + cell_manager.start_region(); + let squeeze_bytes = transform::expr( + "squeeze unpack", + meta, + &mut cell_manager, + &mut lookup_counter, + squeeze_from_parts, + pack_table.into_iter().rev().collect::>().try_into().unwrap(), + true, + ); + info!("- Post squeeze:"); + info!("Lookups: {}", lookup_counter); + info!("Columns: {}", cell_manager.get_width()); + total_lookup_counter += lookup_counter; + + // The round constraints that we've been building up till now + meta.create_gate("round", |meta| cb.gate(meta.query_fixed(q_round, Rotation::cur()))); + + // Absorb + meta.create_gate("absorb", |meta| { + let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); + let continue_hash = not::expr(start_new_hash(meta, Rotation::cur())); + let absorb_positions = get_absorb_positions(); + let mut a_slice = 0; + for j in 0..5 { + for i in 0..5 { + if absorb_positions.contains(&(i, j)) { + cb.condition(continue_hash.clone(), |cb| { + cb.require_equal( + "absorb verify input", + absorb_from_next[a_slice].clone(), + pre_s[i][j].clone(), + ); + }); + cb.require_equal( + "absorb result copy", + select::expr( + continue_hash.clone(), + absorb_result_next[a_slice].clone(), + absorb_data_next[a_slice].clone(), + ), + s_next[i][j].clone(), + ); + a_slice += 1; + } else { + cb.require_equal( + "absorb state copy", + pre_s[i][j].clone() * continue_hash.clone(), + s_next[i][j].clone(), + ); + } + } + } + cb.gate(meta.query_fixed(q_absorb, Rotation::cur())) + }); + + // Collect the bytes that are spread out over previous rows + let mut hash_bytes = Vec::new(); + for i in 0..NUM_WORDS_TO_SQUEEZE { + for byte in squeeze_bytes.iter() { + let rot = (-(i as i32) - 1) * 
num_rows_per_round as i32; + hash_bytes.push(byte.cell.at_offset(meta, rot).expr()); + } + } + + // Squeeze + meta.create_gate("squeeze", |meta| { + let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); + let start_new_hash = start_new_hash(meta, Rotation::cur()); + // The words to squeeze + let hash_words: Vec<_> = + pre_s.into_iter().take(4).map(|a| a[0].clone()).take(4).collect(); + // Verify if we converted the correct words to bytes on previous rows + for (idx, word) in hash_words.iter().enumerate() { + cb.condition(start_new_hash.clone(), |cb| { + cb.require_equal( + "squeeze verify packed", + word.clone(), + squeeze_from_prev[idx].clone(), + ); + }); + } + + let hash_bytes_le = hash_bytes.into_iter().rev().collect::>(); + cb.condition(start_new_hash, |cb| { + cb.require_equal_word( + "output check", + word::Word32::new(hash_bytes_le.try_into().expect("32 limbs")).to_word(), + hash_word.map(|col| meta.query_advice(col, Rotation::cur())), + ); + }); + cb.gate(meta.query_fixed(q_round_last, Rotation::cur())) + }); + + // Some general input checks + meta.create_gate("input checks", |meta| { + let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); + cb.require_boolean("boolean is_final", meta.query_advice(is_final, Rotation::cur())); + cb.gate(meta.query_fixed(q_enable, Rotation::cur())) + }); + + // Enforce fixed values on the first row + meta.create_gate("first row", |meta| { + let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); + cb.require_zero( + "is_final needs to be disabled on the first row", + meta.query_advice(is_final, Rotation::cur()), + ); + cb.gate(meta.query_fixed(q_first, Rotation::cur())) + }); + + // some utility query functions + let q = |col: Column, meta: &mut VirtualCells<'_, F>| { + meta.query_fixed(col, Rotation::cur()) + }; + /* + eg: + data: + get_num_rows_per_round: 18 + input: "12345678abc" + table: + Note[1]: be careful: is_paddings is not column here! It is [Cell; 8] and it will be constrained later. 
+ Note[2]: only first row of each round has constraints on bytes_left. This example just shows how witnesses are filled. + offset word_value bytes_left is_paddings q_enable q_input_last + 18 0x87654321 11 0 1 0 // 1st round begin + 19 0 10 0 0 0 + 20 0 9 0 0 0 + 21 0 8 0 0 0 + 22 0 7 0 0 0 + 23 0 6 0 0 0 + 24 0 5 0 0 0 + 25 0 4 0 0 0 + 26 0 4 NA 0 0 + ... + 35 0 4 NA 0 0 // 1st round end + 36 0xcba 3 0 1 1 // 2nd round begin + 37 0 2 0 0 0 + 38 0 1 0 0 0 + 39 0 0 1 0 0 + 40 0 0 1 0 0 + 41 0 0 1 0 0 + 42 0 0 1 0 0 + 43 0 0 1 0 0 + */ + + meta.create_gate("word_value", |meta| { + let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); + let masked_input_bytes = input_bytes + .iter() + .zip(is_paddings.clone()) + .map(|(input_byte, is_padding)| { + input_byte.expr.clone() * not::expr(is_padding.expr().clone()) + }) + .collect_vec(); + let input_word = from_bytes::expr(&masked_input_bytes); + cb.require_equal( + "word value", + input_word, + meta.query_advice(keccak_table.word_value, Rotation::cur()), + ); + cb.gate(q(q_input, meta)) + }); + meta.create_gate("bytes_left", |meta| { + let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); + let bytes_left_expr = meta.query_advice(keccak_table.bytes_left, Rotation::cur()); + + // bytes_left is 0 in the absolute first `rows_per_round` of the entire circuit, i.e., the first dummy round. + cb.condition(q(q_first, meta), |cb| { + cb.require_zero( + "bytes_left needs to be zero on the absolute first dummy round", + meta.query_advice(keccak_table.bytes_left, Rotation::cur()), + ); + }); + // is_final ==> bytes_left == 0. + // Note: is_final = true only in the last round, which doesn't have any data to absorb. 
+ cb.condition(meta.query_advice(is_final, Rotation::cur()), |cb| { + cb.require_zero("bytes_left should be 0 when is_final", bytes_left_expr.clone()); + }); + //q_input[cur] ==> bytes_left[cur + num_rows_per_round] + word_len == bytes_left[cur] + cb.condition(q(q_input, meta), |cb| { + // word_len = NUM_BYTES_PER_WORD - sum(is_paddings) + let word_len = NUM_BYTES_PER_WORD.expr() - sum::expr(is_paddings.clone()); + let bytes_left_next_expr = + meta.query_advice(keccak_table.bytes_left, Rotation(num_rows_per_round as i32)); + cb.require_equal( + "if there is a word in this round, bytes_left[curr + num_rows_per_round] + word_len == bytes_left[curr]", + bytes_left_expr.clone(), + bytes_left_next_expr + word_len, + ); + }); + // !q_input[cur] && !start_new_hash(cur) ==> bytes_left[cur + num_rows_per_round] == bytes_left[cur] + // !q_input[cur] && !start_new_hash(cur) === !(q_input[cur] || start_new_hash(cur)) + // Because q_input[cur] and start_new_hash(cur) are never both true at the same time, we use + instead of or in order to save a degree. 
+ cb.condition(not::expr(q(q_input, meta) + start_new_hash(meta, Rotation::cur())), |cb| { + let bytes_left_next_expr = + meta.query_advice(keccak_table.bytes_left, Rotation(num_rows_per_round as i32)); + cb.require_equal( + "if no input and not starting new hash, bytes_left should keep the same", + bytes_left_expr, + bytes_left_next_expr, + ); + }); + + cb.gate(q(q_enable, meta)) + }); + + // Enforce logic for when this block is the last block for a hash + let last_is_padding_in_block = is_paddings.last().unwrap().at_offset( + meta, + -(((NUM_ROUNDS + 1 - NUM_WORDS_TO_ABSORB) * num_rows_per_round) as i32), + ); + meta.create_gate("is final", |meta| { + let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); + // All absorb rows except the first row + cb.condition( + meta.query_fixed(q_absorb, Rotation::cur()) + - meta.query_fixed(q_first, Rotation::cur()), + |cb| { + cb.require_equal( + "is_final needs to be the same as the last is_padding in the block", + meta.query_advice(is_final, Rotation::cur()), + last_is_padding_in_block.expr(), + ); + }, + ); + // For all the rows of a round, only the first row can have `is_final == 1`. + cb.condition( + (1..num_rows_per_round as i32) + .map(|i| meta.query_fixed(q_enable, Rotation(-i))) + .fold(0.expr(), |acc, elem| acc + elem), + |cb| { + cb.require_zero( + "is_final only when q_enable", + meta.query_advice(is_final, Rotation::cur()), + ); + }, + ); + cb.gate(1.expr()) + }); + + // Padding + // May be cleaner to do this padding logic in the byte conversion lookup but + // currently easier to do it like this. 
+ let prev_is_padding = + is_paddings.last().unwrap().at_offset(meta, -(num_rows_per_round as i32)); + meta.create_gate("padding", |meta| { + let mut cb = BaseConstraintBuilder::new(MAX_DEGREE); + let q_input = meta.query_fixed(q_input, Rotation::cur()); + let q_input_last = meta.query_fixed(q_input_last, Rotation::cur()); + + // All padding selectors need to be boolean + for is_padding in is_paddings.iter() { + cb.condition(meta.query_fixed(q_enable, Rotation::cur()), |cb| { + cb.require_boolean("is_padding boolean", is_padding.expr()); + }); + } + // This last padding selector will be used on the first round row so needs to be + // zero + cb.condition(meta.query_fixed(q_absorb, Rotation::cur()), |cb| { + cb.require_zero( + "last is_padding should be zero on absorb rows", + is_paddings.last().unwrap().expr(), + ); + }); + // Now for each padding selector + for idx in 0..is_paddings.len() { + // Previous padding selector can be on the previous row + let is_padding_prev = + if idx == 0 { prev_is_padding.expr() } else { is_paddings[idx - 1].expr() }; + let is_first_padding = is_paddings[idx].expr() - is_padding_prev.clone(); + + // Check padding transition 0 -> 1 done only once + cb.condition(q_input.expr(), |cb| { + cb.require_boolean("padding step boolean", is_first_padding.clone()); + }); + + // Padding start/intermediate/end byte checks + if idx == is_paddings.len() - 1 { + // These can be combined in the future, but currently this would increase the + // degree by one Padding start/intermediate byte, all + // padding rows except the last one + cb.condition( + and::expr([q_input.expr() - q_input_last.expr(), is_paddings[idx].expr()]), + |cb| { + // Input bytes need to be zero, or one if this is the first padding byte + cb.require_equal( + "padding start/intermediate byte last byte", + input_bytes[idx].expr.clone(), + is_first_padding.expr(), + ); + }, + ); + // Padding start/end byte, only on the last padding row + cb.condition(and::expr([q_input_last.expr(), 
is_paddings[idx].expr()]), |cb| { + // The input byte needs to be 128, unless it's also the first padding + // byte then it's 129 + cb.require_equal( + "padding start/end byte", + input_bytes[idx].expr.clone(), + is_first_padding.expr() + 128.expr(), + ); + }); + } else { + // Padding start/intermediate byte + cb.condition(and::expr([q_input.expr(), is_paddings[idx].expr()]), |cb| { + // Input bytes need to be zero, or one if this is the first padding byte + cb.require_equal( + "padding start/intermediate byte", + input_bytes[idx].expr.clone(), + is_first_padding.expr(), + ); + }); + } + } + cb.gate(1.expr()) + }); + + info!("Degree: {}", meta.degree()); + info!("Minimum rows: {}", meta.minimum_rows()); + info!("Total Lookups: {}", total_lookup_counter); + #[cfg(feature = "display")] + { + println!("Total Keccak Columns: {}", cell_manager.get_width()); + std::env::set_var("KECCAK_ADVICE_COLUMNS", cell_manager.get_width().to_string()); + } + #[cfg(not(feature = "display"))] + info!("Total Keccak Columns: {}", cell_manager.get_width()); + info!("num unused cells: {}", cell_manager.get_num_unused_cells()); + info!("part_size absorb: {}", get_num_bits_per_absorb_lookup(k)); + info!("part_size theta: {}", get_num_bits_per_theta_c_lookup(k)); + info!("part_size theta c: {}", get_num_bits_per_lookup(THETA_C_LOOKUP_RANGE, k)); + info!("part_size theta t: {}", get_num_bits_per_lookup(4, k)); + info!("part_size rho/pi: {}", get_num_bits_per_rho_pi_lookup(k)); + info!("part_size chi base: {}", get_num_bits_per_base_chi_lookup(k)); + info!("uniform part sizes: {:?}", target_part_sizes(get_num_bits_per_theta_c_lookup(k))); + + KeccakCircuitConfig { + q_enable, + q_first, + q_round, + q_absorb, + q_round_last, + q_input, + q_input_last, + keccak_table, + cell_manager, + round_cst, + normalize_3, + normalize_4, + normalize_6, + chi_base_table, + pack_table, + parameters, + _marker: PhantomData, + } + } +} + +#[derive(Clone)] +pub struct KeccakAssignedRow<'v, F: Field> { + pub 
is_final: KeccakAssignedValue<'v, F>, + pub hash_lo: KeccakAssignedValue<'v, F>, + pub hash_hi: KeccakAssignedValue<'v, F>, + pub bytes_left: KeccakAssignedValue<'v, F>, + pub word_value: KeccakAssignedValue<'v, F>, +} + +impl KeccakCircuitConfig { + /// Returns vector of `is_final`, `length`, `hash.lo`, `hash.hi` for assigned rows + pub fn assign<'v>( + &self, + region: &mut Region, + witness: &[KeccakRow], + ) -> Vec> { + witness + .iter() + .enumerate() + .map(|(offset, keccak_row)| self.set_row(region, offset, keccak_row)) + .collect() + } + + /// Output is `is_final`, `length`, `hash.lo`, `hash.hi` at that row + pub fn set_row<'v>( + &self, + region: &mut Region, + offset: usize, + row: &KeccakRow, + ) -> KeccakAssignedRow<'v, F> { + // Fixed selectors + for (_, column, value) in &[ + ("q_enable", self.q_enable, F::from(row.q_enable)), + ("q_first", self.q_first, F::from(offset == 0)), + ("q_round", self.q_round, F::from(row.q_round)), + ("q_round_last", self.q_round_last, F::from(row.q_round_last)), + ("q_absorb", self.q_absorb, F::from(row.q_absorb)), + ("q_input", self.q_input, F::from(row.q_input)), + ("q_input_last", self.q_input_last, F::from(row.q_input_last)), + ] { + raw_assign_fixed(region, *column, offset, *value); + } + + // Keccak data + let [is_final, hash_lo, hash_hi, bytes_left, word_value] = [ + ("is_final", self.keccak_table.is_enabled, Value::known(F::from(row.is_final))), + ("hash_lo", self.keccak_table.output.lo(), row.hash.lo()), + ("hash_hi", self.keccak_table.output.hi(), row.hash.hi()), + ("bytes_left", self.keccak_table.bytes_left, Value::known(row.bytes_left)), + ("word_value", self.keccak_table.word_value, Value::known(row.word_value)), + ] + .map(|(_name, column, value)| raw_assign_advice(region, column, offset, value)); + + // Cell values + row.cell_values.iter().zip(self.cell_manager.columns()).for_each(|(bit, column)| { + raw_assign_advice(region, column.advice, offset, Value::known(*bit)); + }); + + // Round constant + 
raw_assign_fixed(region, self.round_cst, offset, row.round_cst); + + KeccakAssignedRow { is_final, hash_lo, hash_hi, bytes_left, word_value } + } + + pub fn load_aux_tables(&self, layouter: &mut impl Layouter, k: u32) -> Result<(), Error> { + load_normalize_table(layouter, "normalize_6", &self.normalize_6, 6u64, k)?; + load_normalize_table(layouter, "normalize_4", &self.normalize_4, 4u64, k)?; + load_normalize_table(layouter, "normalize_3", &self.normalize_3, 3u64, k)?; + load_lookup_table( + layouter, + "chi base", + &self.chi_base_table, + get_num_bits_per_base_chi_lookup(k), + &CHI_BASE_LOOKUP_TABLE, + )?; + load_pack_table(layouter, &self.pack_table) + } +} diff --git a/hashes/zkevm/src/keccak/param.rs b/hashes/zkevm/src/keccak/native/param.rs similarity index 100% rename from hashes/zkevm/src/keccak/param.rs rename to hashes/zkevm/src/keccak/native/param.rs diff --git a/hashes/zkevm/src/keccak/table.rs b/hashes/zkevm/src/keccak/native/table.rs similarity index 100% rename from hashes/zkevm/src/keccak/table.rs rename to hashes/zkevm/src/keccak/native/table.rs diff --git a/hashes/zkevm/src/keccak/tests.rs b/hashes/zkevm/src/keccak/native/tests.rs similarity index 99% rename from hashes/zkevm/src/keccak/tests.rs rename to hashes/zkevm/src/keccak/native/tests.rs index 6dcf3947..2c3143ac 100644 --- a/hashes/zkevm/src/keccak/tests.rs +++ b/hashes/zkevm/src/keccak/native/tests.rs @@ -1,4 +1,4 @@ -use super::*; +use super::{witness::*, *}; use crate::halo2_proofs::{ circuit::SimpleFloorPlanner, dev::MockProver, diff --git a/hashes/zkevm/src/keccak/util.rs b/hashes/zkevm/src/keccak/native/util.rs similarity index 100% rename from hashes/zkevm/src/keccak/util.rs rename to hashes/zkevm/src/keccak/native/util.rs diff --git a/hashes/zkevm/src/keccak/native/witness.rs b/hashes/zkevm/src/keccak/native/witness.rs new file mode 100644 index 00000000..a29723c4 --- /dev/null +++ b/hashes/zkevm/src/keccak/native/witness.rs @@ -0,0 +1,417 @@ +use super::*; + +/// Witness 
generation for multiple keccak hashes of little-endian `bytes`. +pub fn multi_keccak( + bytes: &[Vec], + capacity: Option, + parameters: KeccakConfigParams, +) -> (Vec>, Vec<[F; NUM_WORDS_TO_SQUEEZE]>) { + let num_rows_per_round = parameters.rows_per_round; + let mut rows = + Vec::with_capacity((1 + capacity.unwrap_or(0) * (NUM_ROUNDS + 1)) * num_rows_per_round); + // Dummy first row so that the initial data is absorbed + // The initial data doesn't really matter, `is_final` just needs to be disabled. + rows.append(&mut KeccakRow::dummy_rows(num_rows_per_round)); + // Actual keccaks + let artifacts = bytes + .par_iter() + .map(|bytes| { + let num_keccak_f = get_num_keccak_f(bytes.len()); + let mut squeeze_digests = Vec::with_capacity(num_keccak_f); + let mut rows = Vec::with_capacity(num_keccak_f * (NUM_ROUNDS + 1) * num_rows_per_round); + keccak(&mut rows, &mut squeeze_digests, bytes, parameters); + (rows, squeeze_digests) + }) + .collect::>(); + + let mut squeeze_digests = Vec::with_capacity(capacity.unwrap_or(0)); + for (rows_part, squeezes) in artifacts { + rows.extend(rows_part); + squeeze_digests.extend(squeezes); + } + + if let Some(capacity) = capacity { + // Pad with no data hashes to the expected capacity + while rows.len() < (1 + capacity * (NUM_ROUNDS + 1)) * num_rows_per_round { + keccak(&mut rows, &mut squeeze_digests, &[], parameters); + } + // Check that we are not over capacity + if rows.len() > (1 + capacity * (NUM_ROUNDS + 1)) * num_rows_per_round { + panic!("{:?}", Error::BoundsFailure); + } + } + (rows, squeeze_digests) +} +/// Witness generation for keccak hash of little-endian `bytes`. 
+fn keccak( + rows: &mut Vec>, + squeeze_digests: &mut Vec<[F; NUM_WORDS_TO_SQUEEZE]>, + bytes: &[u8], + parameters: KeccakConfigParams, +) { + let k = parameters.k; + let num_rows_per_round = parameters.rows_per_round; + + let mut bits = into_bits(bytes); + let mut s = [[F::ZERO; 5]; 5]; + let absorb_positions = get_absorb_positions(); + let num_bytes_in_last_block = bytes.len() % RATE; + let two = F::from(2u64); + + // Padding + bits.push(1); + while (bits.len() + 1) % RATE_IN_BITS != 0 { + bits.push(0); + } + bits.push(1); + + // running length of absorbed input in bytes + let mut length = 0; + let chunks = bits.chunks(RATE_IN_BITS); + let num_chunks = chunks.len(); + + let mut cell_managers = Vec::with_capacity(NUM_ROUNDS + 1); + let mut regions = Vec::with_capacity(NUM_ROUNDS + 1); + // keeps track of running lengths over all rounds in an absorb step + let mut round_lengths = Vec::with_capacity(NUM_ROUNDS + 1); + let mut hash_words = [F::ZERO; NUM_WORDS_TO_SQUEEZE]; + let mut hash = Word::default(); + + for (idx, chunk) in chunks.enumerate() { + let is_final_block = idx == num_chunks - 1; + + let mut absorb_rows = Vec::new(); + // Absorb + for (idx, &(i, j)) in absorb_positions.iter().enumerate() { + let absorb = pack(&chunk[idx * 64..(idx + 1) * 64]); + let from = s[i][j]; + s[i][j] = field_xor(s[i][j], absorb); + absorb_rows.push(AbsorbData { from, absorb, result: s[i][j] }); + } + + // better memory management to clear already allocated Vecs + cell_managers.clear(); + regions.clear(); + round_lengths.clear(); + + for round in 0..NUM_ROUNDS + 1 { + let mut cell_manager = CellManager::new(num_rows_per_round); + let mut region = KeccakRegion::new(); + + let mut absorb_row = AbsorbData::default(); + if round < NUM_WORDS_TO_ABSORB { + absorb_row = absorb_rows[round].clone(); + } + + // State data + for s in &s { + for s in s { + let cell = cell_manager.query_cell_value(); + cell.assign(&mut region, 0, *s); + } + } + + // Absorb data + let absorb_from = 
cell_manager.query_cell_value(); + let absorb_data = cell_manager.query_cell_value(); + let absorb_result = cell_manager.query_cell_value(); + absorb_from.assign(&mut region, 0, absorb_row.from); + absorb_data.assign(&mut region, 0, absorb_row.absorb); + absorb_result.assign(&mut region, 0, absorb_row.result); + + // Absorb + cell_manager.start_region(); + let part_size = get_num_bits_per_absorb_lookup(k); + let input = absorb_row.from + absorb_row.absorb; + let absorb_fat = + split::value(&mut cell_manager, &mut region, input, 0, part_size, false, None); + cell_manager.start_region(); + let _absorb_result = transform::value( + &mut cell_manager, + &mut region, + absorb_fat.clone(), + true, + |v| v & 1, + true, + ); + + // Padding + cell_manager.start_region(); + // Unpack a single word into bytes (for the absorption) + // Potential optimization: could do multiple bytes per lookup + let packed = + split::value(&mut cell_manager, &mut region, absorb_row.absorb, 0, 8, false, None); + cell_manager.start_region(); + let input_bytes = + transform::value(&mut cell_manager, &mut region, packed, false, |v| *v, true); + cell_manager.start_region(); + let is_paddings = + input_bytes.iter().map(|_| cell_manager.query_cell_value()).collect::>(); + debug_assert_eq!(is_paddings.len(), NUM_BYTES_PER_WORD); + if round < NUM_WORDS_TO_ABSORB { + for (padding_idx, is_padding) in is_paddings.iter().enumerate() { + let byte_idx = round * NUM_BYTES_PER_WORD + padding_idx; + let padding = if is_final_block && byte_idx >= num_bytes_in_last_block { + true + } else { + length += 1; + false + }; + is_padding.assign(&mut region, 0, F::from(padding)); + } + } + cell_manager.start_region(); + + if round != NUM_ROUNDS { + // Theta + let part_size = get_num_bits_per_theta_c_lookup(k); + let mut bcf = Vec::new(); + for s in &s { + let c = s[0] + s[1] + s[2] + s[3] + s[4]; + let bc_fat = + split::value(&mut cell_manager, &mut region, c, 1, part_size, false, None); + bcf.push(bc_fat); + } + 
cell_manager.start_region(); + let mut bc = Vec::new(); + for bc_fat in bcf { + let bc_norm = transform::value( + &mut cell_manager, + &mut region, + bc_fat.clone(), + true, + |v| v & 1, + true, + ); + bc.push(bc_norm); + } + cell_manager.start_region(); + let mut os = [[F::ZERO; 5]; 5]; + for i in 0..5 { + let t = decode::value(bc[(i + 4) % 5].clone()) + + decode::value(rotate(bc[(i + 1) % 5].clone(), 1, part_size)); + for j in 0..5 { + os[i][j] = s[i][j] + t; + } + } + s = os; + cell_manager.start_region(); + + // Rho/Pi + let part_size = get_num_bits_per_base_chi_lookup(k); + let target_word_sizes = target_part_sizes(part_size); + let num_word_parts = target_word_sizes.len(); + let mut rho_pi_chi_cells: [[[Vec>; 5]; 5]; 3] = + array_init::array_init(|_| { + array_init::array_init(|_| array_init::array_init(|_| Vec::new())) + }); + let mut column_starts = [0usize; 3]; + for p in 0..3 { + column_starts[p] = cell_manager.start_region(); + let mut row_idx = 0; + for j in 0..5 { + for _ in 0..num_word_parts { + for i in 0..5 { + rho_pi_chi_cells[p][i][j] + .push(cell_manager.query_cell_value_at_row(row_idx as i32)); + } + row_idx = (row_idx + 1) % num_rows_per_round; + } + } + } + cell_manager.start_region(); + let mut os_parts: [[Vec>; 5]; 5] = + array_init::array_init(|_| array_init::array_init(|_| Vec::new())); + for (j, os_part) in os_parts.iter_mut().enumerate() { + for i in 0..5 { + let s_parts = split_uniform::value( + &rho_pi_chi_cells[0][j][(2 * i + 3 * j) % 5], + &mut cell_manager, + &mut region, + s[i][j], + RHO_MATRIX[i][j], + part_size, + true, + ); + + let s_parts = transform_to::value( + &rho_pi_chi_cells[1][j][(2 * i + 3 * j) % 5], + &mut region, + s_parts.clone(), + true, + |v| v & 1, + ); + os_part[(2 * i + 3 * j) % 5] = s_parts.clone(); + } + } + cell_manager.start_region(); + + // Chi + let part_size_base = get_num_bits_per_base_chi_lookup(k); + let three_packed = pack::(&vec![3u8; part_size_base]); + let mut os = [[F::ZERO; 5]; 5]; + for j in 
0..5 { + for i in 0..5 { + let mut s_parts = Vec::new(); + for ((part_a, part_b), part_c) in os_parts[i][j] + .iter() + .zip(os_parts[(i + 1) % 5][j].iter()) + .zip(os_parts[(i + 2) % 5][j].iter()) + { + let value = + three_packed - two * part_a.value + part_b.value - part_c.value; + s_parts.push(PartValue { + num_bits: part_size_base, + rot: j as i32, + value, + }); + } + os[i][j] = decode::value(transform_to::value( + &rho_pi_chi_cells[2][i][j], + &mut region, + s_parts.clone(), + true, + |v| CHI_BASE_LOOKUP_TABLE[*v as usize], + )); + } + } + s = os; + cell_manager.start_region(); + + // iota + let part_size = get_num_bits_per_absorb_lookup(k); + let input = s[0][0] + pack_u64::(ROUND_CST[round]); + let iota_parts = split::value::( + &mut cell_manager, + &mut region, + input, + 0, + part_size, + false, + None, + ); + cell_manager.start_region(); + s[0][0] = decode::value(transform::value( + &mut cell_manager, + &mut region, + iota_parts.clone(), + true, + |v| v & 1, + true, + )); + } + + // Assign the hash result + let is_final = is_final_block && round == NUM_ROUNDS; + hash = if is_final { + let hash_bytes_le = s + .into_iter() + .take(4) + .flat_map(|a| to_bytes::value(&unpack(a[0]))) + .rev() + .collect::>(); + + let word: Word> = + Word::from(eth_types::Word::from_little_endian(hash_bytes_le.as_slice())) + .map(Value::known); + word + } else { + Word::default().into_value() + }; + + // The words to squeeze out: this is the hash digest as words with + // NUM_BYTES_PER_WORD (=8) bytes each + for (hash_word, a) in hash_words.iter_mut().zip(s.iter()) { + *hash_word = a[0]; + } + + round_lengths.push(length); + + cell_managers.push(cell_manager); + regions.push(region); + } + + // Now that we know the state at the end of the rounds, set the squeeze data + let num_rounds = cell_managers.len(); + for (idx, word) in hash_words.iter().enumerate() { + let cell_manager = &mut cell_managers[num_rounds - 2 - idx]; + let region = &mut regions[num_rounds - 2 - idx]; + + 
cell_manager.start_region(); + let squeeze_packed = cell_manager.query_cell_value(); + squeeze_packed.assign(region, 0, *word); + + cell_manager.start_region(); + let packed = split::value(cell_manager, region, *word, 0, 8, false, None); + cell_manager.start_region(); + transform::value(cell_manager, region, packed, false, |v| *v, true); + } + squeeze_digests.push(hash_words); + + for round in 0..NUM_ROUNDS + 1 { + let round_cst = pack_u64(ROUND_CST[round]); + + for row_idx in 0..num_rows_per_round { + let word_value = if round < NUM_WORDS_TO_ABSORB && row_idx == 0 { + let byte_idx = (idx * NUM_WORDS_TO_ABSORB + round) * NUM_BYTES_PER_WORD; + if byte_idx >= bytes.len() { + 0 + } else { + let end = std::cmp::min(byte_idx + NUM_BYTES_PER_WORD, bytes.len()); + let mut word_bytes = bytes[byte_idx..end].to_vec().clone(); + word_bytes.resize(NUM_BYTES_PER_WORD, 0); + u64::from_le_bytes(word_bytes.try_into().unwrap()) + } + } else { + 0 + }; + let byte_idx = if round < NUM_WORDS_TO_ABSORB { + round * NUM_BYTES_PER_WORD + std::cmp::min(row_idx, NUM_BYTES_PER_WORD - 1) + } else { + NUM_WORDS_TO_ABSORB * NUM_BYTES_PER_WORD + } + idx * NUM_WORDS_TO_ABSORB * NUM_BYTES_PER_WORD; + let bytes_left = if byte_idx >= bytes.len() { 0 } else { bytes.len() - byte_idx }; + rows.push(KeccakRow { + q_enable: row_idx == 0, + q_round: row_idx == 0 && round < NUM_ROUNDS, + q_absorb: row_idx == 0 && round == NUM_ROUNDS, + q_round_last: row_idx == 0 && round == NUM_ROUNDS, + q_input: row_idx == 0 && round < NUM_WORDS_TO_ABSORB, + q_input_last: row_idx == 0 && round == NUM_WORDS_TO_ABSORB - 1, + round_cst, + is_final: is_final_block && round == NUM_ROUNDS && row_idx == 0, + cell_values: regions[round].rows.get(row_idx).unwrap_or(&vec![]).clone(), + hash, + bytes_left: F::from_u128(bytes_left as u128), + word_value: F::from_u128(word_value as u128), + }); + #[cfg(debug_assertions)] + { + let mut r = rows.last().unwrap().clone(); + r.cell_values.clear(); + log::trace!("offset {:?} row idx {} row 
{:?}", rows.len() - 1, row_idx, r); + } + } + log::trace!(" = = = = = = round {} end", round); + } + log::trace!(" ====================== chunk {} end", idx); + } + + #[cfg(debug_assertions)] + { + let hash_bytes = s + .into_iter() + .take(4) + .map(|a| { + pack_with_base::(&unpack(a[0]), 2) + .to_bytes_le() + .into_iter() + .take(8) + .collect::>() + .to_vec() + }) + .collect::>(); + debug!("hash: {:x?}", &(hash_bytes[0..4].concat())); + assert_eq!(length, bytes.len()); + } +} diff --git a/hashes/zkevm/src/lib.rs b/hashes/zkevm/src/lib.rs index c1ed5026..272e4bf8 100644 --- a/hashes/zkevm/src/lib.rs +++ b/hashes/zkevm/src/lib.rs @@ -7,5 +7,3 @@ use halo2_base::halo2_proofs; pub mod keccak; /// Util pub mod util; - -pub use keccak::KeccakCircuitConfig as KeccakConfig;