feat: permutation argument optimizations (#10960)
A handful of optimizations for the large ambient trace setting, mostly
to do with the grand product argument. Total savings are about 1s on the
"17 in 20" benchmark.

- Only perform computation for the grand product on active rows of the
trace. This means (1) only setting the values of sigma/id on the active
rows (they remain zero elsewhere, since those values don't contribute to
the grand product anyway), and (2) only computing the grand product at
active rows, then populating the constant regions as a final step. Both
steps are facilitated by constructing a vector `active_row_idxs` which
explicitly contains the indices of the active rows. This makes
multithreading easier and is much more efficient than looping over the
entire domain with something like `check_is_active()`, which itself has
low overhead but results in huge disparities in the distribution of
actual work across threads (see the sketch after this list).
- Replace a default-initialized `std::vector` in PG (Protogalaxy) with a
`Polynomial`, simply to take advantage of the optimized constructor.
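A minimal sketch of the idea, with `do_work` as a hypothetical stand-in
for the per-row computation:

```cpp
// Before: each thread scans its chunk of the full domain and filters with
// check_is_active(); chunks covering inactive regions finish almost
// immediately, so actual work is distributed very unevenly across threads.
for (size_t i = start; i < end; ++i) {
    if (check_is_active(i)) {
        do_work(i);
    }
}

// After: threads split the explicit list of active indices instead, so
// every iteration does real work and the load is balanced by construction.
for (size_t k = start; k < end; ++k) {
    do_work(active_row_idxs[k]);
}
```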

Branch "17 in 20" benchmark

`ClientIVCBench/Full/6      20075 ms        17763 ms`

Master "17 in 20" benchmark

`ClientIVCBench/Full/6      21054 ms        18395 ms`

The conventional benchmark ("19 in 19") shows a very minor improvement,
as expected:

Branch: 

`ClientIVCBench/Full/6      22231 ms        19857 ms`

Master:

`ClientIVCBench/Full/6      22505 ms        19536 ms`
ledwards2225 authored Jan 9, 2025
1 parent bcc0168 commit de99603
Showing 14 changed files with 232 additions and 94 deletions.
16 changes: 16 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/thread.cpp
@@ -170,6 +170,22 @@ void parallel_for_heuristic(size_t num_points,
});
};

MultithreadData calculate_thread_data(size_t num_iterations, size_t min_iterations_per_thread)
{
size_t num_threads = calculate_num_threads(num_iterations, min_iterations_per_thread);
const size_t thread_size = num_iterations / num_threads;

// Compute the index bounds for each thread
std::vector<size_t> start(num_threads);
std::vector<size_t> end(num_threads);
for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
start[thread_idx] = thread_idx * thread_size;
end[thread_idx] = (thread_idx == num_threads - 1) ? num_iterations : (thread_idx + 1) * thread_size;
}

return MultithreadData{ num_threads, start, end };
}
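
For context, a hedged usage sketch of the new helper together with the
existing `parallel_for` (`data` and `process` are hypothetical):

```cpp
const MultithreadData thread_data = calculate_thread_data(data.size());
parallel_for(thread_data.num_threads, [&](size_t thread_idx) {
    const size_t start = thread_data.start[thread_idx];
    const size_t end = thread_data.end[thread_idx];
    for (size_t i = start; i < end; ++i) {
        process(data[i]); // each thread owns one contiguous, evenly sized chunk
    }
});
```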

/**
* @brief calculates number of threads to create based on minimum iterations per thread
* @details Finds the number of cpus with get_num_cpus(), and calculates `desired_num_threads`
18 changes: 18 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/thread.hpp
@@ -88,6 +88,24 @@ std::vector<Accum> parallel_for_heuristic(size_t num_points,

const size_t DEFAULT_MIN_ITERS_PER_THREAD = 1 << 4;

struct MultithreadData {
size_t num_threads;
// index bounds for each thread
std::vector<size_t> start;
std::vector<size_t> end;
};

/**
* @brief Calculates number of threads and index bounds for each thread
* @details Determines the number of threads via calculate_num_threads(), then divides the domain evenly amongst them
*
* @param num_iterations
* @param min_iterations_per_thread
* @return MultithreadData
*/
MultithreadData calculate_thread_data(size_t num_iterations,
size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD);

/**
* @brief calculates number of threads to create based on minimum iterations per thread
* @details Finds the number of cpus with get_num_cpus(), and calculates `desired_num_threads`
26 changes: 24 additions & 2 deletions barretenberg/cpp/src/barretenberg/flavor/flavor.hpp
@@ -98,6 +98,29 @@ class PrecomputedEntitiesBase {
uint64_t log_circuit_size;
uint64_t num_public_inputs;
};
// Specifies the regions of the execution trace containing non-trivial wire values
struct ActiveRegionData {
void add_range(const size_t start, const size_t end)
{
ASSERT(start >= current_end); // ranges should be non-overlapping and increasing
ranges.emplace_back(start, end);
for (size_t i = start; i < end; ++i) {
idxs.push_back(i);
}
current_end = end;
}

std::vector<std::pair<size_t, size_t>> get_ranges() const { return ranges; }
size_t get_idx(const size_t idx) const { return idxs[idx]; }
std::pair<size_t, size_t> get_range(const size_t idx) const { return ranges.at(idx); }
size_t size() const { return idxs.size(); }
size_t num_ranges() const { return ranges.size(); }

private:
std::vector<std::pair<size_t, size_t>> ranges; // active ranges [start_i, end_i) of the execution trace
std::vector<size_t> idxs; // full set of poly indices corresponding to active ranges
size_t current_end{ 0 }; // end of last range; for ensuring monotonicity of ranges
};
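
A small usage sketch of the struct above (the block boundaries are made up
for illustration):

```cpp
ActiveRegionData active_region_data;
active_region_data.add_range(0, 8);   // first active block occupies rows [0, 8)
active_region_data.add_range(16, 20); // next block must start at or after row 8
// active_region_data.size() == 12 (total number of active rows)
// active_region_data.get_idx(9) == 17 (the 10th active row of the trace)
// active_region_data.num_ranges() == 2
```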

/**
* @brief Base proving key class.
@@ -123,8 +146,7 @@ template <typename FF, typename CommitmentKey_> class ProvingKey_ {
// folded element by element.
std::vector<FF> public_inputs;

// Ranges of the form [start, end) where witnesses have non-zero values (hence the execution trace is "active")
std::vector<std::pair<size_t, size_t>> active_block_ranges;
ActiveRegionData active_region_data; // specifies active regions of execution trace

ProvingKey_() = default;
ProvingKey_(const size_t dyadic_circuit_size,
1 change: 0 additions & 1 deletion barretenberg/cpp/src/barretenberg/goblin/mock_circuits.hpp
@@ -191,7 +191,6 @@ class GoblinMockCircuits {
static void construct_simple_circuit(MegaBuilder& builder)
{
PROFILE_THIS();

add_some_ecc_op_gates(builder);
MockCircuits::construct_arithmetic_circuit(builder);
}
@@ -224,33 +224,43 @@ void compute_honk_style_permutation_lagrange_polynomials_from_mapping(
using FF = typename Flavor::FF;
const size_t num_gates = proving_key->circuit_size;

size_t domain_size = proving_key->active_region_data.size();

const MultithreadData thread_data = calculate_thread_data(domain_size);

size_t wire_idx = 0;
for (auto& current_permutation_poly : permutation_polynomials) {
ITERATE_OVER_DOMAIN_START(proving_key->evaluation_domain);
auto idx = static_cast<ptrdiff_t>(i);
const auto& current_row_idx = permutation_mappings[wire_idx].row_idx[idx];
const auto& current_col_idx = permutation_mappings[wire_idx].col_idx[idx];
const auto& current_is_tag = permutation_mappings[wire_idx].is_tag[idx];
const auto& current_is_public_input = permutation_mappings[wire_idx].is_public_input[idx];
if (current_is_public_input) {
// We intentionally want to break the cycles of the public input variables.
// During the witness generation, the left and right wire polynomials at idx i contain the i-th public
// input. The CyclicPermutation created for these variables always starts with (i) -> (n+i), followed by
// the indices of the variables in the "real" gates. We make i point to -(i+1), so that the only way of
// repairing the cycle is to add the mapping
// -(i+1) -> (n+i)
// These indices are chosen so they can easily be computed by the verifier. They can expect the running
// product to be equal to the "public input delta" that is computed in <honk/utils/grand_product_delta.hpp>
current_permutation_poly.at(i) = -FF(current_row_idx + 1 + num_gates * current_col_idx);
} else if (current_is_tag) {
// Set evaluations to (arbitrary) values disjoint from non-tag values
current_permutation_poly.at(i) = num_gates * Flavor::NUM_WIRES + current_row_idx;
} else {
// For the regular permutation we simply point to the next location by setting the evaluation to its
// idx
current_permutation_poly.at(i) = FF(current_row_idx + num_gates * current_col_idx);
}
ITERATE_OVER_DOMAIN_END;
parallel_for(thread_data.num_threads, [&](size_t j) {
const size_t start = thread_data.start[j];
const size_t end = thread_data.end[j];
for (size_t i = start; i < end; ++i) {
const size_t poly_idx = proving_key->active_region_data.get_idx(i);
const auto idx = static_cast<ptrdiff_t>(poly_idx);
const auto& current_row_idx = permutation_mappings[wire_idx].row_idx[idx];
const auto& current_col_idx = permutation_mappings[wire_idx].col_idx[idx];
const auto& current_is_tag = permutation_mappings[wire_idx].is_tag[idx];
const auto& current_is_public_input = permutation_mappings[wire_idx].is_public_input[idx];
if (current_is_public_input) {
// We intentionally want to break the cycles of the public input variables.
// During the witness generation, the left and right wire polynomials at idx i contain the i-th
// public input. The CyclicPermutation created for these variables always starts with (i) -> (n+i),
// followed by the indices of the variables in the "real" gates. We make i point to
// -(i+1), so that the only way of repairing the cycle is to add the mapping
// -(i+1) -> (n+i)
// These indices are chosen so they can easily be computed by the verifier. They can expect
// the running product to be equal to the "public input delta" that is computed
// in <honk/utils/grand_product_delta.hpp>
current_permutation_poly.at(poly_idx) = -FF(current_row_idx + 1 + num_gates * current_col_idx);
} else if (current_is_tag) {
// Set evaluations to (arbitrary) values disjoint from non-tag values
current_permutation_poly.at(poly_idx) = num_gates * Flavor::NUM_WIRES + current_row_idx;
} else {
// For the regular permutation we simply point to the next location by setting the
// evaluation to its idx
current_permutation_poly.at(poly_idx) = FF(current_row_idx + num_gates * current_col_idx);
}
}
});
wire_idx++;
}
}
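
To make the three branches above concrete, a numeric illustration of the
encoding (toy sizes; the real values live in the field FF):

```cpp
// Toy example with num_gates = 8 and NUM_WIRES = 4:
//  - regular copy constraint pointing to column 2, row 5:
//      sigma = row + num_gates * col = 5 + 8 * 2 = 21
//  - public input at row 3, column 0 (cycle deliberately broken):
//      sigma = -(row + 1) = -4   (negation taken in FF)
//  - tag at row 3, offset past every possible wire encoding:
//      sigma = num_gates * NUM_WIRES + row = 8 * 4 + 3 = 35
```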
@@ -3,8 +3,10 @@
#include "barretenberg/common/debug_log.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/common/zip_view.hpp"
#include "barretenberg/flavor/flavor.hpp"
#include "barretenberg/plonk/proof_system/proving_key/proving_key.hpp"
#include "barretenberg/relations/relation_parameters.hpp"
#include "barretenberg/trace_to_polynomials/trace_to_polynomials.hpp"
#include <typeinfo>

namespace bb {
@@ -47,74 +49,68 @@ namespace bb {
*
* Note: Step (3) utilizes Montgomery batch inversion to replace n-many inversions with a single inversion and O(n) multiplications
*
* @note This method makes use of the fact that there are at most as many unique entries in the grand product as active
* rows in the execution trace to efficiently compute the grand product when a structured trace is in use. I.e. the
* computation performed herein is proportional to the number of active rows in the trace and the constant values in the
* inactive regions are simply populated from known values on the last step.
*
* @tparam Flavor
* @tparam GrandProdRelation
* @param full_polynomials
* @param relation_parameters
* @param size_override optional size of the domain; otherwise based on dyadic polynomial domain
* @param active_region_data optional specification of active region of execution trace
*/
template <typename Flavor, typename GrandProdRelation>
void compute_grand_product(typename Flavor::ProverPolynomials& full_polynomials,
bb::RelationParameters<typename Flavor::FF>& relation_parameters,
size_t size_override = 0,
std::vector<std::pair<size_t, size_t>> active_block_ranges = {})
const ActiveRegionData& active_region_data = ActiveRegionData{})
{
PROFILE_THIS_NAME("compute_grand_product");

using FF = typename Flavor::FF;
using Polynomial = typename Flavor::Polynomial;
using Accumulator = std::tuple_element_t<0, typename GrandProdRelation::SumcheckArrayOfValuesOverSubrelations>;

const bool has_active_ranges = active_region_data.size() > 0;

// Set the domain over which the grand product must be computed. This may be less than the dyadic circuit size, e.g.
// the permutation grand product does not need to be computed beyond the index of the last active wire
size_t domain_size = size_override == 0 ? full_polynomials.get_polynomial_size() : size_override;

const size_t num_threads = domain_size >= get_num_cpus_pow2() ? get_num_cpus_pow2() : 1;
const size_t block_size = domain_size / num_threads;
const size_t final_idx = domain_size - 1;

// Compute the index bounds for each thread for reuse in the computations below
std::vector<std::pair<size_t, size_t>> idx_bounds;
idx_bounds.reserve(num_threads);
for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
const size_t start = thread_idx * block_size;
const size_t end = (thread_idx == num_threads - 1) ? final_idx : (thread_idx + 1) * block_size;
idx_bounds.push_back(std::make_pair(start, end));
}
// Returns the ith active index if specified, otherwise acts as the identity map on the input
auto get_active_range_poly_idx = [&](size_t i) { return has_active_ranges ? active_region_data.get_idx(i) : i; };

size_t active_domain_size = has_active_ranges ? active_region_data.size() : domain_size;

// The size of the iteration domain is one less than the number of active rows since the final value of the
// grand product is constructed only in the relation and not explicitly in the polynomial
const MultithreadData active_range_thread_data = calculate_thread_data(active_domain_size - 1);

// Allocate numerator/denominator polynomials that will serve as scratch space
// TODO(zac) we can re-use the permutation polynomial as the numerator polynomial. Reduces readability
Polynomial numerator{ domain_size, domain_size };
Polynomial denominator{ domain_size, domain_size };

auto check_is_active = [&](size_t idx) {
if (active_block_ranges.empty()) {
return true;
}
return std::any_of(active_block_ranges.begin(), active_block_ranges.end(), [idx](const auto& range) {
return idx >= range.first && idx < range.second;
});
};
Polynomial numerator{ active_domain_size };
Polynomial denominator{ active_domain_size };

// Step (1)
// Populate `numerator` and `denominator` with the algebra described by Relation
FF gamma_fourth = relation_parameters.gamma.pow(4);
parallel_for(num_threads, [&](size_t thread_idx) {
parallel_for(active_range_thread_data.num_threads, [&](size_t thread_idx) {
const size_t start = active_range_thread_data.start[thread_idx];
const size_t end = active_range_thread_data.end[thread_idx];
typename Flavor::AllValues row;
const size_t start = idx_bounds[thread_idx].first;
const size_t end = idx_bounds[thread_idx].second;
for (size_t i = start; i < end; ++i) {
if (check_is_active(i)) {
// TODO(https://github.com/AztecProtocol/barretenberg/issues/940): consider avoiding get_row if possible.
row = full_polynomials.get_row(i);
numerator.at(i) =
GrandProdRelation::template compute_grand_product_numerator<Accumulator>(row, relation_parameters);
denominator.at(i) = GrandProdRelation::template compute_grand_product_denominator<Accumulator>(
row, relation_parameters);
// TODO(https://github.com/AztecProtocol/barretenberg/issues/940): consider avoiding get_row if possible.
auto row_idx = get_active_range_poly_idx(i);
if constexpr (IsUltraFlavor<Flavor>) {
row = full_polynomials.get_row_for_permutation_arg(row_idx);
} else {
numerator.at(i) = gamma_fourth;
denominator.at(i) = gamma_fourth;
row = full_polynomials.get_row(row_idx);
}
numerator.at(i) =
GrandProdRelation::template compute_grand_product_numerator<Accumulator>(row, relation_parameters);
denominator.at(i) =
GrandProdRelation::template compute_grand_product_denominator<Accumulator>(row, relation_parameters);
}
});

@@ -133,12 +129,12 @@ void compute_grand_product(typename Flavor::ProverPolynomials& full_polynomials,
// (ii) Take partial products P = { 1, a0a1, a2a3, a4a5 }
// (iii) Each thread j computes N[i][j]*P[j]=
// {{a0,a0a1},{a0a1a2,a0a1a2a3},{a0a1a2a3a4,a0a1a2a3a4a5},{a0a1a2a3a4a5a6,a0a1a2a3a4a5a6a7}}
std::vector<FF> partial_numerators(num_threads);
std::vector<FF> partial_denominators(num_threads);
std::vector<FF> partial_numerators(active_range_thread_data.num_threads);
std::vector<FF> partial_denominators(active_range_thread_data.num_threads);

parallel_for(num_threads, [&](size_t thread_idx) {
const size_t start = idx_bounds[thread_idx].first;
const size_t end = idx_bounds[thread_idx].second;
parallel_for(active_range_thread_data.num_threads, [&](size_t thread_idx) {
const size_t start = active_range_thread_data.start[thread_idx];
const size_t end = active_range_thread_data.end[thread_idx];
for (size_t i = start; i < end - 1; ++i) {
numerator.at(i + 1) *= numerator[i];
denominator.at(i + 1) *= denominator[i];
@@ -150,9 +146,9 @@ void compute_grand_product(typename Flavor::ProverPolynomials& full_polynomials,
DEBUG_LOG_ALL(partial_numerators);
DEBUG_LOG_ALL(partial_denominators);

parallel_for(num_threads, [&](size_t thread_idx) {
const size_t start = idx_bounds[thread_idx].first;
const size_t end = idx_bounds[thread_idx].second;
parallel_for(active_range_thread_data.num_threads, [&](size_t thread_idx) {
const size_t start = active_range_thread_data.start[thread_idx];
const size_t end = active_range_thread_data.end[thread_idx];
if (thread_idx > 0) {
FF numerator_scaling = 1;
FF denominator_scaling = 1;
@@ -179,14 +175,44 @@ void compute_grand_product(typename Flavor::ProverPolynomials& full_polynomials,
// We have a 'virtual' 0 at the start (as this is a to-be-shifted polynomial)
ASSERT(grand_product_polynomial.start_index() == 1);

parallel_for(num_threads, [&](size_t thread_idx) {
const size_t start = idx_bounds[thread_idx].first;
const size_t end = idx_bounds[thread_idx].second;
// For Ultra/Mega, the first row is an inactive zero row, thus the grand product takes value 1 at both i = 0 and i = 1
if constexpr (IsUltraFlavor<Flavor>) {
grand_product_polynomial.at(1) = 1;
}

// Compute grand product values corresponding only to the active regions of the trace
parallel_for(active_range_thread_data.num_threads, [&](size_t thread_idx) {
const size_t start = active_range_thread_data.start[thread_idx];
const size_t end = active_range_thread_data.end[thread_idx];
for (size_t i = start; i < end; ++i) {
grand_product_polynomial.at(i + 1) = numerator[i] * denominator[i];
const auto poly_idx = get_active_range_poly_idx(i + 1);
grand_product_polynomial.at(poly_idx) = numerator[i] * denominator[i];
}
});

// Final step: if active/inactive regions have been specified, the value of the grand product in the inactive
// regions has not yet been set. The polynomial takes an already-computed constant value across each inactive
// region (since no copy constraints are present there) equal to the value of the grand product at the first index
// of the subsequent active region.
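// Illustration (assumed shapes, for exposition only): with active ranges [0,4) and [8,12), rows 4..7 are
// inactive and each gets the value grand_product_polynomial[8], i.e. the value at the start of the next
// active range, so the polynomial is constant across the gap where no copy constraints act.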
if (has_active_ranges) {
MultithreadData full_domain_thread_data = calculate_thread_data(domain_size);
parallel_for(full_domain_thread_data.num_threads, [&](size_t thread_idx) {
const size_t start = full_domain_thread_data.start[thread_idx];
const size_t end = full_domain_thread_data.end[thread_idx];
for (size_t i = start; i < end; ++i) {
for (size_t j = 0; j < active_region_data.num_ranges() - 1; ++j) {
const size_t previous_range_end = active_region_data.get_range(j).second;
const size_t next_range_start = active_region_data.get_range(j + 1).first;
// Set the value of the polynomial if the index falls in an inactive region
if (i >= previous_range_end && i < next_range_start) {
grand_product_polynomial.at(i) = grand_product_polynomial[next_range_start];
break;
}
}
}
});
}

DEBUG_LOG_ALL(grand_product_polynomial.coeffs());
}
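
Step (3) of the algorithm (collapsed in the hunks above) relies on Montgomery
batch inversion; a minimal self-contained sketch of that technique, assuming a
field type FF that exposes invert() and that all inputs are nonzero:

```cpp
#include <cstddef>
#include <vector>

// Invert every element of vals with a single field inversion plus ~3n
// multiplications. Assumes all elements are nonzero.
template <typename FF> void batch_invert(std::vector<FF>& vals)
{
    const size_t n = vals.size();
    std::vector<FF> prefix(n);
    FF running = FF(1);
    for (size_t i = 0; i < n; ++i) {
        prefix[i] = running; // product of vals[0..i-1]
        running *= vals[i];
    }
    FF inv = running.invert(); // the one inversion: 1 / (vals[0] * ... * vals[n-1])
    for (size_t i = n; i-- > 0;) {
        const FF tmp = vals[i];
        vals[i] = inv * prefix[i]; // equals 1 / vals[i]
        inv *= tmp;                // remove vals[i] from the running inverse
    }
}
```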
