diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/commit.bench.cpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/commit.bench.cpp index 33cfce2fe10f..e67248502919 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/commit.bench.cpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/commit.bench.cpp @@ -1,5 +1,6 @@ #include "barretenberg/commitment_schemes/commitment_key.hpp" +#include "barretenberg/ecc/batched_affine_addition/batched_affine_addition.hpp" #include "barretenberg/polynomials/polynomial.hpp" #include "barretenberg/srs/factories/mem_bn254_crs_factory.hpp" #include @@ -27,6 +28,59 @@ template Polynomial sparse_random_poly(const size_t size, cons return polynomial; } +template struct PolyData { + Polynomial polynomial; + std::vector> active_range_endpoints; +}; + +// Generate a polynomial with random coefficients organized in isolated blocks. (Mimics the wire polynomials +// in the structured trace setting, or z_perm if non_zero_complement is set to true). +template PolyData structured_random_poly(bool non_zero_complement = false) +{ + // An arbitrary but realistic test case taken from the actual structure of a wire in the client_ivc bench + std::vector fixed_sizes = { + 1 << 10, 1 << 7, 201000, 90000, 9000, 137000, 72000, 1 << 7, 2500, 11500, + }; + std::vector actual_sizes = { + 10, 16, 48873, 18209, 4132, 23556, 35443, 3, 2, 2, + }; + + uint32_t full_size = 0; + for (auto size : fixed_sizes) { + full_size += size; + } + + // In practice the polynomials will have a power-of-2 size + auto log2_n = static_cast(numeric::get_msb(full_size)); + if ((1UL << log2_n) != (full_size)) { + ++log2_n; + } + full_size = 1 << log2_n; + + // Construct a polynomial with the prescribed structure; track the "active" regions + auto polynomial = Polynomial(full_size); + uint32_t start_idx = 0; + uint32_t end_idx = 0; + std::vector> active_range_endpoints; + for (auto [block_size, actual_size] : zip_view(fixed_sizes, actual_sizes)) { + end_idx = start_idx + actual_size; + for (size_t i = start_idx; i < end_idx; ++i) { + polynomial.at(i) = FF::random_element(); + } + active_range_endpoints.emplace_back(start_idx, end_idx); + start_idx += block_size; + // If indicated, populate the active region complement with a random constant (mimicking z_perm) + if (non_zero_complement) { + FF const_random_coeff = FF::random_element(); + for (size_t i = end_idx; i < start_idx; ++i) { + polynomial.at(i) = const_random_coeff; + } + } + } + + return { polynomial, active_range_endpoints }; +} + constexpr size_t MIN_LOG_NUM_POINTS = 16; constexpr size_t MAX_LOG_NUM_POINTS = 20; constexpr size_t MAX_NUM_POINTS = 1 << MAX_LOG_NUM_POINTS; @@ -126,6 +180,7 @@ template void bench_commit_random(::benchmark::State& state) key->commit(polynomial); } } + // Commit to a polynomial with dense random nonzero entries but NOT our happiest case of an exact power of 2 // Note: merely reducing a power-of-2 size by 1 used to cause a 50% regression here.
template void bench_commit_random_non_power_of_2(::benchmark::State& state) @@ -139,6 +194,59 @@ template void bench_commit_random_non_power_of_2(::benchmark::S key->commit(polynomial); } } + +// Commit to a polynomial with block structured random entries using the basic commit method +template void bench_commit_structured_random_poly(::benchmark::State& state) +{ + using Fr = typename Curve::ScalarField; + auto key = create_commitment_key(MAX_NUM_POINTS); + + auto [polynomial, active_range_endpoints] = structured_random_poly(); + + for (auto _ : state) { + key->commit(polynomial); + } +} + +// Commit to a polynomial with block structured random entries using commit_structured +template void bench_commit_structured_random_poly_preprocessed(::benchmark::State& state) +{ + using Fr = typename Curve::ScalarField; + auto key = create_commitment_key(MAX_NUM_POINTS); + + auto [polynomial, active_range_endpoints] = structured_random_poly(); + + for (auto _ : state) { + key->commit_structured(polynomial, active_range_endpoints); + } +} + +// Commit to a polynomial with block structured random entries and constant valued complement +template void bench_commit_mock_z_perm(::benchmark::State& state) +{ + using Fr = typename Curve::ScalarField; + auto key = create_commitment_key(MAX_NUM_POINTS); + + auto [polynomial, active_range_endpoints] = structured_random_poly(/*non_zero_complement=*/true); + + for (auto _ : state) { + key->commit(polynomial); + } +} + +// Commit to a polynomial with block structured random entries and constant valued complement using tailored method +template void bench_commit_mock_z_perm_preprocessed(::benchmark::State& state) +{ + using Fr = typename Curve::ScalarField; + auto key = create_commitment_key(MAX_NUM_POINTS); + + auto [polynomial, active_range_endpoints] = structured_random_poly(/*non_zero_complement=*/true); + + for (auto _ : state) { + key->commit_structured_with_nonzero_complement(polynomial, active_range_endpoints); + } +} + BENCHMARK(bench_commit_zero) ->DenseRange(MIN_LOG_NUM_POINTS, MAX_LOG_NUM_POINTS) ->Unit(benchmark::kMillisecond); @@ -160,6 +268,10 @@ BENCHMARK(bench_commit_random) BENCHMARK(bench_commit_random_non_power_of_2) ->DenseRange(MIN_LOG_NUM_POINTS, MAX_LOG_NUM_POINTS) ->Unit(benchmark::kMillisecond); +BENCHMARK(bench_commit_structured_random_poly)->Unit(benchmark::kMillisecond); +BENCHMARK(bench_commit_structured_random_poly_preprocessed)->Unit(benchmark::kMillisecond); +BENCHMARK(bench_commit_mock_z_perm)->Unit(benchmark::kMillisecond); +BENCHMARK(bench_commit_mock_z_perm_preprocessed)->Unit(benchmark::kMillisecond); } // namespace bb diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp index c713e58be37f..afbc584020bb 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp @@ -9,7 +9,9 @@ #include "barretenberg/common/debug_log.hpp" #include "barretenberg/common/op_count.hpp" +#include "barretenberg/ecc/batched_affine_addition/batched_affine_addition.hpp" #include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/scalar_multiplication/sorted_msm.hpp" #include "barretenberg/numeric/bitop/get_msb.hpp" #include "barretenberg/numeric/bitop/pow.hpp" #include "barretenberg/polynomials/polynomial.hpp" @@ -164,7 +166,7 @@ template class CommitmentKey { std::vector scalars; std::vector points; 
scalars.reserve(num_nonzero_scalars); - points.reserve(num_nonzero_scalars); + points.reserve(2 * num_nonzero_scalars); // 2x accounts for endomorphism points for (size_t idx = 0; idx < num_threads; ++idx) { scalars.insert(scalars.end(), thread_scalars[idx].begin(), thread_scalars[idx].end()); points.insert(points.end(), thread_points[idx].begin(), thread_points[idx].end()); @@ -173,6 +175,148 @@ template class CommitmentKey { // Call the version of pippenger which assumes all points are distinct return scalar_multiplication::pippenger_unsafe(scalars, points, pippenger_runtime_state); } + + /** + * @brief Efficiently commit to a polynomial whose nonzero elements are arranged in discrete blocks + * @details Given a set of ranges where the polynomial takes non-zero values, copy the non-zero inputs (scalars, + * points) into contiguous memory and commit to them using the normal pippenger algorithm. Defaults to the + * conventional commit method if the number of non-zero entries is beyond a threshold relative to the full + * polynomial size. + * @note The wire polynomials have the described form when a structured execution trace is in use. + * @warning Method makes a copy of all {point, scalar} pairs that comprise the reduced input. May not be efficient + * in terms of memory or computation for polynomials beyond a certain sparseness threshold. + * + * @param polynomial + * @param active_ranges + * @return Commitment + */ + Commitment commit_structured(PolynomialSpan polynomial, + const std::vector>& active_ranges) + { + BB_OP_COUNT_TIME(); + ASSERT(polynomial.end_index() <= srs->get_monomial_size()); + + // Percentage of nonzero coefficients beyond which we resort to the conventional commit method + constexpr size_t NONZERO_THRESHOLD = 75; + + size_t total_num_scalars = 0; + for (const auto& range : active_ranges) { + total_num_scalars += range.second - range.first; + } + + // Compute "active" percentage of polynomial; resort to standard commit if appropriate + size_t percentage_nonzero = total_num_scalars * 100 / polynomial.size(); + if (percentage_nonzero > NONZERO_THRESHOLD) { + return commit(polynomial); + } + + // Extract the precomputed point table (contains raw SRS points at even indices and the corresponding + // endomorphism point (\beta*x, -y) at odd indices). + std::span point_table = srs->get_monomial_points(); + + std::vector scalars; + scalars.reserve(total_num_scalars); + for (const auto& range : active_ranges) { + auto start = &polynomial[range.first]; + auto end = &polynomial[range.second]; + scalars.insert(scalars.end(), start, end); + } + std::vector points; + points.reserve(total_num_scalars * 2); + for (const auto& range : active_ranges) { + auto start = &point_table[2 * range.first]; + auto end = &point_table[2 * range.second]; + points.insert(points.end(), start, end); + } + + // Call pippenger + return scalar_multiplication::pippenger_unsafe(scalars, points, pippenger_runtime_state); + }
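// Aside: the factor-of-2 indexing above, made concrete. The monomial table interleaves each raw SRS point G_i (even
// index) with its endomorphism point (\beta * G_i.x, -G_i.y) (odd index), so an active coefficient range [start, end)
// always maps to the table range [2*start, 2*end). A minimal sketch of that extraction, with a hypothetical helper
// name and point type (not part of this diff):
#include <cstddef>
#include <span>
#include <utility>
#include <vector>

template <typename AffinePoint>
std::vector<AffinePoint> extract_active_table_points(std::span<const AffinePoint> point_table,
                                                     const std::pair<size_t, size_t>& range)
{
    std::vector<AffinePoint> out;
    out.reserve(2 * (range.second - range.first)); // one raw point + one endo point per active scalar
    for (size_t i = 2 * range.first; i < 2 * range.second; ++i) {
        out.push_back(point_table[i]); // copies each (G_i, endo(G_i)) pair, as commit_structured does above
    }
    return out;
}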
+ + /** + * @brief Efficiently commit to a polynomial with discrete blocks of arbitrary elements and constant elements + * @details Similar to method commit_structured() except the complement to the "active" region contains non-zero + * constant values (which are assumed to differ between blocks). This is exactly the structure of the permutation + * grand product polynomial z_perm when a structured execution trace is in use. + * @warning Requires a copy of all {point, scalar} pairs (including endo points) corresponding to the primary blocks + * and a copy of all of the points (without endo points) corresponding to their complement. + * + * @param polynomial + * @param active_ranges + * @return Commitment + */ + Commitment commit_structured_with_nonzero_complement(PolynomialSpan polynomial, + const std::vector>& active_ranges) + { + BB_OP_COUNT_TIME(); + ASSERT(polynomial.end_index() <= srs->get_monomial_size()); + + using BatchedAddition = BatchedAffineAddition; + + // Percentage of constant coefficients below which we resort to the conventional commit method + constexpr size_t CONSTANT_THRESHOLD = 50; + + // Compute the active range complement over which the polynomial is assumed to be constant within each range + std::vector> active_ranges_complement; + for (size_t i = 0; i < active_ranges.size() - 1; ++i) { + const size_t start = active_ranges[i].second; + const size_t end = active_ranges[i + 1].first; + active_ranges_complement.emplace_back(start, end); + } + // Final complement range goes from end of last active range to the end of the polynomial + active_ranges_complement.emplace_back(active_ranges.back().second, polynomial.end_index()); + + // Compute the total number of scalars in the constant regions + size_t total_num_complement_scalars = 0; + for (const auto& range : active_ranges_complement) { + total_num_complement_scalars += range.second - range.first; + } + + // Compute percentage of polynomial comprised of constant blocks; resort to standard commit if appropriate + size_t percentage_constant = total_num_complement_scalars * 100 / polynomial.size(); + if (percentage_constant < CONSTANT_THRESHOLD) { + return commit(polynomial); + } + + // Extract the precomputed point table (contains raw SRS points at even indices and the corresponding + // endomorphism point (\beta*x, -y) at odd indices). + std::span point_table = srs->get_monomial_points(); + + // Copy the raw SRS points (no endo points) corresponding to the constant regions into contiguous memory + // TODO(https://github.com/AztecProtocol/barretenberg/issues/1131): Peak memory usage could be improved by + // performing this copy and the subsequent summation as a precomputation prior to constructing the point table. + std::vector points; + points.reserve(total_num_complement_scalars); // raw points only; no endo points are needed for the summation + for (const auto& range : active_ranges_complement) { + const size_t start = 2 * range.first; + const size_t end = 2 * range.second; + for (size_t i = start; i < end; i += 2) { + points.emplace_back(point_table[i]); + } + } + + // Populate the set of unique scalars with first coeff from each range (values assumed constant over each + // range).
Also store the number of points in each sequence to be summed + std::vector unique_scalars; + std::vector sequence_counts; + for (const auto& range : active_ranges_complement) { + if (range.second - range.first > 0) { // only ranges with nonzero length + unique_scalars.emplace_back(polynomial.span[range.first]); + sequence_counts.emplace_back(range.second - range.first); + } + } + + // Reduce each sequence to a single point + auto reduced_points = BatchedAddition::add_in_place(points, sequence_counts); + + // Compute the full commitment as the sum of the "active" region commitment and the constant region contribution + Commitment result = commit_structured(polynomial, active_ranges); + for (auto [scalar, point] : zip_view(unique_scalars, reduced_points)) { + result = result + point * scalar; + } + + return result; + } }; } // namespace bb diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/sparse_commitment.test.cpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/sparse_commitment.test.cpp index 87473bee3d5b..877f0f6b5fae 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/sparse_commitment.test.cpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/sparse_commitment.test.cpp @@ -1,4 +1,5 @@ #include "barretenberg/commitment_schemes/commitment_key.hpp" +#include "barretenberg/ecc/batched_affine_addition/batched_affine_addition.hpp" #include "barretenberg/polynomials/polynomial.hpp" #include "barretenberg/srs/factories/file_crs_factory.hpp" @@ -13,8 +14,63 @@ template class CommitmentKeyTest : public ::testing::Test { using Commitment = typename Curve::AffineElement; using Polynomial = bb::Polynomial; + struct StructuredPolyData { + Polynomial polynomial; + std::vector> active_range_endpoints; + }; + public: template inline std::shared_ptr create_commitment_key(size_t num_points); + + // Construct a random poly with the prescribed block structure; complement is zero/constant if non_zero_complement = + // false/true (to mimic wire/z_perm) + StructuredPolyData create_structured_test_polynomial(std::vector fixed_sizes, + std::vector actual_sizes, + bool non_zero_complement = false) + { + // Add zero row offset to mimic actual structure of wire/z_perm + const size_t ZERO_ROW_OFFSET = 1; + + uint32_t full_size = ZERO_ROW_OFFSET; + for (auto size : fixed_sizes) { + full_size += size; + } + // In practice the polynomials will have a power-of-2 size + auto log2_n = static_cast(numeric::get_msb(full_size)); + if ((1UL << log2_n) != (full_size)) { + ++log2_n; + } + full_size = 1 << log2_n; + + // Construct polynomial with the specified form and track active range endpoints + Polynomial polynomial(full_size - 1, full_size, 1); + uint32_t start_idx = ZERO_ROW_OFFSET; + uint32_t end_idx = 0; + std::vector> active_range_endpoints; + for (auto [fixed_size, actual_size] : zip_view(fixed_sizes, actual_sizes)) { + end_idx = start_idx + actual_size; + active_range_endpoints.emplace_back(start_idx, end_idx); + for (size_t idx = start_idx; idx < end_idx; ++idx) { + polynomial.at(idx) = Fr::random_element(); + } + start_idx += fixed_size; + if (non_zero_complement) { // fill complement with random constant value + Fr const_val = Fr::random_element(); + for (size_t idx = end_idx; idx < start_idx; ++idx) { + polynomial.at(idx) = const_val; + } + } + } + // fill complement region between end of last fixed block and end of polynomial + if (non_zero_complement) { + Fr const_val = polynomial[active_range_endpoints.back().second]; + for (size_t i = 
active_range_endpoints.back().second; i < polynomial.end_index(); ++i) { + polynomial.at(i) = const_val; + } + } + + return { polynomial, active_range_endpoints }; + } }; template <> @@ -156,4 +212,61 @@ TYPED_TEST(CommitmentKeyTest, CommitSparseMediumNonZeroStartIndex) EXPECT_EQ(sparse_commit_result, commit_result); } +/** + * @brief Test commit_structured on polynomial with blocks of non-zero values (like wires when using structured trace) + * + */ +TYPED_TEST(CommitmentKeyTest, CommitStructuredWire) +{ + using Curve = TypeParam; + using CK = CommitmentKey; + using G1 = Curve::AffineElement; + + // Arbitrary but realistic block structure in the ivc setting (roughly 2^19 full size with 2^17 utilization) + std::vector fixed_sizes = { 1000, 4000, 180000, 90000, 9000, 137000, 72000, 4000, 2500, 11500 }; + std::vector actual_sizes = { 10, 16, 48873, 18209, 4132, 23556, 35443, 3, 2, 2 }; + + // Construct a random polynomial resembling the wires in the structured trace setting + const bool non_zero_complement = false; + auto [polynomial, active_range_endpoints] = + TestFixture::create_structured_test_polynomial(fixed_sizes, actual_sizes, non_zero_complement); + + // Commit to the polynomial using both the conventional commit method and the sparse commitment method + auto key = TestFixture::template create_commitment_key(polynomial.virtual_size()); + + G1 expected_result = key->commit(polynomial); + G1 result = key->commit_structured(polynomial, active_range_endpoints); + + EXPECT_EQ(result, expected_result); +} + +/** + * @brief Test the method for committing to structured polynomials with a constant nonzero complement (i.e. the + * permutation grand product polynomial z_perm in the structured trace setting). + * + */ +TYPED_TEST(CommitmentKeyTest, CommitStructuredNonzeroComplement) +{ + using Curve = TypeParam; + using CK = CommitmentKey; + using G1 = Curve::AffineElement; + + // Arbitrary but realistic block structure in the ivc setting (roughly 2^19 full size with 2^17 utilization) + std::vector fixed_sizes = { 1000, 4000, 180000, 90000, 9000, 137000, 72000, 4000, 2500, 11500 }; + std::vector actual_sizes = { 10, 16, 48873, 18209, 4132, 23556, 35443, 3, 2, 2 }; + + // Construct a random polynomial resembling z_perm in the structured trace setting + const bool non_zero_complement = true; + auto [polynomial, active_range_endpoints] = + TestFixture::create_structured_test_polynomial(fixed_sizes, actual_sizes, non_zero_complement); + + // Commit to the polynomial using both the conventional commit method and the sparse commitment method + auto key = TestFixture::template create_commitment_key(polynomial.virtual_size()); + + G1 expected_result = key->commit(polynomial); + G1 result = key->commit_structured_with_nonzero_complement(polynomial, active_range_endpoints); + + EXPECT_EQ(result, expected_result); +} + } // namespace bb diff --git a/barretenberg/cpp/src/barretenberg/ecc/batched_affine_addition/batched_affine_addition.cpp b/barretenberg/cpp/src/barretenberg/ecc/batched_affine_addition/batched_affine_addition.cpp new file mode 100644 index 000000000000..e00069a91a39 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/batched_affine_addition/batched_affine_addition.cpp @@ -0,0 +1,238 @@ +#include "barretenberg/ecc/batched_affine_addition/batched_affine_addition.hpp" +#include "barretenberg/common/zip_view.hpp" +#include +#include +#include + +namespace bb { + +template +std::vector::G1> BatchedAffineAddition::add_in_place( + const std::span& points, const std::vector& sequence_counts) +{
+ // Instantiate scratch space for point addition denominators and their calculation + std::vector scratch_space_vector(points.size()); + std::span scratch_space(scratch_space_vector); + + // Divide the work into groups of addition sequences to be reduced by each thread + auto [addition_sequences, sequence_tags] = construct_thread_data(points, sequence_counts, scratch_space); + + const size_t num_threads = addition_sequences.size(); + parallel_for(num_threads, [&](size_t thread_idx) { batched_affine_add_in_place(addition_sequences[thread_idx]); }); + + // Construct a vector of the reduced points, accounting for sequences that may have been split across threads + std::vector reduced_points; + size_t prev_tag = std::numeric_limits::max(); + for (auto [sequences, tags] : zip_view(addition_sequences, sequence_tags)) { + // Extract one reduced point per sub-sequence handled by this thread + for (size_t i = 0; i < sequences.sequence_counts.size(); ++i) { + if (tags[i] == prev_tag) { + reduced_points.back() = reduced_points.back() + sequences.points[i]; + } else { + reduced_points.emplace_back(sequences.points[i]); + } + prev_tag = tags[i]; + } + } + + return reduced_points; +} + +template +typename BatchedAffineAddition::ThreadData BatchedAffineAddition::construct_thread_data( + const std::span& points, const std::vector& sequence_counts, const std::span& scratch_space) +{ + // Compute the endpoints of the sequences within the points array from the sequence counts + std::vector sequence_endpoints; + size_t total_count = 0; + for (const auto& count : sequence_counts) { + total_count += count; + sequence_endpoints.emplace_back(total_count); + } + + if (points.size() != total_count) { + info("Number of input points does not match sequence counts!"); + ASSERT(false); + } + + // Determine the optimal number of threads for parallelization + const size_t MIN_POINTS_PER_THREAD = 1 << 14; // heuristic; anecdotally optimal for practical cases + const size_t total_num_points = points.size(); + const size_t optimal_threads = total_num_points / MIN_POINTS_PER_THREAD; + const size_t num_threads = std::max(1UL, std::min(get_num_cpus(), optimal_threads)); + // Distribute the work as evenly as possible across threads + const size_t base_thread_size = total_num_points / num_threads; + const size_t leftover_size = total_num_points % num_threads; + std::vector thread_sizes(num_threads, base_thread_size); + for (size_t i = 0; i < leftover_size; ++i) { + thread_sizes[i]++; + } + + // Construct the point spans for each thread according to the distribution determined above + std::vector> thread_points; + std::vector> thread_scratch_space; + std::vector thread_endpoints; + size_t point_index = 0; + for (auto size : thread_sizes) { + thread_points.push_back(points.subspan(point_index, size)); + thread_scratch_space.push_back(scratch_space.subspan(point_index, size)); + point_index += size; + thread_endpoints.emplace_back(point_index); + }
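// Aside: the endpoint-union step below is easiest to see on a small example. With 24 points, two threads of 12
// (thread endpoints {12, 24}) and sequence counts {4, 11, 9} (sequence endpoints {4, 15, 24}), the merged endpoints
// are {4, 12, 15, 24}: thread 0 owns chunks of sizes {4, 8} with tags {0, 1}, thread 1 owns chunks {3, 9} with tags
// {1, 2}, and the two tag-1 partial sums are recombined at the end. A standalone sketch of just the merge-and-tag
// logic (hypothetical helper, simplified from the member function below):
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

std::vector<std::pair<size_t, size_t>> chunk_sizes_with_tags(const std::vector<size_t>& thread_endpoints,
                                                             const std::vector<size_t>& sequence_endpoints)
{
    // Merge the two sorted endpoint sets and drop duplicates
    std::vector<size_t> all_endpoints = thread_endpoints;
    all_endpoints.insert(all_endpoints.end(), sequence_endpoints.begin(), sequence_endpoints.end());
    std::sort(all_endpoints.begin(), all_endpoints.end());
    all_endpoints.erase(std::unique(all_endpoints.begin(), all_endpoints.end()), all_endpoints.end());

    std::vector<std::pair<size_t, size_t>> chunks; // {chunk size, tag of the originating sequence}
    size_t prev_endpoint = 0;
    size_t sequence_idx = 0;
    for (size_t endpoint : all_endpoints) {
        chunks.emplace_back(endpoint - prev_endpoint, sequence_idx);
        if (endpoint == sequence_endpoints[sequence_idx]) {
            sequence_idx++; // crossed into the next addition sequence
        }
        prev_endpoint = endpoint;
    }
    return chunks; // e.g. {{4, 0}, {8, 1}, {3, 1}, {9, 2}} for the example above
}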
+ // Construct the union of the thread and sequence endpoints by combining, sorting, then removing duplicates. This is + // used to break the points into sequences for each thread while tracking tags so that sequences split across one or + // more threads can be properly reconstructed. + std::vector all_endpoints; + all_endpoints.reserve(thread_endpoints.size() + sequence_endpoints.size()); + all_endpoints.insert(all_endpoints.end(), thread_endpoints.begin(), thread_endpoints.end()); + all_endpoints.insert(all_endpoints.end(), sequence_endpoints.begin(), sequence_endpoints.end()); + std::sort(all_endpoints.begin(), all_endpoints.end()); + auto last = std::unique(all_endpoints.begin(), all_endpoints.end()); + all_endpoints.erase(last, all_endpoints.end()); + + // Construct sequence counts and tags for each thread using the set of all thread and sequence endpoints + size_t prev_endpoint = 0; + size_t thread_idx = 0; + size_t sequence_idx = 0; + std::vector> thread_sequence_counts(num_threads); + std::vector> thread_sequence_tags(num_threads); + for (auto& endpoint : all_endpoints) { + size_t chunk_size = endpoint - prev_endpoint; + thread_sequence_counts[thread_idx].emplace_back(chunk_size); + thread_sequence_tags[thread_idx].emplace_back(sequence_idx); + if (endpoint == thread_endpoints[thread_idx]) { + thread_idx++; + } + if (endpoint == sequence_endpoints[sequence_idx]) { + sequence_idx++; + } + prev_endpoint = endpoint; + } + + if (thread_sequence_counts.size() != thread_points.size()) { + info("Mismatch in sequence count construction!"); + ASSERT(false); + } + + // Construct the addition sequences for each thread + std::vector addition_sequences; + for (size_t i = 0; i < num_threads; ++i) { + addition_sequences.emplace_back(thread_sequence_counts[i], thread_points[i], thread_scratch_space[i]); + } + + return { addition_sequences, thread_sequence_tags }; +} + +template +std::span::Fq> BatchedAffineAddition< + Curve>::batch_compute_point_addition_slope_inverses(const AdditionSequences& add_sequences) +{ + auto points = add_sequences.points; + auto sequence_counts = add_sequences.sequence_counts; + + // Count the total number of point pairs to be added across all addition sequences + size_t total_num_pairs{ 0 }; + for (auto& count : sequence_counts) { + total_num_pairs += count >> 1; + } + + // Define scratch space for batched inverse computations and eventual storage of denominators + ASSERT(add_sequences.scratch_space.size() >= 2 * total_num_pairs); + std::span denominators = add_sequences.scratch_space.subspan(0, total_num_pairs); + std::span differences = add_sequences.scratch_space.subspan(total_num_pairs, total_num_pairs); + + // Compute and store successive products of differences (x_2 - x_1) + Fq accumulator = 1; + size_t point_idx = 0; + size_t pair_idx = 0; + for (auto& count : sequence_counts) { + const auto num_pairs = count >> 1; + for (size_t j = 0; j < num_pairs; ++j) { + ASSERT(pair_idx < total_num_pairs); + const auto& x1 = points[point_idx++].x; + const auto& x2 = points[point_idx++].x; + + // It is assumed that the input points are random and thus with high probability do not share an x-coordinate + ASSERT(x1 != x2); + + auto diff = x2 - x1; + differences[pair_idx] = diff; + + // Store and update the running product of differences at each stage + denominators[pair_idx++] = accumulator; + accumulator *= diff; + } + // If number of points in the sequence is odd, we skip the last one since it has no pair + point_idx += (count & 0x01ULL); + } + + // Invert the full product of differences + Fq inverse = accumulator.invert(); + + // Compute the individual point-pair addition denominators 1/(x2 - x1) + for (size_t i = 0; i < total_num_pairs; ++i) { + size_t idx = total_num_pairs - 1 - i; + denominators[idx] *= inverse; + inverse *= differences[idx]; + } + + return
denominators; } + +template +void BatchedAffineAddition::batched_affine_add_in_place(AdditionSequences add_sequences) +{ + const size_t num_points = add_sequences.points.size(); + if (num_points == 0 || num_points == 1) { // nothing to do + return; + } + + // Batch compute terms of the form 1/(x2 - x1) for each pair to be added in this round + std::span denominators = batch_compute_point_addition_slope_inverses(add_sequences); + + auto points = add_sequences.points; + auto sequence_counts = add_sequences.sequence_counts; + + // Compute pairwise in-place additions for all sequences with more than 1 point + size_t point_idx = 0; // index for points to be summed + size_t result_point_idx = 0; // index for result points + size_t pair_idx = 0; // index into array of denominators for each pair + bool more_additions = false; + for (auto& count : sequence_counts) { + const auto num_pairs = count >> 1; + const bool overflow = static_cast(count & 0x01ULL); + // Compute the sum of all pairs in the sequence and store the result in the same points array + for (size_t j = 0; j < num_pairs; ++j) { + const auto& point_1 = points[point_idx++]; // first summand + const auto& point_2 = points[point_idx++]; // second summand + const auto& denominator = denominators[pair_idx++]; // denominator needed in add formula + auto& result = points[result_point_idx++]; // target for addition result + + result = affine_add_with_denominator(point_1, point_2, denominator); + } + // If the sequence had an odd number of points, simply carry the unpaired point over to the next round + if (overflow) { + points[result_point_idx++] = points[point_idx++]; + } + + // Update the sequence counts in place for the next round + const uint32_t updated_sequence_count = static_cast(num_pairs) + static_cast(overflow); + count = updated_sequence_count; + + // More additions are required if any sequence has not yet been reduced to a single point + more_additions = more_additions || updated_sequence_count > 1; + } + + // Recursively perform pairwise additions until all sequences have been reduced to a single point + if (more_additions) { + const size_t updated_point_count = result_point_idx; + std::span updated_points(&points[0], updated_point_count); + return batched_affine_add_in_place( + AdditionSequences{ sequence_counts, updated_points, add_sequences.scratch_space }); + } +} + +template class BatchedAffineAddition; +template class BatchedAffineAddition; +} // namespace bb
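// Aside: the denominator computation above is the standard Montgomery batch-inversion trick, trading n field
// inversions for one inversion plus O(n) multiplications via a forward pass of running products and a backward
// unwinding pass. A self-contained sketch of the idea, assuming only a field type with operator* and invert() as in
// barretenberg (this is the pattern batch_compute_point_addition_slope_inverses follows, not its exact code):
#include <cstddef>
#include <vector>

template <typename Fq>
std::vector<Fq> batch_invert(const std::vector<Fq>& values)
{
    std::vector<Fq> results(values.size());
    Fq accumulator = 1;
    // Forward pass: results[i] holds the product values[0] * ... * values[i-1]
    for (size_t i = 0; i < values.size(); ++i) {
        results[i] = accumulator;
        accumulator *= values[i];
    }
    Fq inverse = accumulator.invert(); // the single field inversion
    // Backward pass: peel off one factor at a time so that results[i] becomes 1 / values[i]
    for (size_t i = values.size(); i-- > 0;) {
        results[i] *= inverse;
        inverse *= values[i];
    }
    return results;
}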
diff --git a/barretenberg/cpp/src/barretenberg/ecc/batched_affine_addition/batched_affine_addition.hpp b/barretenberg/cpp/src/barretenberg/ecc/batched_affine_addition/batched_affine_addition.hpp new file mode 100644 index 000000000000..22aa50869818 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/batched_affine_addition/batched_affine_addition.hpp @@ -0,0 +1,124 @@ +#pragma once + +#include "barretenberg/ecc/curves/bn254/bn254.hpp" +#include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp" +#include +#include + +namespace bb { + +/** + * @brief Class for handling fast batched affine addition of large sets of EC points + * @details Useful for pre-reducing the SRS points via summation for commitments to polynomials with large ranges of + * constant coefficients. + * + * @tparam Curve + */ +template class BatchedAffineAddition { + using G1 = typename Curve::AffineElement; + using Fr = typename Curve::ScalarField; + using Fq = typename Curve::BaseField; + + // Struct describing a set of points to be reduced to num-sequence-counts-many points via summation of each sequence + struct AdditionSequences { + std::vector sequence_counts; + std::span points; + std::span scratch_space; + }; + + // Collection of addition sequences to be handled by each thread + struct ThreadData { + std::vector addition_sequences; + std::vector> sequence_tags; // allows for the recombining of sequences split across threads + }; + + public: + /** + * @brief Given a set of points and sequence counts, perform addition to reduce each sequence to a single point + * @details Reduce each sequence to a single point via repeated rounds of pairwise addition. (If the length + * of the sequence is odd in a given round, the unpaired point is simply carried over to the next round). The + * inverses needed in the addition formula are batch computed in a single go for all additions to be performed + * across all sequences in a given round. + * @note: Multithreading is achieved by evenly distributing the points across the optimal number of available + * threads. This can result in the bisecting of some sequences, which is accounted for in the final result by further + * summing the reduced points that resulted from a sequence split across two or more threads. An example with two + * threads and three add sequences: + * + * |------------------------| Points + * |------------|-----------| Thread boundaries + * |---|-----------|--------| Addition sequence boundaries + * |---|--------|---|-------| New addition sequence boundaries + * | 0 | 1 | 1 | 2 | Tags + * + * Each thread receives two add sequences and reduces them to two points. The resulting four points are further + * reduced to three by summing points that share a sequence tag. + * + * @param points Set of points to be reduced in place to num-sequences many points + * @param sequence_counts lengths of the individual sequences to be summed (assumed contiguous in points) + * @return std::vector the set of reduced points in contiguous memory + */ + static std::vector add_in_place(const std::span& points, const std::vector& sequence_counts); + + private: + /** + * @brief Construct the set of AdditionSequences to be handled by each thread + * @details To optimize thread utilization, points are distributed evenly across the number of available threads. + * This may in general result in the splitting of individual addition sequences across two or more threads. This is + * accounted for by assigning a tag to each sequence so that the results can be further combined post-facto to + * ensure that the final number of points corresponds to the number of addition sequences. + * + * @param points + * @param sequence_counts + * @param scratch_space Space for computing and storing the point addition slope denominators + * @return ThreadData + */ + static ThreadData construct_thread_data(const std::span& points, + const std::vector& sequence_counts, + const std::span& scratch_space); + + /** + * @brief Batch compute inverses needed for a set of affine point addition sequences + * @details Addition of points P_1, P_2 requires computation of a term of the form 1/(P_2.x - P_1.x). For + * efficiency, these terms are computed all at once for a full set of addition sequences using batch inversion.
+ * + * @tparam Curve + * @param add_sequences + */ + static std::span batch_compute_point_addition_slope_inverses(const AdditionSequences& add_sequences); + + /** + * @brief Internal method for in-place summation of a single set of addition sequences + * + * @tparam Curve + * @param addition_sequences Set of points and counts indicating number of points in each addition chain + */ + static void batched_affine_add_in_place(AdditionSequences add_sequences); + + /** + * @brief Add two affine elements with the inverse in the slope term \lambda provided as input + * @details The sum of two points (x1, y1), (x2, y2) is given by x3 = \lambda^2 - x1 - x2, y3 = \lambda*(x1 - x3) - + * y1, where \lambda = (y2 - y1)/(x2 - x1). When performing many additions at once, it is more efficient to batch + * compute the inverse component of \lambda for each pair of points. This gives rise to the need for a method like + * this one. + * + * @tparam Curve + * @param point_1 (x1, y1) + * @param point_2 (x2, y2) + * @param denominator 1/(x2 - x1) + * @return Curve::AffineElement + */ + static inline G1 affine_add_with_denominator(const G1& point_1, const G1& point_2, const Fq& denominator) + { + const auto& x1 = point_1.x; + const auto& y1 = point_1.y; + const auto& x2 = point_2.x; + const auto& y2 = point_2.y; + + const Fq lambda = denominator * (y2 - y1); + Fq x3 = lambda.sqr() - x2 - x1; + Fq y3 = lambda * (x1 - x3) - y1; + return { x3, y3 }; + } +}; + +} // namespace bb diff --git a/barretenberg/cpp/src/barretenberg/ecc/batched_affine_addition/batched_affine_addition.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/batched_affine_addition/batched_affine_addition.test.cpp new file mode 100644 index 000000000000..5f8ac4e81764 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/batched_affine_addition/batched_affine_addition.test.cpp @@ -0,0 +1,68 @@ +#include "barretenberg/ecc/batched_affine_addition/batched_affine_addition.hpp" +#include "barretenberg/common/mem.hpp" +#include "barretenberg/common/test.hpp" +#include "barretenberg/common/zip_view.hpp" +#include "barretenberg/numeric/random/engine.hpp" +#include "barretenberg/srs/factories/file_crs_factory.hpp" +#include "barretenberg/srs/io.hpp" + +#include +#include + +namespace bb { + +namespace { +auto& engine = numeric::get_debug_randomness(); +} + +template class BatchedAffineAdditionTests : public ::testing::Test { + + public: + using G1 = typename Curve::AffineElement; + using Fr = typename Curve::ScalarField; +}; + +using Curves = ::testing::Types; + +TYPED_TEST_SUITE(BatchedAffineAdditionTests, Curves); + +// Test the method for summing one or more large sequences of affine EC points in place +TYPED_TEST(BatchedAffineAdditionTests, Reduce) +{ + using Curve = TypeParam; + using G1 = Curve::AffineElement; + using BatchedAddition = BatchedAffineAddition; + + // Construct a single array of random points containing 5 sequences to be summed + const size_t num_sequences = 5; + const size_t sequence_size = 1 << 10; + const size_t input_size = num_sequences * sequence_size; + std::vector sequence_counts(num_sequences, sequence_size); + + // Construct the array of random input points + std::vector points; + points.reserve(input_size); + for (size_t i = 0; i < input_size; ++i) { + points.emplace_back(G1::random_element()); + } + + // Manually sum the points in each sequence to get the expected num-sequences-many reduced points + std::vector expected_reduced_points; + size_t point_idx = 0; + for (size_t i = 0; i < num_sequences; ++i) { + G1 sum =
G1::infinity(); + for (size_t j = 0; j < sequence_size; ++j) { + sum = sum + points[point_idx++]; + } + expected_reduced_points.push_back(sum); + } + + // Reduce the points using the optimized method + auto reduced_points = BatchedAddition::add_in_place(points, sequence_counts); + + // Check agreement of the reduced points + for (auto [result, expected] : zip_view(reduced_points, expected_reduced_points)) { + EXPECT_EQ(result, expected); + } +} +} // namespace bb \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/sorted_msm.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/sorted_msm.hpp index 5022cfd3836e..b131d3678551 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/sorted_msm.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/sorted_msm.hpp @@ -8,6 +8,11 @@ namespace bb { +// TODO(https://github.com/AztecProtocol/barretenberg/issues/1130): This class is largely superseded by the newer +// BatchedAffineAddition class, which contains improved batched affine addition logic (but not the sorting logic). The +// sort-then-add-then-mul strategy of this class does not have an obvious use case in our current protocols. This class +// could be updated to use the BatchedAffineAddition class to remove duplication, or it could be removed altogether. + /** * @brief Reduce MSM inputs such that the set of scalars contains no duplicates by summing points which share a scalar. * diff --git a/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.cpp b/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.cpp index 82e9def06ae9..0459f89faa5d 100644 --- a/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.cpp +++ b/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.cpp @@ -105,6 +105,11 @@ typename ExecutionTrace_::TraceData ExecutionTrace_::construct_t for (auto& block : builder.blocks.get()) { auto block_size = static_cast(block.size()); + // Save ranges over which the blocks are "active" for use in structured commitments + if constexpr (IsHonkFlavor) { + proving_key.active_block_ranges.emplace_back(offset, offset + block.size()); + } + // Update wire polynomials and copy cycles // NB: The order of row/column loops is arbitrary but needs to be row/column to match old copy_cycle code { diff --git a/barretenberg/cpp/src/barretenberg/flavor/flavor.hpp b/barretenberg/cpp/src/barretenberg/flavor/flavor.hpp index dba48dd60765..235468ee6cfa 100644 --- a/barretenberg/cpp/src/barretenberg/flavor/flavor.hpp +++ b/barretenberg/cpp/src/barretenberg/flavor/flavor.hpp @@ -123,6 +123,9 @@ template class ProvingKey_ { // folded element by element.
std::vector public_inputs; + // Ranges over which the execution trace is "active" + std::vector> active_block_ranges; + ProvingKey_() = default; ProvingKey_(const size_t dyadic_circuit_size, const size_t num_public_inputs, diff --git a/barretenberg/cpp/src/barretenberg/polynomials/polynomial.hpp b/barretenberg/cpp/src/barretenberg/polynomials/polynomial.hpp index ec9c14d40845..e372f0f6dea3 100644 --- a/barretenberg/cpp/src/barretenberg/polynomials/polynomial.hpp +++ b/barretenberg/cpp/src/barretenberg/polynomials/polynomial.hpp @@ -21,8 +21,16 @@ template struct PolynomialSpan { size_t end_index() const { return start_index + size(); } Fr* data() { return span.data(); } size_t size() const { return span.size(); } - Fr& operator[](size_t index) { return span[index - start_index]; } - const Fr& operator[](size_t index) const { return span[index - start_index]; } + Fr& operator[](size_t index) + { + ASSERT(index >= start_index && index < end_index()); + return span[index - start_index]; + } + const Fr& operator[](size_t index) const + { + ASSERT(index >= start_index && index < end_index()); + return span[index - start_index]; + } }; /** diff --git a/barretenberg/cpp/src/barretenberg/transcript/origin_tag.hpp b/barretenberg/cpp/src/barretenberg/transcript/origin_tag.hpp index a0d8b5f30f85..da12a1b35c79 100644 --- a/barretenberg/cpp/src/barretenberg/transcript/origin_tag.hpp +++ b/barretenberg/cpp/src/barretenberg/transcript/origin_tag.hpp @@ -14,7 +14,7 @@ #define STANDARD_TESTING_TAGS /*Tags reused in tests*/ \ const size_t parent_id = 0; \ - const auto clear_tag = OriginTag(); \ + [[maybe_unused]] const auto clear_tag = OriginTag(); \ const auto submitted_value_origin_tag = OriginTag( \ parent_id, /*round_id=*/0, /*is_submitted=*/true); /*A tag describing a value submitted in the 0th round*/ \ const auto next_submitted_value_origin_tag = OriginTag( \ diff --git a/barretenberg/cpp/src/barretenberg/ultra_honk/decider_proving_key.hpp b/barretenberg/cpp/src/barretenberg/ultra_honk/decider_proving_key.hpp index 25762158e612..e2054e2a5400 100644 --- a/barretenberg/cpp/src/barretenberg/ultra_honk/decider_proving_key.hpp +++ b/barretenberg/cpp/src/barretenberg/ultra_honk/decider_proving_key.hpp @@ -32,6 +32,9 @@ template class DeciderProvingKey_ { using Trace = ExecutionTrace_; + // Flag indicating whether the polynomials will be constructed with fixed block sizes for each gate type + bool is_structured; + public: ProvingKey proving_key; @@ -45,15 +48,13 @@ template class DeciderProvingKey_ { DeciderProvingKey_(Circuit& circuit, TraceStructure trace_structure = TraceStructure::NONE, std::shared_ptr commitment_key = nullptr) + : is_structured(trace_structure != TraceStructure::NONE) { BB_OP_COUNT_TIME_NAME("DeciderProvingKey(Circuit&)"); vinfo("creating decider proving key"); circuit.finalize_circuit(/*ensure_nonzero=*/true); - // Set flag indicating whether the polynomials will be constructed with fixed block sizes for each gate type - const bool is_structured = (trace_structure != TraceStructure::NONE); - // If using a structured trace, set fixed block sizes, check their validity, and set the dyadic circuit size if (is_structured) { circuit.blocks.set_fixed_block_sizes(trace_structure); // set the fixed sizes for each block @@ -314,6 +315,8 @@ template class DeciderProvingKey_ { DeciderProvingKey_() = default; ~DeciderProvingKey_() = default; + bool get_is_structured() { return is_structured; } + private: static constexpr size_t num_zero_rows = Flavor::has_zero_row ? 
1 : 0; static constexpr size_t NUM_WIRES = Circuit::NUM_WIRES; diff --git a/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp b/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp index 71cb8f8426af..092d2dfc73df 100644 --- a/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp +++ b/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp @@ -101,12 +101,21 @@ template void OinkProver::execute_wire_commitment // We only commit to the fourth wire polynomial after adding memory records { BB_OP_COUNT_TIME_NAME("COMMIT::wires"); - witness_commitments.w_l = - proving_key->proving_key.commitment_key->commit(proving_key->proving_key.polynomials.w_l); - witness_commitments.w_r = - proving_key->proving_key.commitment_key->commit(proving_key->proving_key.polynomials.w_r); - witness_commitments.w_o = - proving_key->proving_key.commitment_key->commit(proving_key->proving_key.polynomials.w_o); + if (proving_key->get_is_structured()) { + witness_commitments.w_l = proving_key->proving_key.commitment_key->commit_structured( + proving_key->proving_key.polynomials.w_l, proving_key->proving_key.active_block_ranges); + witness_commitments.w_r = proving_key->proving_key.commitment_key->commit_structured( + proving_key->proving_key.polynomials.w_r, proving_key->proving_key.active_block_ranges); + witness_commitments.w_o = proving_key->proving_key.commitment_key->commit_structured( + proving_key->proving_key.polynomials.w_o, proving_key->proving_key.active_block_ranges); + } else { + witness_commitments.w_l = + proving_key->proving_key.commitment_key->commit(proving_key->proving_key.polynomials.w_l); + witness_commitments.w_r = + proving_key->proving_key.commitment_key->commit(proving_key->proving_key.polynomials.w_r); + witness_commitments.w_o = + proving_key->proving_key.commitment_key->commit(proving_key->proving_key.polynomials.w_o); + } } auto wire_comms = witness_commitments.get_wires(); @@ -168,8 +177,13 @@ template void OinkProver::execute_sorted_list_acc } { BB_OP_COUNT_TIME_NAME("COMMIT::wires"); - witness_commitments.w_4 = - proving_key->proving_key.commitment_key->commit(proving_key->proving_key.polynomials.w_4); + if (proving_key->get_is_structured()) { + witness_commitments.w_4 = proving_key->proving_key.commitment_key->commit_structured( + proving_key->proving_key.polynomials.w_4, proving_key->proving_key.active_block_ranges); + } else { + witness_commitments.w_4 = + proving_key->proving_key.commitment_key->commit(proving_key->proving_key.polynomials.w_4); + } } transcript->send_to_verifier(domain_separator + commitment_labels.lookup_read_counts, @@ -228,8 +242,14 @@ template void OinkProver::execute_grand_product_c { BB_OP_COUNT_TIME_NAME("COMMIT::z_perm"); - witness_commitments.z_perm = - proving_key->proving_key.commitment_key->commit(proving_key->proving_key.polynomials.z_perm); + if (proving_key->get_is_structured()) { + witness_commitments.z_perm = + proving_key->proving_key.commitment_key->commit_structured_with_nonzero_complement( + proving_key->proving_key.polynomials.z_perm, proving_key->proving_key.active_block_ranges); + } else { + witness_commitments.z_perm = + proving_key->proving_key.commitment_key->commit(proving_key->proving_key.polynomials.z_perm); + } } transcript->send_to_verifier(domain_separator + commitment_labels.z_perm, witness_commitments.z_perm); }
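A closing note on why the z_perm dispatch above is sound: the polynomial is constant on each complement range, so the commitment splits cleanly by region,

\[
\mathrm{commit}(p) \;=\; \underbrace{\sum_{i \in \mathrm{active}} p_i \, G_i}_{\texttt{commit\_structured}} \;+\; \sum_{r \in \mathrm{complement}} c_r \Big( \sum_{j \in r} G_j \Big),
\]

where \(c_r\) is the constant value of \(p\) on complement range \(r\). The inner sums \(\sum_{j \in r} G_j\) are exactly the pre-reduced points produced by BatchedAffineAddition::add_in_place, so each constant region costs a single scalar multiplication rather than a full MSM over the range.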