Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(bb): IPA parallelization cleanup #8088

Merged
merged 12 commits into from
Aug 20, 2024
56 changes: 18 additions & 38 deletions barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
#include "barretenberg/commitment_schemes/claim.hpp"
#include "barretenberg/commitment_schemes/verification_key.hpp"
#include "barretenberg/common/assert.hpp"
#include "barretenberg/common/container.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"
#include "barretenberg/transcript/transcript.hpp"
#include <cstddef>
#include <numeric>
#include <string>
#include <utility>
#include <vector>

namespace bb {
Expand Down Expand Up @@ -168,10 +170,8 @@ template <typename Curve_> class IPA {
// G_vec_local should use only the original SRS thus we extract only the even indices.
parallel_for_heuristic(
poly_length,
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
[&](size_t i) {
G_vec_local[i] = srs_elements[i * 2];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lol

}, thread_heuristics::FF_COPY_COST);

// Step 5.
Expand All @@ -196,38 +196,22 @@ template <typename Curve_> class IPA {
GroupElement R_i;
std::size_t round_size = poly_length;

#ifndef NO_MULTITHREADING
// The inner products we'll be computing in parallel need a mutex to be thread-safe during the last
// accumulation
std::mutex inner_product_accumulation_mutex;
#endif
// Step 6.
// Perform IPA reduction rounds
for (size_t i = 0; i < log_poly_degree; i++) {
round_size >>= 1;
// Compute inner_prod_L := < a_vec_lo, b_vec_hi > and inner_prod_R := < a_vec_hi, b_vec_lo >
Fr inner_prod_L = Fr::zero();
Fr inner_prod_R = Fr::zero();
round_size /= 2;
// Run scalar products in parallel
parallel_for_heuristic(
auto inner_prods = parallel_for_heuristic(
round_size,
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
Fr current_inner_prod_L = Fr::zero();
Fr current_inner_prod_R = Fr::zero();
for (size_t j = start; j < end; j++) {
current_inner_prod_L += a_vec[j] * b_vec[round_size + j];
current_inner_prod_R += a_vec[round_size + j] * b_vec[j];
}
// Update the accumulated results thread-safely
{
#ifndef NO_MULTITHREADING
std::unique_lock<std::mutex> lock(inner_product_accumulation_mutex);
#endif
inner_prod_L += current_inner_prod_L;
inner_prod_R += current_inner_prod_R;
}
std::pair{Fr::zero(), Fr::zero()},
[&](size_t j, std::pair<Fr, Fr>& inner_prod_left_right) {
// Compute inner_prod_L := < a_vec_lo, b_vec_hi >
inner_prod_left_right.first += a_vec[j] * b_vec[round_size + j];
// Compute inner_prod_R := < a_vec_hi, b_vec_lo >
inner_prod_left_right.second += a_vec[round_size + j] * b_vec[j];
}, thread_heuristics::FF_ADDITION_COST * 2 + thread_heuristics::FF_MULTIPLICATION_COST * 2);

// Sum inner product contributions computed in parallel and unpack the std::pair
auto [inner_prod_L, inner_prod_R] = sum_pairs(inner_prods);
// Step 6.a (using letters, because doxygen automatically converts the sublist counters to letters :( )
// L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
L_i = bb::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
Expand Down Expand Up @@ -376,23 +360,21 @@ template <typename Curve_> class IPA {

// Step 7.
// Construct vector s
std::vector<Fr> s_vec(poly_length);
std::vector<Fr> s_vec(poly_length, Fr::one());

// TODO(https://github.com/AztecProtocol/barretenberg/issues/857): This code is not efficient as it's
// O(n log n). This can be optimized to be linear by computing a tree of products. It's very readable, so we're
// leaving it unoptimized for now.
parallel_for_heuristic(
poly_length,
[&](size_t i) {
Fr s_vec_scalar = Fr::one();
for (size_t j = (log_poly_degree - 1); j != static_cast<size_t>(-1); j--) {
auto bit = (i >> j) & 1;
bool b = static_cast<bool>(bit);
if (b) {
s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
s_vec[i] *= round_challenges_inv[log_poly_degree - 1 - j];
}
}
s_vec[i] = s_vec_scalar;
}, thread_heuristics::FF_MULTIPLICATION_COST * log_poly_degree);

auto* srs_elements = vk->get_monomial_points();
Expand All @@ -405,10 +387,8 @@ template <typename Curve_> class IPA {
// G_vec_local should use only the original SRS thus we extract only the even indices.
parallel_for_heuristic(
poly_length,
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
[&](size_t i) {
G_vec_local[i] = srs_elements[i * 2];
}, thread_heuristics::FF_COPY_COST * 2);

// Step 8.
Expand Down
2 changes: 2 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,6 @@
#define BB_UNLIKELY(x) x
#endif

// Opinionated feature: functionally equivalent to [[maybe_unused]] but clearly
// marks things DEFINITELY unused. Aims to be more readable, at the tradeoff of being a custom thingy.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thx

#define BB_UNUSED [[maybe_unused]]
29 changes: 28 additions & 1 deletion barretenberg/cpp/src/barretenberg/common/container.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include <string>
#include <vector>

namespace bb {

template <typename C> C slice(C const& container, size_t start)
{
auto b = container.begin();
Expand Down Expand Up @@ -61,4 +63,29 @@ template <typename T> int64_t index_of(std::vector<T> const& vec, T const& item)
auto const& itr = std::find(begin, end, item);

return itr == end ? -1 : std::distance(begin, itr);
}
}

/**
 * @brief Adds up every element of a container sequentially on the calling thread.
 *
 * Meant for small containers: no threading is used.
 *
 * @param in Container whose elements are folded together with operator+=.
 * @return A value-initialized Inner with every element of `in` added to it, in iteration order.
 */
template <template <typename, typename...> typename Cont, typename Inner, typename... Args>
Inner sum(Cont<Inner, Args...> const& in)
{
    Inner total{};
    for (auto it = in.begin(); it != in.end(); ++it) {
        total += *it;
    }
    return total;
}

/**
 * @brief Component-wise sum of a container of pairs, computed sequentially on the calling thread.
 *
 * Meant for small containers: no threading is used.
 *
 * @param in Container of std::pair<Left, Right> values.
 * @return A pair whose `first` is the operator+= fold of all `first` members and whose `second` is
 *         the operator+= fold of all `second` members, both starting from value-initialized values.
 */
template <template <typename, typename...> typename Cont, typename Left, typename Right, typename... Args>
std::pair<Left, Right> sum_pairs(Cont<std::pair<Left, Right>, Args...> const& in)
{
    std::pair<Left, Right> totals{ Left{}, Right{} };
    for (const auto& [left_part, right_part] : in) {
        totals.first += left_part;
        totals.second += right_part;
    }
    return totals;
}

} // namespace bb
28 changes: 28 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/thread.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,32 @@ void parallel_for_heuristic(size_t num_points, const Func& func, size_t heuristi
heuristic_cost);
}

/**
 * @brief Overload of parallel_for_heuristic for loops that accumulate a result.
 *
 * A vector holding get_num_cpus() copies of `initial_accum` is allocated up front. The work is then
 * delegated to the chunked parallel_for_heuristic overload; for every index i of a chunk,
 * `func(i, accumulators[chunk_index])` is invoked, so a chunk only writes to the slot selected by its
 * own chunk index. (Presumably chunk indices are unique per concurrently running chunk and smaller than
 * get_num_cpus() - confirm against the chunked overload.)
 *
 * The per-chunk partial results are returned un-combined; sum() or sum_pairs() in container.hpp offer
 * an easy way to fold them into a final value.
 *
 * @param num_points Number of indices to iterate over.
 * @param initial_accum Value every per-chunk accumulator starts from.
 * @param func Callable invoked as func(index, accumulator&).
 * @param heuristic_cost Cost estimate forwarded unchanged to the chunked overload.
 * @return The vector of per-chunk accumulators.
 */
template <typename Func, typename Accum>
    requires std::invocable<Func, std::size_t, Accum&>
std::vector<Accum> parallel_for_heuristic(size_t num_points,
                                          const Accum& initial_accum,
                                          const Func& func,
                                          size_t heuristic_cost)
{
    // One slot per chunk so that no two chunks share an accumulator.
    std::vector<Accum> per_chunk_accums(get_num_cpus(), initial_accum);

    // Runs func over a single chunk, feeding it that chunk's own accumulator.
    auto process_chunk = [&](size_t chunk_begin, size_t chunk_end, size_t chunk_index) {
        size_t idx = chunk_begin;
        while (idx < chunk_end) {
            func(idx, per_chunk_accums[chunk_index]);
            idx++;
        }
    };

    parallel_for_heuristic(num_points, process_chunk, heuristic_cost);
    return per_chunk_accums;
}

const size_t DEFAULT_MIN_ITERS_PER_THREAD = 1 << 4;

/**
Expand Down Expand Up @@ -100,6 +126,8 @@ constexpr size_t GE_DOUBLING_COST = 194;
constexpr size_t SM_COST = 50000;
// Field element (16 byte) sequential copy number
constexpr size_t FF_COPY_COST = 3;
// Fine default if something looks 'chunky enough that I don't want to calculate'
constexpr size_t ALWAYS_MULTITHREAD = 100000;
} // namespace thread_heuristics

} // namespace bb
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#pragma once
#include "barretenberg/common/container.hpp"
#include "barretenberg/common/op_count.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/flavor/flavor.hpp"
#include "barretenberg/ultra_honk/oink_prover.hpp"
#include "protogalaxy_prover.hpp"
Expand All @@ -10,16 +13,13 @@ std::vector<typename ProtoGalaxyProver_<ProverInstances_>::FF> ProtoGalaxyProver
const RelationSeparator& alpha,
const RelationParameters<FF>& relation_parameters)
{
BB_OP_COUNT_TIME_NAME("ProtoGalaxyProver_::compute_full_honk_evaluations");
auto instance_size = instance_polynomials.get_polynomial_size();
std::vector<FF> full_honk_evaluations(instance_size);
std::vector<FF> linearly_dependent_contributions(instance_size);
#ifndef NO_MULTITHREADING
std::mutex evaluation_mutex;
#endif
auto linearly_dependent_contribution_accumulator = FF(0);
parallel_for_range(instance_size, [&](size_t start_row, size_t end_row) {
auto thread_accumulator = FF(0);
for (size_t row = start_row; row < end_row; row++) {
std::vector<FF> linearly_dependent_contribution_accumulators = parallel_for_heuristic(
instance_size,
/*accumulator default*/ FF(0),
[&](size_t row, FF& linearly_dependent_contribution_accumulator) {
auto row_evaluations = instance_polynomials.get_row(row);
RelationEvaluations relation_evaluations;
Utils::zero_elements(relation_evaluations);
Expand All @@ -29,19 +29,13 @@ std::vector<typename ProtoGalaxyProver_<ProverInstances_>::FF> ProtoGalaxyProver

auto output = FF(0);
auto running_challenge = FF(1);
auto linearly_dependent_contribution = FF(0);
Utils::scale_and_batch_elements(
relation_evaluations, alpha, running_challenge, output, linearly_dependent_contribution);
thread_accumulator += linearly_dependent_contribution;
relation_evaluations, alpha, running_challenge, output, linearly_dependent_contribution_accumulator);

full_honk_evaluations[row] = output;
}
#ifndef NO_MULTITHREADING
std::unique_lock<std::mutex> evaluation_lock(evaluation_mutex);
#endif
linearly_dependent_contribution_accumulator += thread_accumulator;
});
full_honk_evaluations[0] += linearly_dependent_contribution_accumulator;
},
thread_heuristics::ALWAYS_MULTITHREAD);
full_honk_evaluations[0] += sum(linearly_dependent_contribution_accumulators);
return full_honk_evaluations;
}

Expand All @@ -59,20 +53,18 @@ std::vector<typename ProtoGalaxyProver_<ProverInstances_>::FF> ProtoGalaxyProver

auto degree = level + 1;
auto prev_level_width = prev_level_coeffs.size();
std::vector<std::vector<FF>> level_coeffs(prev_level_width >> 1, std::vector<FF>(degree + 1, 0));
parallel_for_range(
prev_level_width >> 1,
[&](size_t start, size_t end) {
for (size_t node = start << 1; node < end << 1; node += 2) {
auto parent = node >> 1;
std::copy(prev_level_coeffs[node].begin(), prev_level_coeffs[node].end(), level_coeffs[parent].begin());
for (size_t d = 0; d < degree; d++) {
level_coeffs[parent][d] += prev_level_coeffs[node + 1][d] * betas[level];
level_coeffs[parent][d + 1] += prev_level_coeffs[node + 1][d] * deltas[level];
}
std::vector<std::vector<FF>> level_coeffs(prev_level_width / 2, std::vector<FF>(degree + 1, 0));
parallel_for_heuristic(
prev_level_width / 2,
[&](size_t parent) {
size_t node = parent * 2;
std::copy(prev_level_coeffs[node].begin(), prev_level_coeffs[node].end(), level_coeffs[parent].begin());
for (size_t d = 0; d < degree; d++) {
level_coeffs[parent][d] += prev_level_coeffs[node + 1][d] * betas[level];
level_coeffs[parent][d + 1] += prev_level_coeffs[node + 1][d] * deltas[level];
}
},
/*no_multhreading_if_less_or_equal=*/8);
/* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * degree * 3);
return construct_coefficients_tree(betas, deltas, level_coeffs, level + 1);
}

Expand All @@ -84,14 +76,15 @@ std::vector<typename ProtoGalaxyProver_<ProverInstances_>::FF> ProtoGalaxyProver
const std::vector<FF>& full_honk_evaluations)
{
auto width = full_honk_evaluations.size();
std::vector<std::vector<FF>> first_level_coeffs(width >> 1, std::vector<FF>(2, 0));
parallel_for_range(width >> 1, [&](size_t start, size_t end) {
for (size_t node = start << 1; node < end << 1; node += 2) {
auto parent = node >> 1;
std::vector<std::vector<FF>> first_level_coeffs(width / 2, std::vector<FF>(2, 0));
parallel_for_heuristic(
width / 2,
[&](size_t parent) {
size_t node = parent * 2;
first_level_coeffs[parent][0] = full_honk_evaluations[node] + full_honk_evaluations[node + 1] * betas[0];
first_level_coeffs[parent][1] = full_honk_evaluations[node + 1] * deltas[0];
}
});
},
/* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * 3);
return construct_coefficients_tree(betas, deltas, first_level_coeffs);
}

Expand Down
Loading