From 3b82be0f266c838c823bbe26cfea99337d7180a9 Mon Sep 17 00:00:00 2001 From: Innokentii Sennovskii Date: Fri, 19 Jan 2024 19:07:56 +0000 Subject: [PATCH] chore: Remove mutex dependency (#4160) Removes mutex in IPA opening. Adds a function which can split loop into parallel threads and give them indices for working with memory structures --- .../commitment_schemes/ipa/ipa.hpp | 24 ++++-- .../cpp/src/barretenberg/common/thread.cpp | 44 ++++++++--- .../cpp/src/barretenberg/common/thread.hpp | 73 ++++++++++++++++--- 3 files changed, 112 insertions(+), 29 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index 9a254f34740..3a874566602 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -43,7 +43,6 @@ template class IPA { transcript->send_to_verifier("IPA:poly_degree", static_cast(poly_degree)); const Fr generator_challenge = transcript->get_challenge("IPA:generator_challenge"); auto aux_generator = Commitment::one() * generator_challenge; - // Checks poly_degree is greater than zero and a power of two // In the future, we might want to consider if non-powers of two are needed ASSERT((poly_degree > 0) && (!(poly_degree & (poly_degree - 1))) && @@ -90,30 +89,39 @@ template class IPA { std::vector R_elements(log_poly_degree); std::size_t round_size = poly_degree; + // Allocate vectors for parallel storage of partial products + const size_t num_cpus = get_num_cpus(); + std::vector partial_inner_prod_L(num_cpus); + std::vector partial_inner_prod_R(num_cpus); // Perform IPA rounds for (size_t i = 0; i < log_poly_degree; i++) { round_size >>= 1; + // Set partial products to zero + memset(&partial_inner_prod_L[0], 0, sizeof(Fr) * num_cpus); + memset(&partial_inner_prod_R[0], 0, sizeof(Fr) * num_cpus); // Compute inner_prod_L := < a_vec_lo, b_vec_hi > and inner_prod_R := < 
a_vec_hi, b_vec_lo > - std::mutex addition_lock; Fr inner_prod_L = Fr::zero(); Fr inner_prod_R = Fr::zero(); // Run scalar product in parallel - run_loop_in_parallel_if_effective( + run_loop_in_parallel_if_effective_with_index( round_size, - [&a_vec, &b_vec, &inner_prod_L, &inner_prod_R, round_size, &addition_lock](size_t start, size_t end) { + [&a_vec, &b_vec, round_size, &partial_inner_prod_L, &partial_inner_prod_R]( + size_t start, size_t end, size_t workload_index) { Fr current_inner_prod_L = Fr::zero(); Fr current_inner_prod_R = Fr::zero(); for (size_t j = start; j < end; j++) { current_inner_prod_L += a_vec[j] * b_vec[round_size + j]; current_inner_prod_R += a_vec[round_size + j] * b_vec[j]; } - addition_lock.lock(); - inner_prod_L += current_inner_prod_L; - inner_prod_R += current_inner_prod_R; - addition_lock.unlock(); + partial_inner_prod_L[workload_index] = current_inner_prod_L; + partial_inner_prod_R[workload_index] = current_inner_prod_R; }, /*finite_field_additions_per_iteration=*/2, /*finite_field_multiplications_per_iteration=*/2); + for (size_t j = 0; j < num_cpus; j++) { + inner_prod_L += partial_inner_prod_L[j]; + inner_prod_R += partial_inner_prod_R[j]; + } // L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator L_elements[i] = bb::scalar_multiplication::pippenger_without_endomorphism_basis_points( diff --git a/barretenberg/cpp/src/barretenberg/common/thread.cpp b/barretenberg/cpp/src/barretenberg/common/thread.cpp index b90a44ae146..12334d272c6 100644 --- a/barretenberg/cpp/src/barretenberg/common/thread.cpp +++ b/barretenberg/cpp/src/barretenberg/common/thread.cpp @@ -136,6 +136,8 @@ void run_loop_in_parallel(size_t num_points, * @param num_points Total number of elements * @param func A function or lambda expression with a for loop inside, for example: * [](size_t start, size_t end){for (size_t i=start; i& func, - size_t finite_field_additions_per_iteration, - size_t finite_field_multiplications_per_iteration, - size_t 
finite_field_inversions_per_iteration, - size_t group_element_additions_per_iteration, - size_t group_element_doublings_per_iteration, - size_t scalar_multiplications_per_iteration, - size_t sequential_copy_ops_per_iteration) +template + requires(std::is_same_v> || + std::is_same_v>) +void run_loop_in_parallel_if_effective_internal(size_t num_points, + const FunctionType& func, + size_t finite_field_additions_per_iteration, + size_t finite_field_multiplications_per_iteration, + size_t finite_field_inversions_per_iteration, + size_t group_element_additions_per_iteration, + size_t group_element_doublings_per_iteration, + size_t scalar_multiplications_per_iteration, + size_t sequential_copy_ops_per_iteration) { // Rough cost of operations (the operation costs are derives in basics_bench and the units are nanoseconds): constexpr size_t FF_ADDITION_COST = 4; @@ -185,7 +190,12 @@ void run_loop_in_parallel_if_effective(size_t num_points, // If starting parallel for is longer than computing, just compute if (offset_cost < PARALLEL_FOR_COST) { - func(0, num_points); + if constexpr (std::is_same_v>) { + + func(0, num_points); + } else { + func(0, num_points, 0); + } return; } // Parallelize over chunks @@ -201,6 +211,16 @@ void run_loop_in_parallel_if_effective(size_t num_points, } size_t start = chunk_index * chunk_size; size_t end = chunk_index * chunk_size + current_chunk_size; - func(start, end); + + if constexpr (std::is_same_v>) { + + func(start, end); + } else { + func(start, end, chunk_index); + } }); -}; \ No newline at end of file +}; +template void run_loop_in_parallel_if_effective_internal( + size_t, const std::function&, size_t, size_t, size_t, size_t, size_t, size_t, size_t); +template void run_loop_in_parallel_if_effective_internal( + size_t, const std::function&, size_t, size_t, size_t, size_t, size_t, size_t, size_t); \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/common/thread.hpp 
b/barretenberg/cpp/src/barretenberg/common/thread.hpp index 787f0313d9e..8c67ca84d0a 100644 --- a/barretenberg/cpp/src/barretenberg/common/thread.hpp +++ b/barretenberg/cpp/src/barretenberg/common/thread.hpp @@ -26,12 +26,67 @@ void parallel_for(size_t num_iterations, const std::function& func void run_loop_in_parallel(size_t num_points, const std::function& func, size_t no_multhreading_if_less_or_equal = 0); -void run_loop_in_parallel_if_effective(size_t num_points, - const std::function& func, - size_t finite_field_additions_per_iteration = 0, - size_t finite_field_multiplications_per_iteration = 0, - size_t finite_field_inversions_per_iteration = 0, - size_t group_element_additions_per_iteration = 0, - size_t group_element_doublings_per_iteration = 0, - size_t scalar_multiplications_per_iteration = 0, - size_t sequential_copy_ops_per_iteration = 0); \ No newline at end of file + +template + requires(std::is_same_v> || + std::is_same_v>) +void run_loop_in_parallel_if_effective_internal( + size_t, const FunctionType&, size_t, size_t, size_t, size_t, size_t, size_t, size_t); +/** + * @brief Runs loop in parallel if parallelization is useful (costs less than the algorithm) + * + * @details Please see run_loop_in_parallel_if_effective_internal for detailed description + * + */ +inline void run_loop_in_parallel_if_effective(size_t num_points, + const std::function& func, + size_t finite_field_additions_per_iteration = 0, + size_t finite_field_multiplications_per_iteration = 0, + size_t finite_field_inversions_per_iteration = 0, + size_t group_element_additions_per_iteration = 0, + size_t group_element_doublings_per_iteration = 0, + size_t scalar_multiplications_per_iteration = 0, + size_t sequential_copy_ops_per_iteration = 0 + +) +{ + run_loop_in_parallel_if_effective_internal(num_points, + func, + finite_field_additions_per_iteration, + finite_field_multiplications_per_iteration, + finite_field_inversions_per_iteration, + group_element_additions_per_iteration, +
group_element_doublings_per_iteration, + scalar_multiplications_per_iteration, + sequential_copy_ops_per_iteration); +} + +/** + * @brief Runs loop in parallel if parallelization is useful (costs less than the algorithm). The loop function is given + * the index of the workload. + * + * @details Please see run_loop_in_parallel_if_effective_internal for detailed description + * + */ +inline void run_loop_in_parallel_if_effective_with_index(size_t num_points, + const std::function& func, + size_t finite_field_additions_per_iteration = 0, + size_t finite_field_multiplications_per_iteration = 0, + size_t finite_field_inversions_per_iteration = 0, + size_t group_element_additions_per_iteration = 0, + size_t group_element_doublings_per_iteration = 0, + size_t scalar_multiplications_per_iteration = 0, + size_t sequential_copy_ops_per_iteration = 0 + +) +{ + run_loop_in_parallel_if_effective_internal(num_points, + func, + finite_field_additions_per_iteration, + finite_field_multiplications_per_iteration, + finite_field_inversions_per_iteration, + group_element_additions_per_iteration, + group_element_doublings_per_iteration, + scalar_multiplications_per_iteration, + sequential_copy_ops_per_iteration); +} \ No newline at end of file