Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: honk profiling by pass, tsan preset #2982

Merged
merged 28 commits into from
Oct 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 23 additions & 6 deletions barretenberg/cpp/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,18 @@
"SMT": "ON"
}
},
{
"name": "tsan",
"displayName": "Debugging build with thread sanitizer on Clang-16",
"description": "Build with thread sanitizer on clang16 with debugging information",
"inherits": "clang16-dbg",
"binaryDir": "build-tsan",
"environment": {
"CFLAGS": "-fsanitize=thread",
"CXXFLAGS": "-fsanitize=thread",
"LDFLAGS": "-fsanitize=thread"
}
},
{
"name": "coverage",
"displayName": "Build with coverage",
Expand Down Expand Up @@ -190,9 +202,9 @@
"generator": "Unix Makefiles",
"inherits": "clang16",
"environment": {
"CFLAGS": "-fxray-instrument -fxray-instruction-threshold=100",
"CXXFLAGS": "-fxray-instrument -fxray-instruction-threshold=100",
"LDFLAGS": "-fxray-instrument -fxray-instruction-threshold=100"
"CFLAGS": "-fxray-instrument",
"CXXFLAGS": "-fxray-instrument -fxray-instruction-threshold=500 -DXRAY=1",
"LDFLAGS": "-fxray-instrument -fxray-instruction-threshold=500 -DXRAY=1"
},
"binaryDir": "build-xray"
},
Expand All @@ -202,9 +214,9 @@
"description": "Build with Clang and enable detailed LLVM XRay for profiling",
"inherits": "xray",
"environment": {
"CFLAGS": "-fxray-instrument -fxray-instruction-threshold=100 -finline-max-stacksize=150",
"CXXFLAGS": "-fxray-instrument -fxray-instruction-threshold=100 -finline-max-stacksize=150",
"LDFLAGS": "-fxray-instrument -fxray-instruction-threshold=100 -finline-max-stacksize=150"
"CFLAGS": "-fxray-instrument -fxray-instruction-threshold=100 -finline-max-stacksize=150 -DXRAY=1",
"CXXFLAGS": "-fxray-instrument -fxray-instruction-threshold=100 -finline-max-stacksize=150 -DXRAY=1",
"LDFLAGS": "-fxray-instrument -fxray-instruction-threshold=100 -finline-max-stacksize=150 -DXRAY=1"
},
"binaryDir": "build-xray-verbose"
},
Expand Down Expand Up @@ -276,6 +288,11 @@
"inherits": "clang16",
"configurePreset": "smt-verification"
},
{
"name": "tsan",
"inherits": "default",
"configurePreset": "tsan"
},
{
"name": "coverage",
"inherits": "default",
Expand Down
11 changes: 6 additions & 5 deletions barretenberg/cpp/scripts/collect_profile_information.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/bin/bash
set -eu

PRESET=${1:-xray-1thread} # can also be 'xray'
PRESET=${1:-xray} # can also be 'xray-1thread'
ONLY_PROCESS=${2:-}
EXECUTABLE=${3:-ultra_honk_rounds_bench}

# Move above script dir.
cd $(dirname $0)/..
Expand All @@ -15,10 +16,10 @@ cd build-$PRESET

if [ -z "$ONLY_PROCESS" ]; then
# Clear old profile data.
rm -f xray-log.honk_bench_main_simple.*
rm -f xray-log.$EXECUTABLE.*

# Run benchmark with profiling.
XRAY_OPTIONS="patch_premain=true xray_mode=xray-basic verbosity=1" ./bin/honk_bench_main_simple
XRAY_OPTIONS="patch_premain=true xray_mode=xray-basic verbosity=1" ./bin/$EXECUTABLE
fi

function shorten_cpp_names() {
Expand All @@ -37,8 +38,8 @@ function shorten_cpp_names() {
}

# Process benchmark file.
llvm-xray-16 stack xray-log.honk_bench_main_simple.* \
--instr_map=./bin/honk_bench_main_simple --stack-format=flame --aggregate-threads --aggregation-type=time --all-stacks \
llvm-xray-16 stack xray-log.$EXECUTABLE.* \
--instr_map=./bin/$EXECUTABLE --stack-format=flame --aggregate-threads --aggregation-type=time --all-stacks \
| node ../scripts/llvm_xray_stack_flame_corrector.js \
| shorten_cpp_names \
| ../scripts/flamegraph.pl --width 1200 --fontsize 10 \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
set(BENCHMARK_SOURCES
standard_plonk.bench.cpp
ultra_honk.bench.cpp
ultra_honk_rounds.bench.cpp
ultra_plonk.bench.cpp
)

Expand All @@ -19,17 +20,4 @@ foreach(BENCHMARK_SOURCE ${BENCHMARK_SOURCES})
add_executable(${BENCHMARK_NAME}_bench main.bench.cpp ${BENCHMARK_SOURCE} benchmark_utilities.hpp)
target_link_libraries(${BENCHMARK_NAME}_bench ${LINKED_LIBRARIES})
add_custom_target(run_${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
endforeach()

add_executable(
honk_bench_main_simple
main.simple.cpp
)

target_link_libraries(
honk_bench_main_simple
PRIVATE
stdlib_sha256
stdlib_keccak
stdlib_merkle_tree
)
endforeach()
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
#pragma once
#include <benchmark/benchmark.h>
#include <cstddef>

#include "barretenberg/honk/composer/ultra_composer.hpp"
#include "barretenberg/honk/proof_system/ultra_prover.hpp"
#include "barretenberg/plonk/composer/ultra_composer.hpp"
#include "barretenberg/proof_system/types/circuit_type.hpp"
#include "barretenberg/stdlib/encryption/ecdsa/ecdsa.hpp"
#include "barretenberg/stdlib/hash/keccak/keccak.hpp"
Expand Down Expand Up @@ -203,6 +206,26 @@ void construct_proof_with_specified_num_gates(State& state,
}
}

/**
 * @brief Build a circuit with the supplied test-circuit function and wrap it in an UltraHonk prover.
 *
 * @param composer Honk composer used to create both the instance and the prover.
 * @param test_circuit_function Callback that populates the circuit builder; invoked once with num_iterations.
 * @param num_iterations Forwarded unchanged to test_circuit_function.
 * @return The prover that composer.create_prover yields for the newly created instance.
 */
inline proof_system::honk::UltraProver get_prover(
    proof_system::honk::UltraComposer& composer,
    void (*test_circuit_function)(proof_system::honk::UltraComposer::CircuitBuilder&, size_t),
    size_t num_iterations)
{
    using HonkComposer = proof_system::honk::UltraComposer;

    HonkComposer::CircuitBuilder circuit;
    test_circuit_function(circuit, num_iterations);

    // The honk composer creates its prover from an instance rather than directly from the builder.
    std::shared_ptr<HonkComposer::Instance> prover_instance = composer.create_instance(circuit);
    return composer.create_prover(prover_instance);
}

/**
 * @brief Build a circuit with the supplied test-circuit function and wrap it in an UltraPlonk prover.
 *
 * The callback type is spelled with the plonk composer's CircuitBuilder, which is the type of the builder this
 * overload actually constructs and passes to the callback.
 * NOTE(review): this overload previously spelled the callback with honk::UltraComposer::CircuitBuilder; that can
 * only have compiled if both aliases name the same builder type, in which case this is a pure spelling change -
 * confirm.
 *
 * @param composer Plonk composer used to create the prover.
 * @param test_circuit_function Callback that populates the circuit builder; invoked once with num_iterations.
 * @param num_iterations Forwarded unchanged to test_circuit_function.
 * @return The prover that composer.create_prover yields for the populated builder.
 */
inline proof_system::plonk::UltraProver get_prover(
    proof_system::plonk::UltraComposer& composer,
    void (*test_circuit_function)(proof_system::plonk::UltraComposer::CircuitBuilder&, size_t),
    size_t num_iterations)
{
    proof_system::plonk::UltraComposer::CircuitBuilder builder;
    test_circuit_function(builder, num_iterations);

    // Unlike the honk overload, the plonk composer creates its prover straight from the builder.
    return composer.create_prover(builder);
}
/**
* @brief Performs proof construction for benchmarks based on a provided circuit function
*
Expand All @@ -219,29 +242,18 @@ void construct_proof_with_specified_num_iterations(State& state,
size_t)) noexcept
{
barretenberg::srs::init_crs_factory("../srs_db/ignition");

Composer composer;

auto num_iterations = static_cast<size_t>(state.range(0));
for (auto _ : state) {
// Construct circuit and prover; don't include this part in measurement
state.PauseTiming();
auto builder = typename Composer::CircuitBuilder();
test_circuit_function(builder, num_iterations);

auto composer = Composer();
if constexpr (proof_system::IsAnyOf<Composer, proof_system::honk::UltraComposer>) {
auto instance = composer.create_instance(builder);
auto ext_prover = composer.create_prover(instance);
state.ResumeTiming();

// Construct proof
auto proof = ext_prover.construct_proof();

} else {
auto ext_prover = composer.create_prover(builder);
state.ResumeTiming();
auto prover = get_prover(composer, test_circuit_function, num_iterations);
state.ResumeTiming();

// Construct proof
auto proof = ext_prover.construct_proof();
}
// Construct proof
auto proof = prover.construct_proof();
}
}

Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#include <benchmark/benchmark.h>

#include "barretenberg/benchmark/honk_bench/benchmark_utilities.hpp"
#include "barretenberg/honk/composer/ultra_composer.hpp"
#include "barretenberg/honk/proof_system/ultra_prover.hpp"
#include "barretenberg/proof_system/circuit_builder/ultra_circuit_builder.hpp"

using namespace benchmark;
using namespace proof_system;

// The rounds to measure.
// Each enumerator identifies one prover round. The values are passed as the size_t `index` argument of
// test_round / test_round_inner and compared there against the round being executed, so this must remain an
// unscoped enum (implicit conversion to size_t). The enumerator names are also pasted into the benchmark function
// names by ROUND_BENCHMARK.
enum { PREAMBLE, WIRE_COMMITMENTS, SORTED_LIST_ACCUMULATOR, GRAND_PRODUCT_COMPUTATION, RELATION_CHECK, ZEROMORPH };

/**
 * @details Benchmark ultrahonk by executing every prover round on each iteration while keeping the timer running
 * for exactly one of them, so only the selected round contributes to the measurement.
 * Note: every iteration still pays for all rounds, so very short rounds take a long time to reach statistical
 * significance; it is recommended to set their iterations to 1.
 * @param state - The google benchmark state.
 * @param prover - The ultrahonk prover whose rounds are executed.
 * @param index - The round to measure (one of the round enumerators).
 **/
BBERG_PROFILE static void test_round_inner(State& state, honk::UltraProver& prover, size_t index) noexcept
{
    // Runs one round; timing is resumed around it only when it is the round selected by `index`.
    auto run_round = [&](size_t round, auto&& body) -> void {
        const bool is_measured = (round == index);
        if (is_measured) {
            state.ResumeTiming();
        }
        body();
        if (is_measured) {
            state.PauseTiming();
        }
    };

    for (auto _ : state) {
        // Keep the timer paused for the whole iteration except the selected round.
        state.PauseTiming();
        run_round(PREAMBLE, [&] { prover.execute_preamble_round(); });
        run_round(WIRE_COMMITMENTS, [&] { prover.execute_wire_commitments_round(); });
        run_round(SORTED_LIST_ACCUMULATOR, [&] { prover.execute_sorted_list_accumulator_round(); });
        run_round(GRAND_PRODUCT_COMPUTATION, [&] { prover.execute_grand_product_computation_round(); });
        run_round(RELATION_CHECK, [&] { prover.execute_relation_check_rounds(); });
        run_round(ZEROMORPH, [&] { prover.execute_zeromorph_rounds(); });
        // Re-enable timing before control returns to the benchmark loop.
        state.ResumeTiming();
    }
}
/**
 * @brief Create an ultrahonk prover for a keccak test circuit and benchmark a single round of it.
 * @param state - The google benchmark state.
 * @param index - The round to measure; forwarded to test_round_inner.
 **/
BBERG_PROFILE static void test_round(State& state, size_t index) noexcept
{
    barretenberg::srs::init_crs_factory("../srs_db/ignition");

    // TODO(AD) benchmark both sparse and dense circuits?
    honk::UltraComposer honk_composer;
    auto keccak_prover =
        bench_utils::get_prover(honk_composer, &bench_utils::generate_keccak_test_circuit<UltraCircuitBuilder>, 1);

    test_round_inner(state, keccak_prover, index);
}
// Defines a benchmark function named ROUND_<round> that calls test_round with the given round enumerator, and
// registers it with BENCHMARK, reporting in milliseconds.
// The expansion ends without a semicolon, so call sites can chain further options onto the registration
// (e.g. ROUND_BENCHMARK(PREAMBLE)->Iterations(1);).
#define ROUND_BENCHMARK(round) \
static void ROUND_##round(State& state) noexcept \
{ \
test_round(state, round); \
} \
BENCHMARK(ROUND_##round)->Unit(::benchmark::kMillisecond)

// One benchmark is registered per prover round.
// Fast rounds take a long time to benchmark because of how we compute statistical significance.
// Limit them to one iteration so we don't spend a lot of time redoing full proofs just to measure this part.
// RELATION_CHECK and ZEROMORPH set no explicit iteration count.
ROUND_BENCHMARK(PREAMBLE)->Iterations(1);
ROUND_BENCHMARK(WIRE_COMMITMENTS)->Iterations(1);
ROUND_BENCHMARK(SORTED_LIST_ACCUMULATOR)->Iterations(1);
ROUND_BENCHMARK(GRAND_PRODUCT_COMPUTATION)->Iterations(1);
ROUND_BENCHMARK(RELATION_CHECK);
ROUND_BENCHMARK(ZEROMORPH);
14 changes: 6 additions & 8 deletions barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,11 @@
#define BBERG_INLINE __attribute__((always_inline)) inline
#endif

// TODO(AD): Other compilers
#if defined(__clang__)
#define BBERG_INSTRUMENT [[clang::xray_always_instrument]]
#define BBERG_NO_INSTRUMENT [[clang::xray_never_instrument]]
#define BBERG_NOINLINE [[clang::noinline]]
// TODO(AD): Other instrumentation?
#ifdef XRAY
#define BBERG_PROFILE [[clang::xray_always_instrument]] [[clang::noinline]]
#define BBERG_NO_PROFILE [[clang::xray_never_instrument]]
#else
#define BBERG_INSTRUMENT
#define BBERG_NO_INSTRUMENT
#define BBERG_NOINLINE
#define BBERG_PROFILE
#define BBERG_NO_PROFILE
#endif
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class ThreadPool {
std::condition_variable complete_condition_;
bool stop = false;

BBERG_NO_INSTRUMENT void worker_loop(size_t thread_index);
BBERG_NO_PROFILE void worker_loop(size_t thread_index);

void do_iterations()
{
Expand Down
2 changes: 1 addition & 1 deletion barretenberg/cpp/src/barretenberg/common/thread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,4 @@ void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func
// parallel_for_queued(num_iterations, func);
#endif
#endif
}
}
3 changes: 2 additions & 1 deletion barretenberg/cpp/src/barretenberg/common/thread.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <barretenberg/env/hardware_concurrency.hpp>
#include <barretenberg/numeric/bitop/get_msb.hpp>
#include <functional>
#include <iostream>
#include <thread>
#include <vector>

Expand All @@ -21,4 +22,4 @@ inline size_t get_num_cpus_pow2()
return static_cast<size_t>(1ULL << numeric::get_msb(get_num_cpus()));
}

void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func);
void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func);
Loading