From 9ba9854b0e93a6cf31a1075cc3f1631d5afb1b7b Mon Sep 17 00:00:00 2001
From: achirkin
Date: Wed, 9 Aug 2023 13:03:17 +0200
Subject: [PATCH] Add nvtx annotations to benchmarks

---
Review notes (not part of the commit message):

 * nvtx_case and nvtx_case::nvtx_lap release NVTX state in their destructors,
   so both are now non-copyable; lap() still returns its guard by value
   (C++17 prvalue).
 * "Lap %zd" was formatting an int64_t; the format and argument now agree
   (%lld with an explicit long long cast).
 * The loop-local `ntx_lap` is renamed to `nvtx_lap`, and the "measure the GPU
   time" comment in bench_search sits on the gpu_lap line again.
 * NOTE(review): the stored copy of this patch had lost its line breaks,
   the header names inside angle brackets and the template arguments. They
   are restored here from the symbols the code uses (the array length 32 is
   an assumption); confirm the context lines and whitespace against the tree
   before applying. The `index` blob ids are those of the original revision.
 * NOTE(review): the new struct also uses std::array, std::snprintf and
   int64_t; confirm util.hpp already includes <array>, <cstdio> and <cstdint>.

 cpp/bench/ann/CMakeLists.txt           |  2 +-
 cpp/bench/ann/src/common/benchmark.hpp | 53 ++++++++-----
 cpp/bench/ann/src/common/util.hpp      | 83 ++++++++++++++++++++++++++
 3 files changed, 115 insertions(+), 23 deletions(-)

diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index 4689bb2eda..5e31e71b06 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -222,7 +222,7 @@ target_include_directories(ANN_BENCH PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECT
 
 target_link_libraries(
   ANN_BENCH PRIVATE nlohmann_json::nlohmann_json benchmark_static dl -static-libgcc
-                    -static-libstdc++
+                    -static-libstdc++ CUDA::nvtx3
 )
 set_target_properties(
   ANN_BENCH
diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 29d852f280..97f23838a9 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -129,12 +129,16 @@ void bench_build(::benchmark::State& state,
   std::size_t index_size = dataset->base_set_size();
 
   cuda_timer gpu_timer;
-  for (auto _ : state) {
-    auto gpu_lap = gpu_timer.lap();
-    try {
-      algo->build(base_set, index_size, gpu_timer.stream());
-    } catch (const std::exception& e) {
-      state.SkipWithError(std::string(e.what()));
+  {
+    nvtx_case nvtx{state.name()};
+    for (auto _ : state) {
+      auto nvtx_lap = nvtx.lap();
+      auto gpu_lap  = gpu_timer.lap();
+      try {
+        algo->build(base_set, index_size, gpu_timer.stream());
+      } catch (const std::exception& e) {
+        state.SkipWithError(std::string(e.what()));
+      }
     }
   }
   state.counters.insert(
@@ -206,23 +210,28 @@ void bench_search(::benchmark::State& state,
   std::ptrdiff_t batch_offset   = 0;
   std::size_t queries_processed = 0;
   cuda_timer gpu_timer;
-  for (auto _ : state) {
-    // measure the GPU time using the RAII helper
-    auto gpu_lap = gpu_timer.lap();
-    // run the search
-    try {
-      algo->search(query_set + batch_offset * dataset->dim(),
-                   n_queries,
-                   k,
-                   neighbors.data + batch_offset * k,
-                   distances.data + batch_offset * k,
-                   gpu_timer.stream());
-    } catch (const std::exception& e) {
-      state.SkipWithError(std::string(e.what()));
+  {
+    nvtx_case nvtx{state.name()};
+    for (auto _ : state) {
+      // open the NVTX range for this iteration; it closes when nvtx_lap is destroyed
+      auto nvtx_lap = nvtx.lap();
+      // measure the GPU time using the RAII helper
+      auto gpu_lap = gpu_timer.lap();
+      // run the search
+      try {
+        algo->search(query_set + batch_offset * dataset->dim(),
+                     n_queries,
+                     k,
+                     neighbors.data + batch_offset * k,
+                     distances.data + batch_offset * k,
+                     gpu_timer.stream());
+      } catch (const std::exception& e) {
+        state.SkipWithError(std::string(e.what()));
+      }
+      // advance to the next batch
+      batch_offset = (batch_offset + n_queries) % query_set_size;
+      queries_processed += n_queries;
     }
-    // advance to the next batch
-    batch_offset = (batch_offset + n_queries) % query_set_size;
-    queries_processed += n_queries;
   }
   state.SetItemsProcessed(queries_processed);
   state.counters.insert({{"k", k}, {"n_queries", n_queries}});
diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp
index f713d0e9d0..88a9b4bb7a 100644
--- a/cpp/bench/ann/src/common/util.hpp
+++ b/cpp/bench/ann/src/common/util.hpp
@@ -18,6 +18,7 @@
 #include "ann_types.hpp"
 
 #include "cuda_stub.hpp"
+#include <nvtx3/nvToolsExt.h>
 
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -31,6 +32,7 @@
 #include <vector>
 
 #include <filesystem>
+#include <functional>
 
 namespace raft::bench::ann {
 
@@ -161,6 +163,87 @@ inline auto cuda_info()
   return props;
 }
 
+/**
+ * Scoped NVTX annotation for one benchmark case.
+ *
+ * The constructor creates the "ANN benchmark" NVTX domain and pushes a range named after the
+ * case; the destructor pops that range and destroys the domain. Each call to lap() opens a
+ * nested "Lap <i>" range that lasts until the returned guard goes out of scope.
+ */
+struct nvtx_case {
+ private:
+  std::string case_name_;
+  // Storage for the current lap label; iter_attrib_.message points into it.
+  std::array<char, 32> iter_name_{0};
+  nvtxDomainHandle_t domain_;
+  int64_t iteration_ = 0;
+  nvtxEventAttributes_t case_attrib_{0};
+  nvtxEventAttributes_t iter_attrib_{0};
+
+ public:
+  /** Guard for one iteration: pushes a range on construction and pops it on destruction. */
+  struct nvtx_lap {
+   private:
+    nvtxDomainHandle_t domain_;
+
+   public:
+    nvtx_lap(nvtxDomainHandle_t domain, nvtxEventAttributes_t* attr) : domain_(domain)
+    {
+      nvtxDomainRangePushEx(domain_, attr);
+    }
+    nvtx_lap() = delete;
+    // A copy would pop the same range a second time.
+    nvtx_lap(const nvtx_lap&)                    = delete;
+    auto operator=(const nvtx_lap&) -> nvtx_lap& = delete;
+    ~nvtx_lap() noexcept { nvtxDomainRangePop(domain_); }
+  };
+
+  explicit nvtx_case(std::string case_name)
+    : case_name_(std::move(case_name)), domain_(nvtxDomainCreateA("ANN benchmark"))
+  {
+    case_attrib_.version       = NVTX_VERSION;
+    iter_attrib_.version       = NVTX_VERSION;
+    case_attrib_.size          = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+    iter_attrib_.size          = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+    case_attrib_.colorType     = NVTX_COLOR_ARGB;
+    iter_attrib_.colorType     = NVTX_COLOR_ARGB;
+    case_attrib_.messageType   = NVTX_MESSAGE_TYPE_ASCII;
+    iter_attrib_.messageType   = NVTX_MESSAGE_TYPE_ASCII;
+    case_attrib_.message.ascii = case_name_.c_str();
+    // Colour comes from the name hash; OR-ing 0xA0A0A0 keeps every RGB channel >= 0xA0.
+    auto c             = std::hash<std::string>{}(case_name_);
+    case_attrib_.color = static_cast<uint32_t>(c | 0xA0A0A0);
+    nvtxDomainRangePushEx(domain_, &case_attrib_);
+  }
+
+  // The attributes point at data owned by this object and the destructor destroys the domain,
+  // so a copy would alias those pointers and release the domain twice.
+  nvtx_case(const nvtx_case&)                    = delete;
+  auto operator=(const nvtx_case&) -> nvtx_case& = delete;
+
+  ~nvtx_case()
+  {
+    nvtxDomainRangePop(domain_);
+    nvtxDomainDestroy(domain_);
+  }
+
+  /** Opens the range for the next iteration; the label is "Lap <i>" with i counted from 0. */
+  [[nodiscard]] auto lap() -> nvtx_case::nvtx_lap
+  {
+    auto i = iteration_++;
+    // The colour cycles through five shades, one per value of i % 5.
+    auto c     = static_cast<uint32_t>(i % 5);
+    uint32_t r = 150 + c * 20;
+    uint32_t g = 200 + c * 10;
+    uint32_t b = 220 + c * 5;
+    // int64_t need not match %zd; go through long long so the format and argument agree.
+    std::snprintf(iter_name_.data(), iter_name_.size(), "Lap %lld", static_cast<long long>(i));
+    iter_attrib_.message.ascii = iter_name_.data();
+    iter_attrib_.color         = (r << 16) + (g << 8) + b;
+    return nvtx_lap{domain_, &iter_attrib_};
+  }
+};
+
 inline std::vector<std::string> split(const std::string& s, char delimiter)
 {
   std::vector<std::string> tokens;