Skip to content

Commit

Permalink
Support 32bit and unsigned indices in bruteforce KNN (#730)
Browse files Browse the repository at this point in the history
Make KNN compilable with `IdxT == uint32_t` and add some benchmarks.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Tamas Bela Feher (https://github.com/tfeher)

URL: #730
  • Loading branch information
achirkin authored Jul 5, 2022
1 parent fba595d commit 8aae19f
Show file tree
Hide file tree
Showing 8 changed files with 432 additions and 19 deletions.
1 change: 1 addition & 0 deletions cpp/bench/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ add_executable(${RAFT_CPP_BENCH_TARGET}
bench/random/permute.cu
bench/random/rng.cu
bench/spatial/fused_l2_nn.cu
bench/spatial/knn.cu
bench/spatial/selection.cu
bench/main.cpp
)
Expand Down
20 changes: 13 additions & 7 deletions cpp/bench/common/benchmark.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ struct using_pool_memory_res {
private:
rmm::mr::device_memory_resource* orig_res_;
rmm::mr::cuda_memory_resource cuda_res_;
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_res_;
rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_res_;

public:
using_pool_memory_res(size_t initial_size, size_t max_size)
Expand Down Expand Up @@ -115,13 +115,20 @@ class fixture {
int device_id = 0;
RAFT_CUDA_TRY(cudaGetDevice(&device_id));
RAFT_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_size, cudaDevAttrL2CacheSize, device_id));
scratch_buf_ = rmm::device_buffer(l2_cache_size, stream);
scratch_buf_ = rmm::device_buffer(l2_cache_size * 3, stream);
}

// every benchmark should be overriding this
virtual void run_benchmark(::benchmark::State& state) = 0;
virtual void generate_metrics(::benchmark::State& state) {}

protected:
/** The helper that writes zeroes to some buffer in GPU memory to flush the L2 cache. */
void flush_L2_cache()
{
RAFT_CUDA_TRY(cudaMemsetAsync(scratch_buf_.data(), 0, scratch_buf_.size(), stream));
}

/**
* The helper to be used inside `run_benchmark`, to loop over the state and record time using the
* cuda_event_timer.
Expand All @@ -130,9 +137,7 @@ class fixture {
void loop_on_state(::benchmark::State& state, Lambda benchmark_func, bool flush_L2 = true)
{
for (auto _ : state) {
if (flush_L2) {
RAFT_CUDA_TRY(cudaMemsetAsync(scratch_buf_.data(), 0, scratch_buf_.size(), stream));
}
if (flush_L2) { flush_L2_cache(); }
cuda_event_timer timer(state, stream);
benchmark_func();
}
Expand All @@ -147,9 +152,9 @@ class Fixture : public ::benchmark::Fixture {

public:
explicit Fixture(const std::string name, const Params&... params)
: ::benchmark::Fixture(), params_(params...)
: ::benchmark::Fixture(), params_(params...), name_(name)
{
SetName(name.c_str());
SetName(name_.c_str());
}
Fixture() = delete;

Expand All @@ -165,6 +170,7 @@ class Fixture : public ::benchmark::Fixture {
private:
std::unique_ptr<Class> fixture_;
std::tuple<Params...> params_;
const std::string name_;

protected:
void BenchmarkCase(State& state) override
Expand Down
Loading

0 comments on commit 8aae19f

Please sign in to comment.