From 51c45b0a880f02410046019a1a588dece8ec731a Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Wed, 14 Dec 2022 22:10:49 +0100 Subject: [PATCH] Speedup `make_blobs` by up to 2x by fixing inefficient kernel launch configuration (#1100) The kernel generates two elements per iteration and attempts to write the second element with an offset equal to the grid stride. However, the grid stride is currently computed to be greater than the length of the generated array, so this second value is never used. By using a grid stride of half the array size, we speed up the kernel by nearly 2x in some cases (see perf charts in the PR comments). _Note: this will effectively modify many test inputs, so be aware of that when comparing results prior to and following the change._ Authors: - Louis Sugy (https://github.com/Nyrio) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1100 --- cpp/bench/random/make_blobs.cu | 10 ++++++++++ cpp/include/raft/random/detail/make_blobs.cuh | 6 ++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/cpp/bench/random/make_blobs.cu b/cpp/bench/random/make_blobs.cu index fdd4ef61d2..950d80c499 100644 --- a/cpp/bench/random/make_blobs.cu +++ b/cpp/bench/random/make_blobs.cu @@ -25,6 +25,12 @@ struct make_blobs_inputs { bool row_major; }; // struct make_blobs_inputs +inline auto operator<<(std::ostream& os, const make_blobs_inputs& p) -> std::ostream& +{ + os << p.rows << "#" << p.cols << "#" << p.clusters << "#" << p.row_major; + return os; +} + template struct make_blobs : public fixture { make_blobs(const make_blobs_inputs& p) @@ -34,6 +40,10 @@ struct make_blobs : public fixture { void run_benchmark(::benchmark::State& state) override { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + loop_on_state(state, [this]() { raft::random::make_blobs(data.data(), labels.data(), diff --git a/cpp/include/raft/random/detail/make_blobs.cuh b/cpp/include/raft/random/detail/make_blobs.cuh index 212245a9bf..68c2d56599 100644 --- a/cpp/include/raft/random/detail/make_blobs.cuh +++ b/cpp/include/raft/random/detail/make_blobs.cuh @@ -156,8 +156,10 @@ void generate_data(DataT* out, const DataT cluster_std_scalar, raft::random::RngState& rng_state) { - IdxT items = n_rows * n_cols; - IdxT nBlocks = (items + 127) / 128; + constexpr IdxT block_size = 128; + IdxT items = n_rows * n_cols; + // Choose a grid size so that each thread can write two output values. + IdxT nBlocks = ceildiv(items, 2 * block_size); // parentheses needed here for kernel, otherwise macro interprets the arguments // of triple chevron notation as macro arguments RAFT_CALL_RNG_FUNC(rng_state,