From 93a20b82011340a8ab6a5bdd1c0083d1c1d1414d Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Thu, 27 Oct 2022 19:06:11 +0200 Subject: [PATCH] Add fusedL2NN benchmark (#936) Adds a benchmark for `fusedL2NN`. The benchmark used the wrapper `fusedL2NNMinReduce` compiled in the distance library, for faster compilation times. Authors: - Louis Sugy (https://github.com/Nyrio) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/936 --- cpp/bench/CMakeLists.txt | 2 +- cpp/bench/distance/fused_l2_nn.cu | 153 ++++++++++++++++++ cpp/bench/neighbors/fused_l2_nn.cu | 123 -------------- cpp/include/raft/linalg/contractions.cuh | 2 - cpp/include/raft/random/detail/rng_device.cuh | 4 +- 5 files changed, 156 insertions(+), 128 deletions(-) create mode 100644 cpp/bench/distance/fused_l2_nn.cu delete mode 100644 cpp/bench/neighbors/fused_l2_nn.cu diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index e0f42d1803..ef91fe4e6c 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -85,6 +85,7 @@ if(BUILD_BENCH) bench/distance/distance_exp_l2.cu bench/distance/distance_l1.cu bench/distance/distance_unexp_l2.cu + bench/distance/fused_l2_nn.cu bench/distance/kernels.cu bench/main.cpp OPTIONAL DIST @@ -116,7 +117,6 @@ if(BUILD_BENCH) ConfigureBench(NAME NEIGHBORS_BENCH PATH - bench/neighbors/fused_l2_nn.cu bench/neighbors/knn/brute_force_float_int64_t.cu bench/neighbors/knn/brute_force_float_uint32_t.cu bench/neighbors/knn/ivf_flat_float_int64_t.cu diff --git a/cpp/bench/distance/fused_l2_nn.cu b/cpp/bench/distance/fused_l2_nn.cu new file mode 100644 index 0000000000..48473b2846 --- /dev/null +++ b/cpp/bench/distance/fused_l2_nn.cu @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#if defined RAFT_DISTANCE_COMPILED +#include +#endif +#include + +namespace raft::bench::distance { + +struct fusedl2nn_inputs { + int64_t m, n, k; +}; // struct fusedl2nn_inputs + +inline auto operator<<(std::ostream& os, const fusedl2nn_inputs& p) -> std::ostream& +{ + os << p.m << "#" << p.n << "#" << p.k; + return os; +} + +template +struct fusedl2nn : public fixture { + fusedl2nn(const fusedl2nn_inputs& p) : params(p) {} + + void allocate_data(const ::benchmark::State& state) override + { + x = raft::make_device_matrix(handle, params.m, params.k); + y = raft::make_device_matrix(handle, params.n, params.k); + x_norm = raft::make_device_vector(handle, params.m); + y_norm = raft::make_device_vector(handle, params.n); + out = raft::make_device_vector(handle, params.m); + + raft::random::RngState rng{1234}; + raft::random::uniform( + handle, rng, x.data_handle(), params.m * params.k, (DataT)-1.0, (DataT)1.0); + raft::random::uniform( + handle, rng, y.data_handle(), params.n * params.k, (DataT)-1.0, (DataT)1.0); + + // Pre-compute norms + raft::linalg::rowNorm(x_norm.data_handle(), + x.data_handle(), + params.k, + params.m, + raft::linalg::L2Norm, + true, + stream); + raft::linalg::rowNorm(y_norm.data_handle(), + y.data_handle(), + params.k, + params.n, + raft::linalg::L2Norm, + true, + stream); + handle.sync_stream(stream); + } + + void allocate_temp_buffers(const ::benchmark::State& state) override + { + workspace = raft::make_device_vector(handle, params.m * sizeof(IdxT)); + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + loop_on_state(state, [this]() { + raft::distance::fusedL2NNMinReduce(out.data_handle(), + x.data_handle(), + y.data_handle(), + x_norm.data_handle(), + y_norm.data_handle(), + static_cast(params.m), + static_cast(params.n), + static_cast(params.k), + (void*)workspace.data_handle(), + false, + true, + stream); + }); + + int64_t num_flops = 2 * params.m * params.n * params.k; + + int64_t read_elts = params.n * params.k + params.m * params.k; + int64_t write_elts = params.m; + + state.counters["FLOP/s"] = benchmark::Counter( + num_flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1000); + + state.counters["BW Wr"] = benchmark::Counter(write_elts * sizeof(OutT), + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + state.counters["BW Rd"] = benchmark::Counter(read_elts * sizeof(DataT), + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + } + + private: + fusedl2nn_inputs params; + raft::device_matrix x, y; + raft::device_vector x_norm, y_norm; + raft::device_vector out; + raft::device_vector workspace; +}; // struct fusedl2nn + +template +std::vector getFusedL2NNInputs() +{ + std::vector inputs; + std::vector m_list = {100000, 1000000}; + if constexpr (sizeof(IdxT) == 8) { m_list.push_back(10000000); } + std::vector n_list = {100, 1000, 10000}; + std::vector k_list = {64, 128, 256}; + for (auto m : m_list) { + for (auto n : n_list) { + for (auto k : k_list) { + inputs.push_back({m, n, k}); + } + } + } + return inputs; +} + +#define FUSEDL2NN_BENCH(DataT, IdxT, OutT) \ + RAFT_BENCH_REGISTER((fusedl2nn), "", getFusedL2NNInputs()) + +FUSEDL2NN_BENCH(float, int, float); +FUSEDL2NN_BENCH(double, int, double); +FUSEDL2NN_BENCH(float, int, (raft::KeyValuePair)); +FUSEDL2NN_BENCH(double, int, (raft::KeyValuePair)); +FUSEDL2NN_BENCH(float, int64_t, float); +FUSEDL2NN_BENCH(double, int64_t, double); +FUSEDL2NN_BENCH(float, int64_t, (raft::KeyValuePair)); +FUSEDL2NN_BENCH(double, int64_t, (raft::KeyValuePair)); + +} // namespace raft::bench::distance diff --git a/cpp/bench/neighbors/fused_l2_nn.cu b/cpp/bench/neighbors/fused_l2_nn.cu deleted file mode 100644 index 402d5c637a..0000000000 --- a/cpp/bench/neighbors/fused_l2_nn.cu +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include - -#if defined RAFT_DISTANCE_COMPILED -#include -#endif - -namespace raft::bench::spatial { - -struct fused_l2_nn_inputs { - int m, n, k; -}; // struct fused_l2_nn_inputs - -template -struct fused_l2_nn : public fixture { - fused_l2_nn(const fused_l2_nn_inputs& p) - : params(p), - out(p.m, stream), - x(p.m * p.k, stream), - y(p.n * p.k, stream), - xn(p.m, stream), - yn(p.n, stream), - workspace(p.m, stream) - { - raft::handle_t handle{stream}; - raft::random::RngState r(123456ULL); - - uniform(handle, r, x.data(), p.m * p.k, T(-1.0), T(1.0)); - uniform(handle, r, y.data(), p.n * p.k, T(-1.0), T(1.0)); - raft::linalg::rowNorm(xn.data(), x.data(), p.k, p.m, raft::linalg::L2Norm, true, stream); - raft::linalg::rowNorm(yn.data(), y.data(), p.k, p.n, raft::linalg::L2Norm, true, stream); - raft::distance::initialize, int>( - handle, out.data(), p.m, std::numeric_limits::max(), op); - } - - void run_benchmark(::benchmark::State& state) override - { - loop_on_state(state, [this]() { - // it is enough to only benchmark the L2-squared metric - raft::distance::fusedL2NN, int>(out.data(), - x.data(), - y.data(), - xn.data(), - yn.data(), - params.m, - params.n, - params.k, - (void*)workspace.data(), - op, - pairRedOp, - false, - false, - stream); - }); - - // Num distance calculations - int64_t num_dist_calcs = (int64_t)params.n * (int64_t)params.m; - - int64_t num_flops = 3 * num_dist_calcs * params.k; - - int64_t read_elts = (int64_t)params.n * params.k + (int64_t)params.m * params.k; - int64_t write_elts = (int64_t)params.n; - - state.counters["D/s"] = benchmark::Counter(num_dist_calcs, - benchmark::Counter::kIsIterationInvariantRate, - benchmark::Counter::OneK::kIs1000); - - state.counters["FLOP/s"] = benchmark::Counter( - num_flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1000); - - state.counters["BW Wr"] = benchmark::Counter(write_elts * sizeof(raft::KeyValuePair), - benchmark::Counter::kIsIterationInvariantRate, - benchmark::Counter::OneK::kIs1000); - state.counters["BW Rd"] = benchmark::Counter(read_elts * sizeof(float), - benchmark::Counter::kIsIterationInvariantRate, - benchmark::Counter::OneK::kIs1000); - - state.counters["K"] = benchmark::Counter(params.k); - } - - private: - fused_l2_nn_inputs params; - rmm::device_uvector x, y, xn, yn; - rmm::device_uvector> out; - rmm::device_uvector workspace; - raft::distance::KVPMinReduce pairRedOp; - raft::distance::MinAndDistanceReduceOp op; -}; // struct FusedL2NN - -const std::vector fused_l2_nn_input_vecs = { - {32, 16384, 16384}, {64, 16384, 16384}, {128, 16384, 16384}, {256, 16384, 16384}, - {512, 16384, 16384}, {1024, 16384, 16384}, {16384, 32, 16384}, {16384, 64, 16384}, - {16384, 128, 16384}, {16384, 256, 16384}, {16384, 512, 16384}, {16384, 1024, 16384}, - {16384, 16384, 2}, {16384, 16384, 4}, {16384, 16384, 8}, {16384, 16384, 16}, - {16384, 16384, 32}, {16384, 16384, 64}, {16384, 16384, 128}, {16384, 16384, 256}, - {16384, 16384, 512}, {16384, 16384, 1024}, {16384, 16384, 16384}, -}; - -RAFT_BENCH_REGISTER(fused_l2_nn, "", fused_l2_nn_input_vecs); -RAFT_BENCH_REGISTER(fused_l2_nn, "", fused_l2_nn_input_vecs); - -} // namespace raft::bench::spatial diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh index 8aed0cb4be..4321e13d95 100644 --- a/cpp/include/raft/linalg/contractions.cuh +++ b/cpp/include/raft/linalg/contractions.cuh @@ -200,14 +200,12 @@ struct Policy2x8 { template struct Policy2x8 { typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; }; template struct Policy2x8 { // this is not used just for keeping compiler happy. typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; }; /** @} */ diff --git a/cpp/include/raft/random/detail/rng_device.cuh b/cpp/include/raft/random/detail/rng_device.cuh index 8f0bf9fe53..ef13138beb 100644 --- a/cpp/include/raft/random/detail/rng_device.cuh +++ b/cpp/include/raft/random/detail/rng_device.cuh @@ -667,7 +667,7 @@ __global__ void rngKernel(DeviceState rng_state, LenType len, ParamType params) { - LenType tid = threadIdx.x + blockIdx.x * blockDim.x; + LenType tid = threadIdx.x + static_cast(blockIdx.x) * blockDim.x; GenType gen(rng_state, (uint64_t)tid); const LenType stride = gridDim.x * blockDim.x; for (LenType idx = tid; idx < len; idx += stride * ITEMS_PER_CALL) { @@ -692,7 +692,7 @@ template (blockIdx.x) * blockDim.x; GenType gen(seed, adv_subs + (uint64_t)tid, offset); const LenType stride = gridDim.x * blockDim.x; for (LenType idx = tid; idx < len; idx += stride * ITEMS_PER_CALL) {