From 48754ca9d2c13d680917ebe39bfa7c4927aa67a7 Mon Sep 17 00:00:00 2001 From: hrong Date: Thu, 30 May 2024 14:56:37 -0700 Subject: [PATCH] [Opt] Expose the `detail::popc` as public API --- cpp/bench/prims/CMakeLists.txt | 8 +- cpp/bench/prims/core/popc.cu | 127 ++++++++++++++++++++ cpp/include/raft/core/bitset.cuh | 5 +- cpp/include/raft/core/detail/popc.cuh | 11 +- cpp/include/raft/core/popc.hpp | 41 +++++++ cpp/test/CMakeLists.txt | 1 + cpp/test/core/popc.cu | 159 ++++++++++++++++++++++++++ 7 files changed, 344 insertions(+), 8 deletions(-) create mode 100644 cpp/bench/prims/core/popc.cu create mode 100644 cpp/include/raft/core/popc.hpp create mode 100644 cpp/test/core/popc.cu diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index 0771a60e58..c72649e350 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -75,7 +75,13 @@ endfunction() if(BUILD_PRIMS_BENCH) ConfigureBench( - NAME CORE_BENCH PATH core/bitset.cu core/copy.cu main.cpp + NAME + CORE_BENCH + PATH + core/bitset.cu + core/copy.cu + core/popc.cu + main.cpp ) ConfigureBench( diff --git a/cpp/bench/prims/core/popc.cu b/cpp/bench/prims/core/popc.cu new file mode 100644 index 0000000000..dfa4335140 --- /dev/null +++ b/cpp/bench/prims/core/popc.cu @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include + +#include + +namespace raft::bench::core { + +template +struct PopcInputs { /* One benchmark case: an n_rows x n_cols bit grid. create_bitmap() sets n_rows * n_cols * sparsity bits, so `sparsity` is the fraction of bits set to 1. */ + index_t n_rows; + index_t n_cols; + float sparsity; +}; + +template +inline auto operator<<(std::ostream& os, const PopcInputs& params) -> std::ostream& /* Streams a case as n_rows#n_cols#sparsity; run_benchmark() uses this text as the benchmark label. */ +{ + os << params.n_rows << "#" << params.n_cols << "#" << params.sparsity; + return os; +} + +template +struct popc_bench : public fixture { /* Benchmark fixture that times raft::popc over a randomly filled device bit vector. */ + popc_bench(const PopcInputs& p) + : params(p), + n_element(raft::ceildiv(params.n_rows * params.n_cols, index_t(sizeof(bits_t) * 8))), + bits_d{raft::make_device_vector(res, n_element)}, + nnz_actual_d{raft::make_device_scalar(res, 0)} + { + } + + index_t create_bitmap(index_t m, index_t n, float sparsity, std::vector& bitmap) /* Zeroes `bitmap`, then sets m * n * sparsity distinct, randomly chosen bits (a draw that hits an already-set bit is retried) and returns that count. Seeded from std::random_device, so the pattern differs between runs. */ + { + index_t total = static_cast(m * n); + index_t num_ones = static_cast((total * 1.0f) * sparsity); + index_t res = num_ones; /* this local `res` shadows the raft::resources member of the same name */ + + for (auto& item : bitmap) { + item = static_cast(0); + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(0, total - 1); /* NOTE(review): when m * n == 0 this becomes dis(0, -1); std::uniform_int_distribution requires a <= b - confirm callers never pass an empty grid. */ + + while (num_ones > 0) { + index_t index = dis(gen); + + bits_t& element = bitmap[index / (8 * sizeof(bits_t))]; + index_t bit_position = index % (8 * sizeof(bits_t)); + + if (((element >> bit_position) & 1) == 0) { + element |= (static_cast(1) << bit_position); + num_ones--; + } + } + return res; + } + void run_benchmark(::benchmark::State& state) override /* Builds the bitmap on the host, uploads it to bits_d, then times raft::popc inside loop_on_state. */ + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + std::vector bits_h(n_element); + auto stream = raft::resource::get_cuda_stream(res); + + create_bitmap(params.n_rows, params.n_cols, params.sparsity, bits_h); + update_device(bits_d.data_handle(), bits_h.data(), bits_h.size(), stream); + + resource::sync_stream(res); + + loop_on_state(state, [this]() { /* NOTE(review): popc is issued on this->handle while the buffers and the synchronized stream come from the member res - confirm the two are meant to differ. */ + auto bits_view = + raft::make_device_vector_view(bits_d.data_handle(), bits_d.size()); + + index_t max_len = params.n_rows * params.n_cols; + auto max_len_view = raft::make_host_scalar_view(&max_len); + auto
nnz_actual_view = + nnz_actual_d.view(); // raft::make_device_scalar_view(nnz_actual_d.data_handle()); + raft::popc(this->handle, bits_view, max_len_view, nnz_actual_view); + }); + } + + private: + raft::resources res; + PopcInputs params; + index_t n_element; + + raft::device_vector bits_d; + raft::device_scalar nnz_actual_d; +}; + +template +const std::vector> popc_input_vecs{ /* Cases as {n_rows, n_cols, sparsity}; note that many entries repeat the same triple. */ + {2, 131072, 0.4}, {8, 131072, 0.5}, {16, 131072, 0.2}, {2, 8192, 0.4}, {16, 8192, 0.5}, + {128, 8192, 0.2}, {1024, 8192, 0.1}, {1024, 8192, 0.1}, {1024, 8192, 0.1}, {1024, 8192, 0.1}, + + {1024, 8192, 0.1}, {1024, 8192, 0.1}, {1024, 8192, 0.1}, {1024, 8192, 0.1}, + + {1024, 8192, 0.4}, {1024, 8192, 0.5}, {1024, 8192, 0.2}, {1024, 8192, 0.4}, {1024, 8192, 0.5}, + {1024, 8192, 0.2}, {1024, 8192, 0.4}, {1024, 8192, 0.5}, {1024, 8192, 0.2}, {1024, 8192, 0.4}, + {1024, 8192, 0.5}, {1024, 8192, 0.2}, + + {1024, 8192, 0.5}, {1024, 8192, 0.2}, {1024, 8192, 0.4}, {1024, 8192, 0.5}, {1024, 8192, 0.2}, + {1024, 8192, 0.4}, {1024, 8192, 0.5}, {1024, 8192, 0.2}}; + +using PopcBenchI64 = popc_bench; + +RAFT_BENCH_REGISTER(PopcBenchI64, "", popc_input_vecs); + +} // namespace raft::bench::core diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh index d7eedee92e..3b67e56eea 100644 --- a/cpp/include/raft/core/bitset.cuh +++ b/cpp/include/raft/core/bitset.cuh @@ -17,9 +17,9 @@ #pragma once #include -#include #include #include +#include #include #include #include @@ -167,9 +167,10 @@ template void bitset::count(const raft::resources& res, raft::device_scalar_view count_gpu_scalar) { + auto max_len = raft::make_host_scalar_view(&bitset_len_); auto values = raft::make_device_vector_view(bitset_.data(), n_elements()); - raft::detail::popc(res, values, bitset_len_, count_gpu_scalar); + raft::popc(res, values, max_len, count_gpu_scalar); } } // end namespace raft::core diff --git a/cpp/include/raft/core/detail/popc.cuh b/cpp/include/raft/core/detail/popc.cuh index
d74b68b715..20b4814216 100644 --- a/cpp/include/raft/core/detail/popc.cuh +++ b/cpp/include/raft/core/detail/popc.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -28,15 +29,15 @@ namespace raft::detail { * @tparam value_t the value type of the vector. * @tparam index_t the index type of vector and scalar. * - * @param[in] res raft handle for managing expensive resources - * @param[in] values Number of row in the matrix. + * @param[in] res RAFT handle for managing expensive resources + * @param[in] values Device vector view containing the values to be processed. * @param[in] max_len Maximum number of bits to count. - * @param[out] counter Number of bits that are set to 1. + * @param[out] counter Device scalar view to store the number of bits that are set to 1. */ template void popc(const raft::resources& res, device_vector_view values, - index_t max_len, + raft::host_scalar_view max_len, raft::device_scalar_view counter) { auto values_size = values.size(); @@ -46,7 +47,7 @@ void popc(const raft::resources& res, static constexpr index_t len_per_item = sizeof(value_t) * 8; - value_t tail_len = (max_len % len_per_item); + value_t tail_len = (max_len[0] % len_per_item); value_t tail_mask = tail_len ? (value_t)((value_t{1} << tail_len) - value_t{1}) : ~value_t{0}; raft::linalg::coalesced_reduction( res, diff --git a/cpp/include/raft/core/popc.hpp b/cpp/include/raft/core/popc.hpp new file mode 100644 index 0000000000..fc6b6bd177 --- /dev/null +++ b/cpp/include/raft/core/popc.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +namespace raft { + +/** + * @brief Count the number of bits that are set to 1 in a vector. Public entry point; forwards all arguments unchanged to detail::popc. + * + * @tparam value_t the value type of the vector. + * @tparam index_t the index type of vector and scalar. + * + * @param[in] res RAFT handle for managing expensive resources. + * @param[in] values Device vector view containing the values to be processed. + * @param[in] max_len Host scalar view holding the maximum number of bits to count. + * @param[out] counter Device scalar view that receives the number of bits that are set to 1. + */ +template +void popc(const raft::resources& res, + device_vector_view values, + raft::host_scalar_view max_len, + raft::device_scalar_view counter) +{ + detail::popc(res, values, max_len, counter); +} + +} // namespace raft diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index ff0518a4d0..88ce8a8263 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -121,6 +121,7 @@ if(BUILD_TESTS) core/math_host.cpp core/operators_device.cu core/operators_host.cpp + core/popc.cu core/handle.cpp core/interruptible.cu core/nvtx.cpp diff --git a/cpp/test/core/popc.cu b/cpp/test/core/popc.cu new file mode 100644 index 0000000000..83dda79b6e --- /dev/null +++ b/cpp/test/core/popc.cu @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +template +struct PopcInputs { /* One test case: an n_rows x n_cols bit grid with n_rows * n_cols * sparsity bits set. The case list in this file gives only three values per entry, so `owning` is left value-initialized, and nothing in this file reads it. */ + index_t n_rows; + index_t n_cols; + float sparsity; + bool owning; +}; + +template +class PopcTest : public ::testing::TestWithParam> { /* Parameterized test: builds a random bitmap with a known number of set bits and checks that raft::popc reports that number. */ + public: + PopcTest() + : stream(resource::get_cuda_stream(handle)), + params(::testing::TestWithParam>::GetParam()), + bits_d(0, stream) + { + } + + protected: + index_t create_bitmap(index_t m, index_t n, float sparsity, std::vector& bitmap) /* Zeroes `bitmap`, then sets m * n * sparsity distinct, randomly chosen bits (a draw that hits an already-set bit is retried) and returns that count. Seeded from std::random_device, so the pattern differs between runs. */ + { + index_t total = static_cast(m * n); + index_t num_ones = static_cast((total * 1.0f) * sparsity); + index_t res = num_ones; + + for (auto& item : bitmap) { + item = static_cast(0); + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(0, total - 1); /* NOTE(review): the {0, 0, 0.2} case makes this dis(0, -1); std::uniform_int_distribution requires a <= b. It is never sampled in that case because num_ones is 0, but consider guarding total == 0. */ + + while (num_ones > 0) { + index_t index = dis(gen); + + bits_t& element = bitmap[index / (8 * sizeof(bits_t))]; + index_t bit_position = index % (8 * sizeof(bits_t)); + + if (((element >> bit_position) & 1) == 0) { + element |= (static_cast(1) << bit_position); + num_ones--; + } + } + return res; + } + + void SetUp() override /* Generates the host bitmap, records the expected count in nnz_expected, and uploads the bits to bits_d. */ + { + index_t element = raft::ceildiv(params.n_rows * params.n_cols, index_t(sizeof(bits_t) * 8)); + std::vector bits_h(element); + + nnz_expected = create_bitmap(params.n_rows, params.n_cols, params.sparsity, bits_h); + bits_d.resize(bits_h.size(), stream); + update_device(bits_d.data(), bits_h.data(),
bits_h.size(), stream); + + resource::sync_stream(handle); + } + + void Run() /* Runs raft::popc with max_len = n_rows * n_cols, copies the device result back, and compares it with nnz_expected. */ + { + auto bits_view = + raft::make_device_vector_view(bits_d.data(), bits_d.size()); + + index_t max_len = params.n_rows * params.n_cols; + auto max_len_view = raft::make_host_scalar_view(&max_len); + + index_t nnz_actual_h = 0; + rmm::device_scalar nnz_actual_d(0, stream); + auto nnz_actual_view = raft::make_device_scalar_view(nnz_actual_d.data()); + + raft::popc(handle, bits_view, max_len_view, nnz_actual_view); + raft::copy(&nnz_actual_h, nnz_actual_d.data(), 1, stream); + resource::sync_stream(handle); + + ASSERT_EQ(nnz_expected, nnz_actual_h); + } + + protected: + raft::resources handle; + cudaStream_t stream; + + PopcInputs params; + rmm::device_uvector bits_d; + index_t nnz_expected; +}; + +using PopcTestI32 = PopcTest; +TEST_P(PopcTestI32, Result) { Run(); } + +template +const std::vector> popc_inputs = { /* Cases as {n_rows, n_cols, sparsity}; the second twelve entries repeat the first twelve. */ + {0, 0, 0.2}, + {10, 32, 0.4}, + {10, 3, 0.2}, + {32, 1024, 0.4}, + {1024, 1048576, 0.01}, + {1024, 1024, 0.4}, + {64 * 1024 + 10, 2, 0.3}, + {16, 16, 0.3}, + {17, 16, 0.3}, + {18, 16, 0.3}, + {32 + 9, 33, 0.2}, + {2, 33, 0.2}, + {0, 0, 0.2}, + {10, 32, 0.4}, + {10, 3, 0.2}, + {32, 1024, 0.4}, + {1024, 1048576, 0.01}, + {1024, 1024, 0.4}, + {64 * 1024 + 10, 2, 0.3}, + {16, 16, 0.3}, + {17, 16, 0.3}, + {18, 16, 0.3}, + {32 + 9, 33, 0.2}, + {2, 33, 0.2}, +}; + +INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32, ::testing::ValuesIn(popc_inputs)); + +} // namespace raft