Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Opt] Expose the detail::popc as public API #2346

Merged
merged 4 commits into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion cpp/bench/prims/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,13 @@ endfunction()

if(BUILD_PRIMS_BENCH)
ConfigureBench(
NAME CORE_BENCH PATH core/bitset.cu core/copy.cu main.cpp
NAME
CORE_BENCH
PATH
core/bitset.cu
core/copy.cu
core/popc.cu
main.cpp
)

ConfigureBench(
Expand Down
127 changes: 127 additions & 0 deletions cpp/bench/prims/core/popc.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <common/benchmark.hpp>

#include <raft/core/popc.hpp>

namespace raft::bench::core {

/**
 * @brief Parameters describing one popc benchmark case.
 *
 * The bit sequence that is benchmarked is `n_rows * n_cols` bits long.
 *
 * @tparam index_t integral type used for the extents.
 */
template <typename index_t>
struct PopcInputs {
  /// First extent of the logical bit matrix.
  index_t n_rows;
  /// Second extent of the logical bit matrix.
  index_t n_cols;
  /// Fraction of the `n_rows * n_cols` bits that the benchmark sets to 1.
  float sparsity;
};

/**
 * @brief Write a benchmark case to a stream as `<n_rows>#<n_cols>#<sparsity>`.
 *
 * @param[in,out] os     Stream that receives the text.
 * @param[in]     params Benchmark case to print.
 * @return The stream `os`, so that insertions can be chained.
 */
template <typename index_t>
inline auto operator<<(std::ostream& os, const PopcInputs<index_t>& params) -> std::ostream&
{
  return os << params.n_rows << "#" << params.n_cols << "#" << params.sparsity;
}

/**
 * @brief Benchmark fixture that times `raft::popc` on a randomly populated bit vector.
 *
 * The bit vector holds `n_rows * n_cols` logical bits packed into `bits_t` words.
 *
 * @tparam index_t integral type used for sizes and for the resulting count.
 * @tparam bits_t  unsigned word type the bits are packed into.
 */
template <typename index_t, typename bits_t = uint32_t>
struct popc_bench : public fixture {
  /**
   * @brief Allocate the device buffers needed for one benchmark case.
   *
   * @param[in] p Benchmark case (extents and fraction of set bits).
   */
  popc_bench(const PopcInputs<index_t>& p)
    : params(p),
      // Number of bits_t words needed to hold n_rows * n_cols bits.
      n_element(raft::ceildiv(params.n_rows * params.n_cols, index_t(sizeof(bits_t) * 8))),
      bits_d{raft::make_device_vector<bits_t, index_t>(res, n_element)},
      nnz_actual_d{raft::make_device_scalar<index_t>(res, 0)}
  {
  }

  /**
   * @brief Fill `bitmap` with zeros and then set randomly chosen, distinct bits to 1.
   *
   * Bit positions are drawn uniformly from `[0, m * n)`; a draw that hits an already-set bit is
   * rejected and retried. The generator is seeded from `std::random_device`, so the bit pattern
   * differs between runs.
   *
   * @param[in]  m        First extent of the logical bit matrix.
   * @param[in]  n        Second extent of the logical bit matrix.
   * @param[in]  sparsity Fraction of the `m * n` bits to set; values outside `[0, 1]` (and NaN)
   *                      are clamped so that the sampling loop always terminates.
   * @param[out] bitmap   Host words to populate. Must already hold at least
   *                      `ceildiv(m * n, 8 * sizeof(bits_t))` elements; accesses are not
   *                      bounds-checked.
   * @return Number of bits that were set to 1.
   */
  index_t create_bitmap(index_t m, index_t n, float sparsity, std::vector<bits_t>& bitmap)
  {
    constexpr index_t bits_per_item = static_cast<index_t>(8 * sizeof(bits_t));

    bitmap.assign(bitmap.size(), bits_t{0});

    const index_t total = m * n;
    // uniform_int_distribution(0, total - 1) requires a non-empty range.
    if (total <= 0) { return 0; }

    float fraction = sparsity;
    if (!(fraction > 0.0f)) { fraction = 0.0f; }  // also catches NaN
    if (fraction > 1.0f) { fraction = 1.0f; }

    index_t num_ones = static_cast<index_t>((total * 1.0f) * fraction);
    // The float product may round above `total`; asking for more bits than exist would make the
    // rejection loop below spin forever.
    if (num_ones > total) { num_ones = total; }
    const index_t requested = num_ones;

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<index_t> dis(0, total - 1);

    while (num_ones > 0) {
      const index_t index        = dis(gen);
      bits_t& element            = bitmap[index / bits_per_item];
      const index_t bit_position = index % bits_per_item;

      // Build the mask in the word type: shifting an index_t-typed 1 is undefined once
      // bit_position reaches the width of index_t (e.g. 64-bit words with 32-bit indices).
      const bits_t mask = static_cast<bits_t>(bits_t{1} << bit_position);

      if ((element & mask) == 0) {
        element |= mask;
        num_ones--;
      }
    }
    return requested;
  }

  /**
   * @brief Build the input bitmap, upload it to the device and time `raft::popc`.
   *
   * @param[in,out] state Google Benchmark state that drives the timing loop.
   */
  void run_benchmark(::benchmark::State& state) override
  {
    std::ostringstream label_stream;
    label_stream << params;
    state.SetLabel(label_stream.str());

    std::vector<bits_t> bits_h(n_element);
    auto stream = raft::resource::get_cuda_stream(res);

    create_bitmap(params.n_rows, params.n_cols, params.sparsity, bits_h);
    update_device(bits_d.data_handle(), bits_h.data(), bits_h.size(), stream);

    // Ensure the upload has finished before the timed loop starts.
    resource::sync_stream(res);

    // NOTE(review): the buffers are allocated and uploaded through `res`, while popc below is
    // launched with the fixture's `handle` - confirm that using two resource objects is intended.
    loop_on_state(state, [this]() {
      auto bits_view =
        raft::make_device_vector_view<const bits_t, index_t>(bits_d.data_handle(), bits_d.size());

      index_t max_len      = params.n_rows * params.n_cols;
      auto max_len_view    = raft::make_host_scalar_view<index_t>(&max_len);
      auto nnz_actual_view = nnz_actual_d.view();
      raft::popc(this->handle, bits_view, max_len_view, nnz_actual_view);
    });
  }

 private:
  // Declaration order matters: `res`, `params` and `n_element` are used by the initializers of
  // the members declared after them.
  raft::resources res;
  PopcInputs<index_t> params;
  index_t n_element;

  raft::device_vector<bits_t, index_t> bits_d;
  raft::device_scalar<index_t> nnz_actual_d;
};

/**
 * @brief Benchmark cases for popc, each given as `{n_rows, n_cols, sparsity}`.
 *
 * Several cases appear more than once; the order and multiplicity of the entries are kept as is.
 *
 * @tparam index_t integral type used for the extents of each case.
 */
template <typename index_t>
const std::vector<PopcInputs<index_t>> popc_input_vecs{
  // Assorted shapes.
  {2, 131072, 0.4},
  {8, 131072, 0.5},
  {16, 131072, 0.2},
  {2, 8192, 0.4},
  {16, 8192, 0.5},
  {128, 8192, 0.2},

  // 1024 x 8192 with sparsity 0.1, eight times.
  {1024, 8192, 0.1},
  {1024, 8192, 0.1},
  {1024, 8192, 0.1},
  {1024, 8192, 0.1},
  {1024, 8192, 0.1},
  {1024, 8192, 0.1},
  {1024, 8192, 0.1},
  {1024, 8192, 0.1},

  // 1024 x 8192 cycling through sparsity 0.4, 0.5, 0.2 (four cycles).
  {1024, 8192, 0.4},
  {1024, 8192, 0.5},
  {1024, 8192, 0.2},
  {1024, 8192, 0.4},
  {1024, 8192, 0.5},
  {1024, 8192, 0.2},
  {1024, 8192, 0.4},
  {1024, 8192, 0.5},
  {1024, 8192, 0.2},
  {1024, 8192, 0.4},
  {1024, 8192, 0.5},
  {1024, 8192, 0.2},

  // 1024 x 8192 cycling through sparsity 0.5, 0.2, 0.4 (eight entries).
  {1024, 8192, 0.5},
  {1024, 8192, 0.2},
  {1024, 8192, 0.4},
  {1024, 8192, 0.5},
  {1024, 8192, 0.2},
  {1024, 8192, 0.4},
  {1024, 8192, 0.5},
  {1024, 8192, 0.2}};

// popc benchmark instantiated with 64-bit signed indices; bits_t keeps its default (uint32_t).
using PopcBenchI64 = popc_bench<int64_t>;

// Hands the fixture type and the list of int64_t benchmark cases to the registration macro.
// NOTE(review): presumably one benchmark is registered per entry of popc_input_vecs; the meaning
// of the empty second argument is defined by RAFT_BENCH_REGISTER - confirm against its definition.
RAFT_BENCH_REGISTER(PopcBenchI64, "", popc_input_vecs<int64_t>);

} // namespace raft::bench::core
5 changes: 3 additions & 2 deletions cpp/include/raft/core/bitset.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
#pragma once

#include <raft/core/bitset.hpp>
#include <raft/core/detail/popc.cuh>
#include <raft/core/device_container_policy.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/popc.hpp>
#include <raft/core/resource/thrust_policy.hpp>
#include <raft/core/resources.hpp>
#include <raft/linalg/map.cuh>
Expand Down Expand Up @@ -167,9 +167,10 @@ template <typename bitset_t, typename index_t>
void bitset<bitset_t, index_t>::count(const raft::resources& res,
raft::device_scalar_view<index_t> count_gpu_scalar)
{
auto max_len = raft::make_host_scalar_view<index_t>(&bitset_len_);
auto values =
raft::make_device_vector_view<const bitset_t, index_t>(bitset_.data(), n_elements());
raft::detail::popc(res, values, bitset_len_, count_gpu_scalar);
raft::popc(res, values, max_len, count_gpu_scalar);
}

} // end namespace raft::core
11 changes: 6 additions & 5 deletions cpp/include/raft/core/detail/popc.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include <raft/core/detail/mdspan_util.cuh>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/host_mdspan.hpp>
#include <raft/core/resources.hpp>
#include <raft/linalg/coalesced_reduction.cuh>

Expand All @@ -28,15 +29,15 @@ namespace raft::detail {
* @tparam value_t the value type of the vector.
* @tparam index_t the index type of vector and scalar.
*
* @param[in] res raft handle for managing expensive resources
* @param[in] values Number of row in the matrix.
* @param[in] res RAFT handle for managing expensive resources
* @param[in] values Device vector view containing the values to be processed.
* @param[in] max_len Maximum number of bits to count.
* @param[out] counter Number of bits that are set to 1.
* @param[out] counter Device scalar view to store the number of bits that are set to 1.
*/
template <typename value_t, typename index_t>
void popc(const raft::resources& res,
device_vector_view<value_t, index_t> values,
index_t max_len,
raft::host_scalar_view<index_t> max_len,
raft::device_scalar_view<index_t> counter)
{
auto values_size = values.size();
Expand All @@ -46,7 +47,7 @@ void popc(const raft::resources& res,

static constexpr index_t len_per_item = sizeof(value_t) * 8;

value_t tail_len = (max_len % len_per_item);
value_t tail_len = (max_len[0] % len_per_item);
value_t tail_mask = tail_len ? (value_t)((value_t{1} << tail_len) - value_t{1}) : ~value_t{0};
raft::linalg::coalesced_reduction(
res,
Expand Down
41 changes: 41 additions & 0 deletions cpp/include/raft/core/popc.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once
#include <raft/core/detail/popc.cuh>
namespace raft {

/**
* @brief Count the number of bits that are set to 1 in a vector.
*
* @tparam value_t the value type of the vector.
* @tparam index_t the index type of vector and scalar.
*
* @param[in] res RAFT handle for managing expensive resources
* @param[in] values Device vector view containing the values to be processed.
* @param[in] max_len Host scalar view holding the maximum number of bits to count.
* @param[out] counter Device scalar view to store the number of bits that are set to 1.
*/
template <typename value_t, typename index_t>
void popc(const raft::resources& res,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be quite honest, I'm not sure this is something that belongs in raft::core.

Also, I'm a little confused by the max_len- can you explain why this additional argument is needed? This API seems a little implementation-specific, which makes me wonder if it should be more strongly coupled to the bitset/bitmap.

Copy link
Member Author

@rhdong rhdong May 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The original wrapper around popc is here: https://github.com/rapidsai/raft/blob/branch-24.08/cpp/include/raft/core/detail/mdspan_util.cuh#L48. However, it can only count the nonzero bits of a single int32/int64, while this one operates on an array of integers. Do you have a recommendation for the namespace? raft::linalg might be another option.

The max_len parameter is required to indicate the maximum number of bits to process. For example, if 100 bits need to be counted, at least four uint32_t values are needed to represent them (128 bits physically). In that situation, max_len or a similar parameter is needed to tell popc to stop at the right bit.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This API is mainly for cleaner code. detail::popc contains at least 10 lines of code because it has to handle the corner cases. If we used the original code, we would have to copy that block everywhere it is needed, which is ugly.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is a computational function, I don't think it belongs in raft/core. I question whether or not it actually belongs in raft/matrix or in raft/utils, though. Hmm.

Copy link
Member Author

@rhdong rhdong Jul 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer raft/utils as it seems more reasonable. If you agree, I will move it there. @cjnolet

device_vector_view<value_t, index_t> values,
raft::host_scalar_view<index_t> max_len,
raft::device_scalar_view<index_t> counter)
{
detail::popc(res, values, max_len, counter);
}

} // namespace raft
1 change: 1 addition & 0 deletions cpp/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ if(BUILD_TESTS)
core/math_host.cpp
core/operators_device.cu
core/operators_host.cpp
core/popc.cu
core/handle.cpp
core/interruptible.cu
core/nvtx.cpp
Expand Down
Loading
Loading