Skip to content

Commit

Permalink
Migrate set retrieve to use the OA implementation (#637)
Browse files Browse the repository at this point in the history
This PR updates the legacy set retrieve to use the new open-addressing
solution. It enhances open-addressing retrieve by eliminating the use of
coalesced groups to reduce register pressure, resulting in approximately
10% to 40% speedups in multiset retrieve benchmarks.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Daniel Jünger <[email protected]>
  • Loading branch information
3 people authored Nov 21, 2024
1 parent 644e553 commit d829576
Show file tree
Hide file tree
Showing 8 changed files with 432 additions and 571 deletions.
1 change: 1 addition & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ ConfigureBench(STATIC_SET_BENCH
static_set/contains_bench.cu
static_set/find_bench.cu
static_set/insert_bench.cu
static_set/retrieve_bench.cu
static_set/retrieve_all_bench.cu
static_set/size_bench.cu
static_set/rehash_bench.cu)
Expand Down
96 changes: 96 additions & 0 deletions benchmarks/static_set/retrieve_bench.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark_defaults.hpp>
#include <benchmark_utils.hpp>

#include <cuco/static_set.cuh>
#include <cuco/utility/key_generator.cuh>

#include <nvbench/nvbench.cuh>

#include <thrust/device_vector.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/transform.h>

using namespace cuco::benchmark;
using namespace cuco::utility;

/**
 * @brief A benchmark evaluating `cuco::static_set::retrieve` performance
 *
 * Reads the "NumInputs", "Occupancy" and "MatchingRate" axes from the benchmark state, builds a
 * set holding the generated keys, and then times `retrieve` over the probe keys.
 *
 * @tparam Key Key type stored in the set
 * @tparam Dist Key distribution tag forwarded to `dist_from_state`
 *
 * @param state nvbench state providing the axis values and driving the timed region
 */
template <typename Key, typename Dist>
void static_set_retrieve(nvbench::state& state, nvbench::type_list<Key, Dist>)
{
  auto const num_keys      = state.get_int64("NumInputs");
  auto const occupancy     = state.get_float64("Occupancy");
  auto const matching_rate = state.get_float64("MatchingRate");

  // Capacity request derived from the key count and the target occupancy; the floating-point
  // quotient is truncated explicitly rather than through an implicit narrowing conversion.
  std::size_t const size = static_cast<std::size_t>(num_keys / occupancy);

  thrust::device_vector<Key> keys(num_keys);

  key_generator gen;
  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());

  state.add_element_count(num_keys);

  cuco::static_set<Key> set{size, cuco::empty_key<Key>{-1}};
  set.insert(keys.begin(), keys.end());

  // Apply the dropout only AFTER the set has been populated. If the dropout ran before the
  // insert, the replaced keys would be inserted as well, every probe would match, and the
  // "MatchingRate" axis would have no effect on the measurement.
  // NOTE(review): assumes `dropout` replaces roughly a `1 - matching_rate` fraction of the keys
  // with keys outside the generated distribution - confirm against `key_generator`.
  gen.dropout(keys.begin(), keys.end(), matching_rate);

  // Size the match buffer from the number of probe keys that actually hit the set.
  auto const output_size = set.count(keys.begin(), keys.end());
  thrust::device_vector<Key> output_match(output_size);
  // The probe-side output is not inspected by the benchmark, so it is discarded.
  auto output_probe_begin = thrust::discard_iterator{};

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    set.retrieve(
      keys.begin(), keys.end(), output_probe_begin, output_match.begin(), {launch.get_stream()});
  });
}

// Registers `static_set_retrieve` with the key types from `defaults::KEY_TYPE_RANGE` and a uniform
// key distribution. This variant sweeps the "Occupancy" axis over `defaults::OCCUPANCY_RANGE`;
// all other axes are pinned to a single default value.
NVBENCH_BENCH_TYPES(static_set_retrieve,
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
nvbench::type_list<distribution::uniform>))
.set_name("static_set_retrieve_uniform_occupancy")
.set_type_axes_names({"Key", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {defaults::N})
.add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE)
.add_float64_axis("MatchingRate", {defaults::MATCHING_RATE})
.add_int64_axis("Multiplicity", {defaults::MULTIPLICITY});

// Same benchmark and type axes as above. This variant sweeps the "MatchingRate" axis over
// `defaults::MATCHING_RATE_RANGE`; all other axes are pinned to a single default value.
NVBENCH_BENCH_TYPES(static_set_retrieve,
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
nvbench::type_list<distribution::uniform>))
.set_name("static_set_retrieve_uniform_matching_rate")
.set_type_axes_names({"Key", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {defaults::N})
.add_float64_axis("Occupancy", {defaults::OCCUPANCY})
.add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE)
.add_int64_axis("Multiplicity", {defaults::MULTIPLICITY});

// Same benchmark and type axes as above. This variant sweeps the "Multiplicity" axis over
// `defaults::MULTIPLICITY_RANGE`; all other axes are pinned to a single default value.
// NOTE(review): "Multiplicity" is not read directly by `static_set_retrieve`; presumably it is
// consumed through `dist_from_state` - confirm in benchmark_utils.hpp.
NVBENCH_BENCH_TYPES(static_set_retrieve,
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
nvbench::type_list<distribution::uniform>))
.set_name("static_set_retrieve_uniform_multiplicity")
.set_type_axes_names({"Key", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {defaults::N})
.add_float64_axis("Occupancy", {defaults::OCCUPANCY})
.add_float64_axis("MatchingRate", {defaults::MATCHING_RATE})
.add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE);
141 changes: 70 additions & 71 deletions include/cuco/detail/open_addressing/kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -415,77 +415,6 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
}
}

/**
 * @brief Retrieves the equivalent container elements of all keys in the range `[input_probe,
 * input_probe + n)`.
 *
 * If key `k = *(input_probe + i)` has one or more matches in the container, copies `k` to
 * `output_probe` and associated slot contents to `output_match`, respectively. The output order is
 * unspecified.
 *
 * The input range is split into contiguous per-block chunks whose length is a multiple of the
 * number of tiles per block; each block forwards its chunk to the device ref.
 *
 * @tparam IsOuter Flag indicating whether it's an outer retrieve or not
 * @tparam BlockSize The size of the thread block
 * @tparam InputProbeIt Device accessible input iterator
 * @tparam OutputProbeIt Device accessible output iterator whose `value_type` is
 * convertible to the `InputProbeIt`'s `value_type`
 * @tparam OutputMatchIt Device accessible output iterator whose `value_type` is
 * convertible to the container's `value_type`
 * @tparam AtomicCounter Integral atomic type that follows the same semantics as
 * `cuda::(std::)atomic(_ref)`
 * @tparam Ref Type of non-owning device ref allowing access to storage
 *
 * @param input_probe Beginning of the sequence of input keys
 * @param n Number of the keys to query
 * @param output_probe Beginning of the sequence of keys corresponding to matching elements in
 * `output_match`
 * @param output_match Beginning of the sequence of matching elements
 * @param atomic_counter Pointer to an atomic object of integral type that is used to count the
 * number of output elements
 * @param ref Non-owning container device ref used to access the slot storage
 */
template <bool IsOuter,
          int32_t BlockSize,
          class InputProbeIt,
          class OutputProbeIt,
          class OutputMatchIt,
          class AtomicCounter,
          class Ref>
CUCO_KERNEL __launch_bounds__(BlockSize) void retrieve(InputProbeIt input_probe,
                                                       cuco::detail::index_type n,
                                                       OutputProbeIt output_probe,
                                                       OutputMatchIt output_match,
                                                       AtomicCounter* atomic_counter,
                                                       Ref ref)
{
  namespace cg = cooperative_groups;

  auto const block              = cg::this_thread_block();
  auto constexpr tiles_in_block = BlockSize / Ref::cg_size;
  // make sure all but the last block are always occupied
  auto const items_per_block = detail::int_div_ceil(n, tiles_in_block * gridDim.x) * tiles_in_block;

  auto const block_begin_offset = block.group_index().x * items_per_block;
  auto const block_end_offset   = min(n, block_begin_offset + items_per_block);

  // Blocks whose chunk starts at or past `n` have nothing to do.
  if (block_begin_offset < block_end_offset) {
    // `ref` has a dependent type, so the member templates need the `template` disambiguator to be
    // parsed as template calls by every conforming compiler.
    if constexpr (IsOuter) {
      ref.template retrieve_outer<BlockSize>(block,
                                             input_probe + block_begin_offset,
                                             input_probe + block_end_offset,
                                             output_probe,
                                             output_match,
                                             *atomic_counter);
    } else {
      ref.template retrieve<BlockSize>(block,
                                       input_probe + block_begin_offset,
                                       input_probe + block_end_offset,
                                       output_probe,
                                       output_match,
                                       *atomic_counter);
    }
  }
}

/**
* @brief Inserts all elements in the range `[first, last)`.
*
Expand Down Expand Up @@ -642,6 +571,76 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first,
if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); }
}

/**
 * @brief Retrieves the equivalent container elements of all keys in the range `[input_probe,
 * input_probe + n)`.
 *
 * If key `k = *(input_probe + i)` has one or more matches in the container, copies `k` to
 * `output_probe` and associated slot contents to `output_match`, respectively. The output order is
 * unspecified.
 *
 * Each thread block handles at most `BlockSize / Ref::cg_size` consecutive input keys, starting
 * at `block index * (BlockSize / Ref::cg_size)`.
 * NOTE(review): this presumes the launch grid has enough blocks to cover all `n` keys - confirm
 * against the host-side launcher.
 *
 * @tparam IsOuter Flag indicating whether it's an outer retrieve or not
 * @tparam BlockSize The size of the thread block
 * @tparam InputProbeIt Device accessible input iterator
 * @tparam OutputProbeIt Device accessible output iterator whose `value_type` is
 * convertible to the `InputProbeIt`'s `value_type`
 * @tparam OutputMatchIt Device accessible output iterator whose `value_type` is
 * convertible to the container's `value_type`
 * @tparam AtomicCounter Integral atomic type that follows the same semantics as
 * `cuda::(std::)atomic(_ref)`
 * @tparam Ref Type of non-owning device ref allowing access to storage
 *
 * @param input_probe Beginning of the sequence of input keys
 * @param n Number of the keys to query
 * @param output_probe Beginning of the sequence of keys corresponding to matching elements in
 * `output_match`
 * @param output_match Beginning of the sequence of matching elements
 * @param atomic_counter Pointer to an atomic object of integral type that is used to count the
 * number of output elements
 * @param ref Non-owning container device ref used to access the slot storage
 */
template <bool IsOuter,
          int32_t BlockSize,
          class InputProbeIt,
          class OutputProbeIt,
          class OutputMatchIt,
          class AtomicCounter,
          class Ref>
CUCO_KERNEL void retrieve(InputProbeIt input_probe,
                          cuco::detail::index_type n,
                          OutputProbeIt output_probe,
                          OutputMatchIt output_match,
                          AtomicCounter* atomic_counter,
                          Ref ref)
{
  namespace cg = cooperative_groups;

  auto const block              = cg::this_thread_block();
  auto constexpr tiles_in_block = BlockSize / Ref::cg_size;

  // Widen the block index to `index_type` BEFORE scaling it. `group_index().x` is a 32-bit
  // unsigned value, so multiplying first and casting afterwards would wrap around once the
  // offset exceeds the 32-bit range.
  auto const block_begin_offset =
    static_cast<cuco::detail::index_type>(block.group_index().x) * tiles_in_block;
  auto const block_end_offset =
    min(n, static_cast<cuco::detail::index_type>(block_begin_offset + tiles_in_block));

  // Blocks whose chunk starts at or past `n` have nothing to do.
  if (block_begin_offset < block_end_offset) {
    // `ref` has a dependent type, so the member templates need the `template` disambiguator to be
    // parsed as template calls by every conforming compiler.
    if constexpr (IsOuter) {
      ref.template retrieve_outer<BlockSize>(block,
                                             input_probe + block_begin_offset,
                                             input_probe + block_end_offset,
                                             output_probe,
                                             output_match,
                                             atomic_counter);
    } else {
      ref.template retrieve<BlockSize>(block,
                                       input_probe + block_begin_offset,
                                       input_probe + block_end_offset,
                                       output_probe,
                                       output_match,
                                       atomic_counter);
    }
  }
}

/**
* @brief Calculates the number of filled slots for the given bucket storage.
*
Expand Down
Loading

0 comments on commit d829576

Please sign in to comment.