Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add rehash functionality #380

Merged
merged 14 commits into from
Oct 11, 2023
3 changes: 2 additions & 1 deletion benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ ConfigureBench(STATIC_SET_BENCH
hash_table/static_set/find_bench.cu
hash_table/static_set/insert_bench.cu
hash_table/static_set/retrieve_all_bench.cu
hash_table/static_set/size_bench.cu)
hash_table/static_set/size_bench.cu
hash_table/static_set/rehash_bench.cu)

###################################################################################################
# - static_map benchmarks -------------------------------------------------------------------------
Expand Down
60 changes: 60 additions & 0 deletions benchmarks/hash_table/static_set/rehash_bench.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <defaults.hpp>
#include <utils.hpp>

#include <cuco/static_set.cuh>
#include <cuco/utility/key_generator.hpp>

#include <nvbench/nvbench.cuh>

#include <thrust/device_vector.h>

/**
* @brief A benchmark evaluating `cuco::static_set::rehash` performance
*/
template <typename Key, typename Dist>
void static_set_rehash(nvbench::state& state, nvbench::type_list<Key, Dist>)
{
std::size_t const capacity = state.get_int64_or_default("Capacity", cuco::benchmark::defaults::N);
auto const occupancy =
state.get_float64_or_default("Occupancy", cuco::benchmark::defaults::OCCUPANCY);

std::size_t const num_keys = capacity * occupancy;

thrust::device_vector<Key> keys(num_keys); // slots per second

cuco::utility::key_generator gen;
gen.generate(cuco::benchmark::dist_from_state<Dist>(state), keys.begin(), keys.end());

state.add_element_count(capacity);

cuco::experimental::static_set<Key> set{capacity, cuco::empty_key<Key>{-1}};

set.insert(keys.begin(), keys.end());

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { set.rehash({launch.get_stream()}); });
}

NVBENCH_BENCH_TYPES(static_set_rehash,
NVBENCH_TYPE_AXES(cuco::benchmark::defaults::KEY_TYPE_RANGE,
nvbench::type_list<cuco::utility::distribution::unique>))
.set_name("static_set_rehash_unique_occupancy")
.set_type_axes_names({"Key", "Distribution"})
.set_max_noise(cuco::benchmark::defaults::MAX_NOISE)
.add_float64_axis("Occupancy", cuco::benchmark::defaults::OCCUPANCY_RANGE);
47 changes: 47 additions & 0 deletions include/cuco/detail/common_kernels.cuh
sleeepyjack marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,53 @@ __global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count)
if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); }
}

template <int32_t BlockSize, typename ContainerRef, typename Predicate>
__global__ void rehash(typename ContainerRef::storage_ref_type storage_ref,
ContainerRef container_ref,
Predicate is_filled)
{
namespace cg = cooperative_groups;

__shared__ typename ContainerRef::value_type buffer[BlockSize];
__shared__ unsigned int buffer_size;

auto constexpr cg_size = ContainerRef::cg_size;
auto const block = cg::this_thread_block();
auto const tile = cg::tiled_partition<cg_size>(block);

auto const thread_rank = block.thread_rank();
auto constexpr tiles_per_block = BlockSize / cg_size; // tile.meta_group_size() but constexpr
auto const tile_rank = tile.meta_group_rank();
auto const loop_stride = cuco::detail::grid_stride();
auto idx = cuco::detail::global_thread_id();
auto const n = storage_ref.num_windows();

while (idx - thread_rank < n) {
if (thread_rank == 0) { buffer_size = 0; }
block.sync();

// gather values in shmem buffer
if (idx < n) {
auto const window = storage_ref[idx];

for (auto const& slot : window) {
if (is_filled(slot)) { buffer[atomicAdd_block(&buffer_size, 1)] = slot; }
}
}
block.sync();

auto const local_buffer_size = buffer_size;

// insert from shmem buffer into the container
for (auto tidx = tile_rank; tidx < local_buffer_size; tidx += tiles_per_block) {
container_ref.insert(tile, buffer[tidx]);
}
block.sync();

idx += loop_stride;
}
}

} // namespace detail
} // namespace experimental
} // namespace cuco
115 changes: 115 additions & 0 deletions include/cuco/detail/open_addressing/open_addressing_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cuco/detail/storage/counter_storage.cuh>
#include <cuco/detail/utility/cuda.hpp>
#include <cuco/extent.cuh>
#include <cuco/operator.hpp>
#include <cuco/probing_scheme.cuh>
#include <cuco/storage.cuh>
#include <cuco/utility/traits.hpp>
Expand Down Expand Up @@ -624,6 +625,120 @@ class open_addressing_impl {
return counter.load_to_host(stream);
}

/**
* @brief Regenerates the container
*
* @note This function synchronizes the given stream. For asynchronous execution use
* `rehash_async`.
*
* @tparam Container The container type this function operates on
* @tparam Predicate Type of predicate indicating if the given slot is filled
*
* @param extent The container's new `window_extent` after this operation took place
* @param container The container to be rehashed
* @param is_filled Predicate indicating if the given slot is filled
* @param stream CUDA stream used for this operation
*/
template <typename Container, typename Predicate>
void rehash(Container const& container, Predicate const& is_filled, cuda_stream_ref stream)
{
this->rehash_async(container, is_filled, stream);
stream.synchronize();
}

/**
* @brief Asynchronously reserves at least the specified number of slots and regenerates the
* container
*
* @note Changes the number of windows to a value that is not less than `extent`, then
* rehashes the container, i.e. puts the elements into appropriate slots considering
* that the total number of slots has changed.
*
* @note This function synchronizes the given stream. For asynchronous execution use
* `rehash_async`.
*
* @note Behavior is undefined if the desired `extent` is insufficient to store all of the
* contained elements.
*
* @note This function is not available if the conatiner's `extent_type` is static.
*
* @tparam Container The container type this function operates on
* @tparam Predicate Type of predicate indicating if the given slot is filled
*
* @param extent The container's new `window_extent` after this operation took place
* @param container The container to be rehashed
* @param is_filled Predicate indicating if the given slot is filled
* @param stream CUDA stream used for this operation
*/
template <typename Container, typename Predicate>
void rehash(extent_type extent,
Container const& container,
Predicate const& is_filled,
cuda_stream_ref stream)
{
this->rehash_async(extent, container, is_filled, stream);
stream.synchronize();
}

/**
* @brief Asynchronously regenerates the container
*
* @tparam Container The container type this function operates on
* @tparam Predicate Type of predicate indicating if the given slot is filled
*
* @param extent The container's new `window_extent` after this operation took place
* @param container The container to be rehashed
* @param is_filled Predicate indicating if the given slot is filled
* @param stream CUDA stream used for this operation
*/
template <typename Container, typename Predicate>
void rehash_async(Container const& container, Predicate const& is_filled, cuda_stream_ref stream)
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
{
this->rehash_async(this->storage_.window_extent(), container, is_filled, stream);
}

/**
* @brief Asynchronously reserves at least the specified number of slots and regenerates the
* container
*
* @note Changes the number of windows to a value that is not less than `extent`, then
* rehashes the container, i.e. puts the elements into appropriate slots considering
* that the total number of slots has changed.
*
* @note Behavior is undefined if the desired `extent` is insufficient to store all of the
* contained elements.
*
* @note This function is not available if the conatiner's `extent_type` is static.
*
* @tparam Container The container type this function operates on
* @tparam Predicate Type of predicate indicating if the given slot is filled
*
* @param extent The container's new `window_extent` after this operation took place
* @param container The container to be rehashed
* @param is_filled Predicate indicating if the given slot is filled
* @param stream CUDA stream used for this operation
*/
template <typename Container, typename Predicate>
void rehash_async(extent_type extent,
Container const& container,
Predicate const& is_filled,
cuda_stream_ref stream)
{
auto const old_storage = std::move(this->storage_);
new (&storage_) storage_type{extent, this->allocator()};
this->clear_async(stream);

auto const num_windows = old_storage.num_windows();
if (num_windows == 0) { return; }

auto constexpr block_size = cuco::detail::default_block_size();
auto constexpr stride = cuco::detail::default_stride();
auto const grid_size = cuco::detail::grid_size(num_windows, 1, stride, block_size);

detail::rehash<block_size><<<grid_size, block_size, 0, stream>>>(
old_storage.ref(), container.ref(op::insert), is_filled);
}

/**
* @brief Gets the maximum number of elements the container can hold.
*
Expand Down
66 changes: 66 additions & 0 deletions include/cuco/detail/static_map/static_map.inl
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,72 @@ static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
return std::make_pair(keys_out + num, values_out + num);
}

template <class Key,
class T,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
cuda_stream_ref stream)
{
auto const is_filled = static_map_ns::detail::slot_is_filled<Key, T>(this->empty_key_sentinel(),
this->erased_key_sentinel());
this->impl_->rehash(*this, is_filled, stream);
}

template <class Key,
class T,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
size_type capacity, cuda_stream_ref stream)
{
auto const is_filled = static_map_ns::detail::slot_is_filled<Key, T>(this->empty_key_sentinel(),
this->erased_key_sentinel());
auto const extent = make_window_extent<static_map>(capacity);
this->impl_->rehash(extent, *this, is_filled, stream);
}

template <class Key,
class T,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
cuda_stream_ref stream)
{
auto const is_filled = static_map_ns::detail::slot_is_filled<Key, T>(this->empty_key_sentinel(),
this->erased_key_sentinel());
this->impl_->rehash_async(*this, is_filled, stream);
}

template <class Key,
class T,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
size_type capacity, cuda_stream_ref stream)
{
auto const is_filled = static_map_ns::detail::slot_is_filled<Key, T>(this->empty_key_sentinel(),
this->erased_key_sentinel());
auto const extent = make_window_extent<static_map>(capacity);
this->impl_->rehash_async(extent, *this, is_filled, stream);
}

template <class Key,
class T,
class Extent,
Expand Down
62 changes: 62 additions & 0 deletions include/cuco/detail/static_set/static_set.inl
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,68 @@ OutputIt static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stor
return impl_->retrieve_all(begin, output_begin, is_filled, stream);
}

template <class Key,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
cuda_stream_ref stream)
{
auto const is_filled =
static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel());
this->impl_->rehash(*this, is_filled, stream);
}

template <class Key,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
size_type capacity, cuda_stream_ref stream)
{
auto const is_filled =
static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel());
auto const extent = make_window_extent<static_set>(capacity);
this->impl_->rehash(extent, *this, is_filled, stream);
}

template <class Key,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
cuda_stream_ref stream)
{
auto const is_filled =
static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel());
this->impl_->rehash_async(*this, is_filled, stream);
}

template <class Key,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
size_type capacity, cuda_stream_ref stream)
{
auto const is_filled =
static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel());
auto const extent = make_window_extent<static_set>(capacity);
this->impl_->rehash_async(extent, *this, is_filled, stream);
}

template <class Key,
class Extent,
cuda::thread_scope Scope,
Expand Down
Loading