Add experimental static_map #314

Merged · 56 commits · Jun 23, 2023

Changes from 53 commits
91e621c
Clean up static_set with a common open_addressing_base class
PointKernel Jun 4, 2023
d347995
Use PIMPL instead of inheritance
PointKernel Jun 4, 2023
474b110
Add experimental static_map class
PointKernel Jun 4, 2023
1fe9468
Minor cleanup
PointKernel Jun 4, 2023
31fcc88
Add empty value sentinel getter
PointKernel Jun 5, 2023
7b6e791
Move retrieve_all detail to the common open_addressing_impl class
PointKernel Jun 5, 2023
ffeaf75
Update docs
PointKernel Jun 5, 2023
90546fe
Move insert details to open_addressing_impl for static_set
PointKernel Jun 5, 2023
af8fb7c
Update docs
PointKernel Jun 5, 2023
55bd286
Move static_set contains details to the common open_addressing_impl c…
PointKernel Jun 5, 2023
2a3cc2f
Clean up static_set find with common open_addressing_impl class
PointKernel Jun 5, 2023
6d1c3f6
Clean up static map host functions with open_addressing_impl
PointKernel Jun 5, 2023
b220fc9
Merge remote-tracking branch 'upstream/dev' into new-static-map-v2
PointKernel Jun 6, 2023
f0bf8b7
Add open_addressing_ref_impl class
PointKernel Jun 6, 2023
2d68252
Move static_set_ref find and contains to the common open_addressing_r…
PointKernel Jun 6, 2023
39cb6ce
Clean up static_set_ref insert functions with open_addressing_ref_impl
PointKernel Jun 6, 2023
8bb5e3a
Clean up equal wrapper
PointKernel Jun 6, 2023
e402d4a
Fix logic issues with slot sentinel
PointKernel Jun 6, 2023
478bc2f
Clean up static_map_ref with open_addressing_ref_impl
PointKernel Jun 6, 2023
b5d5ef3
Update docs
PointKernel Jun 6, 2023
8961718
Add pair equal operator
PointKernel Jun 7, 2023
0afb43d
Remove static_set::find benchmark
PointKernel Jun 7, 2023
25d1255
Move host bulk find out of common headers
PointKernel Jun 7, 2023
2f0a506
Remove host bulk static_set::find
PointKernel Jun 7, 2023
86167a5
Add tests for experimental static_map + fix various minor issues
PointKernel Jun 7, 2023
bf3f91c
Minor doc fix
PointKernel Jun 7, 2023
02fbdb1
Add static_map kernel file
PointKernel Jun 7, 2023
a835efb
Update static_map::find docs
PointKernel Jun 8, 2023
5a3f030
Add static_set::find back
PointKernel Jun 8, 2023
1824aad
Add set find kernel
PointKernel Jun 8, 2023
84257ae
Add static_set::find benchmark
PointKernel Jun 8, 2023
f378a6d
Clean up size and retrieve_all implementations
PointKernel Jun 8, 2023
b19603b
Remove unused empty slot sentinel
PointKernel Jun 8, 2023
2adec67
Minor cleanups
PointKernel Jun 8, 2023
7797f2b
Make get_slot members constexpr and noexcept
PointKernel Jun 12, 2023
3a2b346
Minor kernel cleanups with auto const
PointKernel Jun 12, 2023
856c6fc
Clean up equal wrapper: noexcept, doc improvement
PointKernel Jun 12, 2023
397427a
static_map detail cleanups: fix docs, constexpr and noexcept when pos…
PointKernel Jun 12, 2023
68d7936
various static_set detail cleanups
PointKernel Jun 12, 2023
7be04e2
Update copyright year
PointKernel Jun 12, 2023
8978dbd
Updates: use struct instead of cuco pair for intermediate results, fi…
PointKernel Jun 12, 2023
58f2a74
Remove redundant allocator data member
PointKernel Jun 12, 2023
e19d5f8
Rename predicate as key_eq to align with STL naming convention
PointKernel Jun 12, 2023
fe9818f
Apply suggestions from code review
PointKernel Jun 13, 2023
f4f8369
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jun 13, 2023
37f4166
Rename impl data members
PointKernel Jun 13, 2023
b5b4d66
Change the default allocator types
PointKernel Jun 13, 2023
1cfa9dd
Minor doc updates
PointKernel Jun 13, 2023
484a72e
Add error check for non-CG device APIs to prevent misuse
PointKernel Jun 14, 2023
96ae321
Update error message
PointKernel Jun 14, 2023
f6dca60
Reorder header groups
PointKernel Jun 14, 2023
02c2584
Always pass CG by reference
PointKernel Jun 14, 2023
7492471
Clean up redundant header inclusions
PointKernel Jun 20, 2023
d06ce04
Make functor constructions explicit
PointKernel Jun 21, 2023
161dc7b
Clean up docs and comment
PointKernel Jun 21, 2023
be75468
Update static asserts
PointKernel Jun 22, 2023
5 changes: 4 additions & 1 deletion examples/static_set/device_ref_example.cu
@@ -51,8 +51,11 @@ __global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, O
   int64_t const loop_stride = gridDim.x * blockDim.x;
   int64_t idx               = blockDim.x * blockIdx.x + threadIdx.x;
 
+  auto const tile =
+    cooperative_groups::tiled_partition<SetRef::cg_size>(cooperative_groups::this_thread_block());
+
   while (idx < n) {
-    found[idx] = set.contains(*(keys + idx));
+    found[idx] = set.contains(tile, *(keys + idx));
     idx += loop_stride;
   }
 }
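For context, a typical launch of such a CG-based kernel supplies SetRef::cg_size threads per key. A minimal sketch (set_ref, keys, found, and num_keys are assumed names, not part of this diff):

constexpr int block_size = 128;
// cg_size threads cooperate per key, so scale the grid accordingly
auto const grid_size = (num_keys * SetRef::cg_size + block_size - 1) / block_size;
custom_contains<<<grid_size, block_size>>>(set_ref, keys.begin(), num_keys, found.begin());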
55 changes: 55 additions & 0 deletions include/cuco/detail/common_functors.cuh
@@ -0,0 +1,55 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

namespace cuco {
namespace experimental {
namespace detail {

/**
* @brief Device functor returning the content of the slot indexed by `idx`.
*
* @tparam StorageRef Storage ref type
*/
template <typename StorageRef>
struct get_slot {
StorageRef storage_; ///< Storage ref

/**
* @brief Constructs `get_slot` functor with the given storage ref.
*
* @param s Input storage ref
*/
constexpr get_slot(StorageRef s) noexcept : storage_{s} {}

/**
* @brief Accesses the slot content with the given index.
*
* @param idx The slot index
* @return The slot content
*/
__device__ constexpr typename StorageRef::value_type operator()(
  typename StorageRef::size_type idx) const noexcept
{
auto const window_idx = idx / StorageRef::window_size;
auto const intra_idx = idx % StorageRef::window_size;
return storage_[window_idx][intra_idx];
}
};

} // namespace detail
} // namespace experimental
} // namespace cuco
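The division/modulo pair above is the core of the functor: a flat slot index is split into a window index and an offset within that window. A standalone, host-only sketch of the same arithmetic (illustrative only, not cuco code):

#include <array>
#include <cassert>
#include <vector>

int main()
{
  constexpr int window_size = 4;                          // cf. StorageRef::window_size
  std::vector<std::array<int, window_size>> storage(3);   // 3 windows, 12 slots
  for (int i = 0; i < 12; ++i) { storage[i / window_size][i % window_size] = i; }

  // get_slot(9) reads storage[9 / 4][9 % 4], i.e. storage[2][1]
  int const idx = 9;
  assert(storage[idx / window_size][idx % window_size] == idx);
  return 0;
}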
265 changes: 265 additions & 0 deletions include/cuco/detail/common_kernels.cuh
@@ -0,0 +1,265 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cuco/detail/utils.hpp>

#include <cub/block/block_reduce.cuh>

#include <cuda/atomic>

#include <cooperative_groups.h>

namespace cuco {
namespace experimental {
namespace detail {

/**
* @brief Inserts elements in the range `[first, first + n)` whose corresponding stencil value
* satisfies `pred`, accumulating the number of successful insertions into `num_successes`.
*
* @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element
* is inserted.
* @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
*
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize Number of threads in each block
* @tparam InputIterator Device accessible input iterator whose `value_type` is
* convertible to the `value_type` of the data structure
* @tparam StencilIt Device accessible random access iterator whose value_type is
* convertible to Predicate's argument type
* @tparam Predicate Unary predicate callable whose return type must be convertible to `bool`
* and argument type is convertible from `std::iterator_traits<StencilIt>::value_type`
* @tparam AtomicT Atomic counter type
* @tparam Ref Type of non-owning device container ref allowing access to storage
*
* @param first Beginning of the sequence of input elements
* @param n Number of input elements
* @param stencil Beginning of the stencil sequence
* @param pred Predicate to test on every element in the range `[stencil, stencil + n)`
* @param num_successes Number of successfully inserted elements
* @param ref Non-owning container device ref used to access the slot storage
*/
template <int32_t CGSize,
int32_t BlockSize,
typename InputIterator,
typename StencilIt,
typename Predicate,
typename AtomicT,
typename Ref>
__global__ void insert_if_n(InputIterator first,
cuco::detail::index_type n,
StencilIt stencil,
Predicate pred,
AtomicT* num_successes,
Ref ref)
{
using BlockReduce = cub::BlockReduce<typename Ref::size_type, BlockSize>;
__shared__ typename BlockReduce::TempStorage temp_storage;
typename Ref::size_type thread_num_successes = 0;

cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize;
cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize;
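// With CGSize threads cooperating on each element, thread t of block b is
// assigned element (BlockSize * b + t) / CGSize, so the grid advances by
// gridDim.x * BlockSize / CGSize elements per iteration.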

while (idx < n) {
if (pred(*(stencil + idx))) {
typename Ref::value_type const insert_pair{*(first + idx)};
if constexpr (CGSize == 1) {
if (ref.insert(insert_pair)) { thread_num_successes++; };
} else {
auto const tile =
cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
if (ref.insert(tile, insert_pair) && tile.thread_rank() == 0) { thread_num_successes++; };
}
}
idx += loop_stride;
}

// compute number of successfully inserted elements for each block
// and atomically add to the grand total
auto const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
if (threadIdx.x == 0) {
num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
}
}

/**
* @brief Inserts each element in the range `[first, first + n)` whose corresponding stencil
* value satisfies `pred`.
*
* @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element
* is inserted.
* @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
*
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize Number of threads in each block
* @tparam InputIterator Device accessible input iterator whose `value_type` is
* convertible to the `value_type` of the data structure
* @tparam StencilIt Device accessible random access iterator whose value_type is
* convertible to Predicate's argument type
* @tparam Predicate Unary predicate callable whose return type must be convertible to `bool`
* and argument type is convertible from `std::iterator_traits<StencilIt>::value_type`
* @tparam Ref Type of non-owning device ref allowing access to storage
*
* @param first Beginning of the sequence of input elements
* @param n Number of input elements
* @param stencil Beginning of the stencil sequence
* @param pred Predicate to test on every element in the range `[stencil, stencil + n)`
* @param ref Non-owning container device ref used to access the slot storage
*/
template <int32_t CGSize,
int32_t BlockSize,
typename InputIterator,
typename StencilIt,
typename Predicate,
typename Ref>
__global__ void insert_if_n(
InputIterator first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref)
{
cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize;
cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize;

while (idx < n) {
if (pred(*(stencil + idx))) {
typename Ref::value_type const insert_pair{*(first + idx)};
if constexpr (CGSize == 1) {
ref.insert(insert_pair);
} else {
auto const tile =
cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
ref.insert(tile, insert_pair);
}
}
idx += loop_stride;
}
}

/**
* @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data
* structure if `pred` of the corresponding stencil returns true.
*
* @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)`
* indicating if the key `*(first + i)` is present in the container. If `pred( *(stencil + i) )` is
* false, stores false to `(output_begin + i)`.
*
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize The size of the thread block
* @tparam InputIt Device accessible input iterator
* @tparam StencilIt Device accessible random access iterator whose value_type is
* convertible to Predicate's argument type
* @tparam Predicate Unary predicate callable whose return type must be convertible to `bool`
* and argument type is convertible from `std::iterator_traits<StencilIt>::value_type`
* @tparam OutputIt Device accessible output iterator assignable from `bool`
* @tparam Ref Type of non-owning device ref allowing access to storage
*
* @param first Beginning of the sequence of keys
* @param n Number of keys
* @param stencil Beginning of the stencil sequence
* @param pred Predicate to test on every element in the range `[stencil, stencil + n)`
* @param output_begin Beginning of the sequence of booleans for the presence of each key
* @param ref Non-owning container device ref used to access the slot storage
*/
template <int32_t CGSize,
int32_t BlockSize,
typename InputIt,
typename StencilIt,
typename Predicate,
typename OutputIt,
typename Ref>
__global__ void contains_if_n(InputIt first,
cuco::detail::index_type n,
StencilIt stencil,
Predicate pred,
OutputIt output_begin,
Ref ref)
{
namespace cg = cooperative_groups;

auto const block = cg::this_thread_block();
auto const thread_idx = block.thread_rank();

cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize;
cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize;

__shared__ bool output_buffer[BlockSize / CGSize];

while (idx - thread_idx < n) {  // keep the whole block in the loop together so block.sync() is reached by all threads
if constexpr (CGSize == 1) {
if (idx < n) {
auto const key = *(first + idx);
/*
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
* sector stores from L2 to global memory. By writing results to shared memory and then
* synchronizing before writing back to global, we no longer rely on L1, preventing the
* increase in sector stores from L2 to global and improving performance.
*/
output_buffer[thread_idx] = pred(*(stencil + idx)) ? ref.contains(key) : false;
}
block.sync();
if (idx < n) { *(output_begin + idx) = output_buffer[thread_idx]; }
} else {
auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
if (idx < n) {
auto const key = *(first + idx);
auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
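// all threads of the tile receive the same result; only the first writes it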
if (tile.thread_rank() == 0) { *(output_begin + idx) = found; }
}
}
idx += loop_stride;
}
}

/**
* @brief Calculates the number of filled slots for the given window storage.
*
* @tparam BlockSize Number of threads in each block
* @tparam StorageRef Type of non-owning ref allowing access to storage
* @tparam Predicate Type of predicate indicating if the given slot is filled
* @tparam AtomicT Atomic counter type
*
* @param storage Non-owning device ref used to access the slot storage
* @param is_filled Predicate indicating if the given slot is filled
* @param count Number of filled slots
*/
template <int32_t BlockSize, typename StorageRef, typename Predicate, typename AtomicT>
__global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count)
{
using size_type = typename StorageRef::size_type;

cuco::detail::index_type const loop_stride = gridDim.x * BlockSize;
cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x;

size_type thread_count = 0;
auto const n = storage.num_windows();

while (idx < n) {
auto const window = storage[idx];
#pragma unroll
for (auto const& it : window) {
thread_count += static_cast<size_type>(is_filled(it));
}
idx += loop_stride;
}

using BlockReduce = cub::BlockReduce<size_type, BlockSize>;
__shared__ typename BlockReduce::TempStorage temp_storage;
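// sum the per-thread counts within the block, then atomically add each
// block's total to the global counter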
auto const block_count = BlockReduce(temp_storage).Sum(thread_count);
if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); }
}

} // namespace detail
} // namespace experimental
} // namespace cuco
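To illustrate how a counting kernel like `size` is driven from the host, here is a hedged sketch; `storage_ref`, `is_filled`, and the raw CUDA runtime calls are assumptions for illustration, not code from this PR:

constexpr int block_size = 256;
using counter_type = cuda::atomic<std::size_t, cuda::thread_scope_device>;

counter_type* d_count;
cudaMalloc(&d_count, sizeof(counter_type));
cudaMemset(d_count, 0, sizeof(counter_type));  // zero-initialize the counter

// one thread per window; the grid-stride loop absorbs any remainder
auto const grid_size = (storage_ref.num_windows() + block_size - 1) / block_size;
cuco::experimental::detail::size<block_size>
  <<<grid_size, block_size>>>(storage_ref, is_filled, d_count);

std::size_t h_count;  // read back the global tally
cudaMemcpy(&h_count, d_count, sizeof(std::size_t), cudaMemcpyDeviceToHost);

The same pattern (device-resident counter plus cub::BlockReduce) drives the counting insert_if_n overload above.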