-
Notifications
You must be signed in to change notification settings - Fork 89
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add device subsets example #346
Changes from 17 commits
f6f37fa
d35db89
482a14e
8e559e0
40b9b15
fda8f88
2c92f84
233c668
4eb25c9
b68761d
56b5dc3
eff6faa
871424a
635988b
393ee3b
70c3df7
085d1bb
755db26
8c746d7
cce72b4
b8028f4
d913720
02eabf6
3d016f6
5d88ea7
2433c09
c46d6fe
adc6a54
770a2ad
c632a9f
2fa3810
9e29ea7
8c8c3c1
2e7658c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
/* | ||
* Copyright (c) 2023, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <cuco/static_set_ref.cuh> | ||
#include <cuco/storage.cuh> | ||
|
||
#include <thrust/device_vector.h> | ||
#include <thrust/reduce.h> | ||
#include <thrust/scan.h> | ||
|
||
#include <cooperative_groups.h> | ||
|
||
#include <cuda/std/array> | ||
|
||
#include <algorithm> | ||
#include <cstddef> | ||
#include <iostream> | ||
|
||
/// Number of threads in the CUDA Cooperative Group that handles each subset
auto constexpr cg_size = 8;
/// Window size (vector length): number of slots stored in one window of the storage
auto constexpr window_size = 1;
/// Number of elements to insert and query
auto constexpr N = 10;

/// Type of the keys stored in every subset
using key_type = int;
/// Linear probing performed cooperatively by `cg_size` threads
using probing_scheme_type =
  cuco::experimental::linear_probing<cg_size, cuco::default_hash_function<key_type>>;
/// Owning array-of-windows storage shared by all subsets
using storage_type = cuco::experimental::aow_storage<key_type, window_size>;
/// Non-owning view over (a slice of) the storage, usable in device code
using storage_ref_type = typename storage_type::ref_type;
/// Device-side set reference parameterized by the operation(s) it supports
template <typename Operator>
using ref_type = cuco::experimental::static_set_ref<key_type,
                                                    cuda::thread_scope_device,
                                                    thrust::equal_to<key_type>,
                                                    probing_scheme_type,
                                                    storage_ref_type,
                                                    Operator>;

/// Data to insert/query. `cuda::std::array` (from the already included <cuda/std/array>) is used
/// because its element access is callable from device code, where the kernels read this array.
__device__ constexpr cuda::std::array<key_type, N> data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
/// Empty slots are represented by reserved "sentinel" values. These values should be selected such
/// that they never occur in your input data.
key_type constexpr empty_key_sentinel = -1;
|
||
/**
 * @brief Fills every slot of the first `n` windows in `windows` with `value`.
 *
 * Each thread starts at its flat global index and advances by the total number of launched
 * threads until it passes `n`, so any 1D launch configuration covers all windows.
 *
 * @tparam WindowT Window type; must expose `value_type` and be iterable with a range-based `for`
 * @param windows Device-accessible pointer to the first window
 * @param n Number of windows to initialize
 * @param value Value that is copy-constructed into each slot
 */
template <typename WindowT>
__global__ void initialize(WindowT* windows, std::size_t n, typename WindowT::value_type value)
{
  using T = typename WindowT::value_type;

  // Widen to std::size_t *before* multiplying/adding: the built-in index variables are 32-bit
  // unsigned, so the products and the running index could otherwise wrap around while still
  // being compared against the 64-bit bound `n`.
  auto const loop_stride = static_cast<std::size_t>(gridDim.x) * blockDim.x;
  auto idx               = static_cast<std::size_t>(blockDim.x) * blockIdx.x + threadIdx.x;

  while (idx < n) {
    auto& window_slots = windows[idx];
#pragma unroll
    for (auto& slot : window_slots) {
      // Placement-new constructs the slot in place without assuming it holds a live object
      new (&slot) T{value};
    }
    idx += loop_stride;
  }
}
|
||
/**
 * @brief Inserts all keys of `data` into every subset, using one cooperative group per subset.
 *
 * The thread block is partitioned into tiles of `cg_size` threads. Tile number `idx` builds an
 * insert-capable set reference over the slice of `windows` that starts at `offsets[idx]` and
 * spans `sizes[idx]` windows, then inserts the `N` keys of `data` into it.
 *
 * @note There is no bounds check on the tile index: the launch must provide exactly one tile of
 * `cg_size` threads per entry of `sizes`/`offsets`.
 *
 * @param windows Pointer to the bulk window storage shared by all subsets
 * @param sizes Number of windows of each subset
 * @param offsets Index of the first window of each subset within `windows`
 */
template <typename Window, typename Size, typename Offset>
__global__ void insert(Window* windows, Size* sizes, Offset* offsets)
{
  namespace cg = cooperative_groups;

  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
  // All threads of a tile share the same subset index
  auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;

  // Use the named sentinel rather than a literal so the reference always agrees with the value
  // the storage was initialized with
  auto set_ref = ref_type<cuco::experimental::insert_tag>{
    cuco::empty_key<key_type>{empty_key_sentinel},
    {},
    {},
    storage_ref_type{sizes[idx], windows + offsets[idx]}};

  // Each cooperative group inserts all elements in `data` into its own subset
  for (int i = 0; i < N; i++) {
    set_ref.insert(tile, data[i]);
  }
}
|
||
/**
 * @brief Looks up all keys of `data` in every subset, using one cooperative group per subset.
 *
 * The thread block is partitioned into tiles of `cg_size` threads. Tile number `idx` builds a
 * find-capable set reference over the slice of `windows` that starts at `offsets[idx]` and spans
 * `sizes[idx]` windows, then queries the `N` keys of `data`. Failures are OR-ed into a
 * block-shared flag, and thread 0 of the block prints the outcome.
 *
 * @note There is no bounds check on the tile index: the launch must provide exactly one tile of
 * `cg_size` threads per entry of `sizes`/`offsets`.
 *
 * @param windows Pointer to the bulk window storage shared by all subsets
 * @param sizes Number of windows of each subset
 * @param offsets Index of the first window of each subset within `windows`
 */
template <typename Window, typename Size, typename Offset>
__global__ void find(Window* windows, Size* sizes, Offset* offsets)
{
  namespace cg = cooperative_groups;

  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
  // All threads of a tile share the same subset index
  auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;

  // Use the named sentinel rather than a literal so the reference always agrees with the value
  // the storage was initialized with
  auto set_ref = ref_type<cuco::experimental::find_tag>{
    cuco::empty_key<key_type>{empty_key_sentinel},
    {},
    {},
    storage_ref_type{sizes[idx], windows + offsets[idx]}};

  // Block-wide failure flag: stays 0 only if every lookup in the block succeeds
  __shared__ int result;
  if (threadIdx.x == 0) { result = 0; }
  __syncthreads();

  // Accumulate in a register first so each thread issues a single shared-memory atomic
  int mismatch = 0;
  for (int i = 0; i < N; i++) {
    auto const found = set_ref.find(tile, data[i]);
    // A failed lookup yields the end iterator, which must not be dereferenced
    // NOTE(review): relies on the find-capable ref exposing `end()` - confirm against cuco
    mismatch |= (found == set_ref.end()) || (*found != data[i]);
  }
  atomicOr(&result, mismatch);
  __syncthreads();

  if (threadIdx.x == 0) {
    if (result == 0) {
      printf("Success! Found all inserted elements.\n");
    } else {
      printf("Failure! Some inserted elements were not found.\n");
    }
  }
}
|
||
/**
 * @file device_subsets_example.cu
 * @brief Demonstrates usage of the static_set device-side APIs.
 *
 * static_set provides a non-owning reference which can be used to interact with
 * the container from within device code.
 *
 * The example allocates one bulk storage, splits it into 16 subsets described by per-subset
 * sizes and offsets, and lets each cooperative group insert into and query its own subset.
 *
 * @return 0 on success, 1 if a kernel launch or the final synchronization reports a CUDA error
 */
int main()
{
  // Number of subsets
  auto constexpr num = 16;
  // Sizes of the 16 subsets to be created on the device
  auto constexpr subset_sizes =
    std::array<std::size_t, num>{20, 20, 20, 20, 30, 30, 30, 30, 40, 40, 40, 40, 50, 50, 50, 50};

  // Convert every requested size into the window extent the probing scheme accepts for the
  // given cooperative group size and window size
  auto valid_sizes = std::vector<std::size_t>(num);
  std::generate(valid_sizes.begin(), valid_sizes.end(), [&, n = 0]() mutable {
    return cuco::experimental::make_window_extent<cg_size, window_size>(subset_sizes[n++]);
  });

  // The exclusive prefix sum of the sizes gives the first window of each subset
  auto const d_sizes = thrust::device_vector<std::size_t>{valid_sizes};
  auto d_offsets     = thrust::device_vector<std::size_t>(num);
  thrust::exclusive_scan(d_sizes.begin(), d_sizes.end(), d_offsets.begin());

  auto const num_windows = thrust::reduce(valid_sizes.begin(), valid_sizes.end());

  // One allocation for all subsets
  auto d_set_storage = storage_type{num_windows};
  // Initializes the storage with the given sentinel
  d_set_storage.initialize(empty_key_sentinel);

  // Reports a failed CUDA call on stderr; returns true when `status` signals success
  auto const cuda_ok = [](cudaError_t status, char const* what) {
    if (status == cudaSuccess) { return true; }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << '\n';
    return false;
  };

  // The kernels index subsets by tile id without a bounds check, so launch exactly one
  // cooperative group of `cg_size` threads per subset
  auto constexpr block_size = num * cg_size;

  insert<<<1, block_size>>>(d_set_storage.data(), d_sizes.data().get(), d_offsets.data().get());
  if (!cuda_ok(cudaGetLastError(), "insert kernel launch")) { return 1; }

  find<<<1, block_size>>>(d_set_storage.data(), d_sizes.data().get(), d_offsets.data().get());
  if (!cuda_ok(cudaGetLastError(), "find kernel launch")) { return 1; }

  // Wait for both kernels so that execution errors are reported and the device-side printf
  // output is flushed before the program exits
  if (!cuda_ok(cudaDeviceSynchronize(), "device synchronization")) { return 1; }

  return 0;
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ | |
#include <cuco/detail/equal_wrapper.cuh> | ||
#include <cuco/extent.cuh> | ||
#include <cuco/pair.cuh> | ||
#include <cuco/probing_scheme.cuh> | ||
|
||
#include <thrust/distance.h> | ||
#include <thrust/pair.h> | ||
|
@@ -63,12 +64,12 @@ class open_addressing_ref_impl { | |
ProbingScheme>, | ||
"ProbingScheme must inherit from cuco::detail::probing_scheme_base"); | ||
|
||
static_assert(is_window_extent_v<typename StorageRef::extent_type>, | ||
"Extent is not a valid cuco::window_extent"); | ||
static_assert(ProbingScheme::cg_size == StorageRef::extent_type::cg_size, | ||
"Extent has incompatible CG size"); | ||
static_assert(StorageRef::window_size == StorageRef::extent_type::window_size, | ||
"Extent has incompatible window size"); | ||
// static_assert(is_window_extent_v<typename StorageRef::extent_type>, | ||
// "Extent is not a valid cuco::window_extent"); | ||
// static_assert(ProbingScheme::cg_size == StorageRef::extent_type::cg_size, | ||
// "Extent has incompatible CG size"); | ||
// static_assert(StorageRef::window_size == StorageRef::extent_type::window_size, | ||
// "Extent has incompatible window size"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How could we solve the issue where the sum of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Losing these checks isn't ideal. We could create a new There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Agreed. That's the complicated part. In general, when users have a pointer and a size on hand. Creating a ref should be as simple as: auto ref = ref_type{&data, size}; That's what I'm trying to achieve for auto set_ref = ref_type<cuco::experimental::find_tag>{
cuco::empty_key<key_type>{-1}, {}, {}, storage_ref_type{sizes[idx], windows + offsets[idx]}}; Enabling those checks enforces users to invoke using extent_type =
decltype(make_window_extent<cg_size, window_size>(std::declval<cuco::experimental::extent<size_t>>()));
auto set_ref = ref_type<cuco::experimental::find_tag>{
cuco::empty_key<key_type>{-1},
{},
{},
aow_storage_ref<key_type,
window_size,
extent_type>{make_valid_extent<cg_size, window_size>(sizes[idx]), windows + offsets[idx]}}; This is way more complex than needed. One solution I can think of is to set the proper data type for sizes array, so instead of: auto valid_sizes = std::vector<std::size_t>(num); Users should get the return type of using extent_type =
decltype(make_window_extent<cg_size, window_size>(std::declval<cuco::experimental::extent<size_t>>()));
auto valid_sizes = std::vector<extent_type>(num); One thing I don't like here is the obscure way that users have to follow to set up the proper extent type. Isn't all those fiddlings around There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 100% agree on the unnecessary complexity. It should be as simple as passing a pointer and a size (or a Can we provide an additional ctor with signature We can discuss this in today's dev sync. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
We lose the motivation of having a |
||
|
||
public: | ||
using key_type = Key; ///< Key type | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ | |
|
||
namespace cuco { | ||
namespace experimental { | ||
|
||
/** | ||
* @brief Public storage class. | ||
* | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
memory access granularity (which may impact performance depending on the size of the slot type)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I still like referring to it as "items per thread" or "thread granularity" as it controls how many elements an individual thread processes
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed, `items_per_thread` itself is definitely a less abstract name than `window_size`.
v.s.
cuco::aow_storage<key_type, window_size> { ... }; // or cuco::window_storage<key_type, window_size> { ... };
Actually, the former one is not bad at all.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The only confusion that might occur is that `items_per_thread` in e.g. CUB refers to input items per thread, whilst our `items_per_thread` means slots per thread. Just a minor thing. I'm ok with it. `thread_granularity` would remove the `items` part, which might be less confusing but also less descriptive. Meh.