-
Notifications
You must be signed in to change notification settings - Fork 89
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add data structure base classes and
cuco::static_set
(#278)
This is the first PR related to #110. It introduces the concept of: - New probing scheme via probing iterator - Array of Windows storage instead of flat storage to better deal with memory bandwidth-bound workload when hash collisions are present - Dynamic and static extent type for efficient probing - Mixin to encode concurrent device operators - Synchronous and asynchronous host bulk APIs This PR also adds `cuco::static_set` to evaluate the new design. For now, only 2 basic operations, `insert` and `contains`, are supported. --------- Co-authored-by: Daniel Juenger <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
- Loading branch information
1 parent
9287726
commit ed620ab
Showing
39 changed files
with
3,768 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
/* | ||
* Copyright (c) 2023, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <defaults.hpp> | ||
#include <utils.hpp> | ||
|
||
#include <cuco/static_set.cuh> | ||
#include <cuco/utility/key_generator.hpp> | ||
|
||
#include <nvbench/nvbench.cuh> | ||
|
||
#include <thrust/device_vector.h> | ||
|
||
using namespace cuco::benchmark; | ||
using namespace cuco::utility; | ||
|
||
/** | ||
* @brief A benchmark evaluating `cuco::static_set::contains` performance | ||
*/ | ||
template <typename Key, typename Dist> | ||
void static_set_contains(nvbench::state& state, nvbench::type_list<Key, Dist>) | ||
{ | ||
auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); | ||
auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); | ||
auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); | ||
|
||
std::size_t const size = num_keys / occupancy; | ||
|
||
thrust::device_vector<Key> keys(num_keys); | ||
|
||
key_generator gen; | ||
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end()); | ||
|
||
cuco::experimental::static_set<Key> set{size, cuco::empty_key<Key>{-1}}; | ||
set.insert(keys.begin(), keys.end()); | ||
|
||
gen.dropout(keys.begin(), keys.end(), matching_rate); | ||
|
||
thrust::device_vector<bool> result(num_keys); | ||
|
||
state.add_element_count(num_keys); | ||
|
||
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { | ||
set.contains(keys.begin(), keys.end(), result.begin(), launch.get_stream()); | ||
}); | ||
} | ||
|
||
NVBENCH_BENCH_TYPES(static_set_contains, | ||
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, | ||
nvbench::type_list<distribution::unique>)) | ||
.set_name("static_set_contains_unique_occupancy") | ||
.set_type_axes_names({"Key", "Distribution"}) | ||
.set_max_noise(defaults::MAX_NOISE) | ||
.add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); | ||
|
||
NVBENCH_BENCH_TYPES(static_set_contains, | ||
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, | ||
nvbench::type_list<distribution::unique>)) | ||
.set_name("static_set_contains_unique_matching_rate") | ||
.set_type_axes_names({"Key", "Distribution"}) | ||
.set_max_noise(defaults::MAX_NOISE) | ||
.add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
/* | ||
* Copyright (c) 2023, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <defaults.hpp> | ||
#include <utils.hpp> | ||
|
||
#include <cuco/static_set.cuh> | ||
#include <cuco/utility/key_generator.hpp> | ||
|
||
#include <nvbench/nvbench.cuh> | ||
|
||
#include <thrust/device_vector.h> | ||
|
||
using namespace cuco::benchmark; | ||
using namespace cuco::utility; | ||
|
||
/** | ||
* @brief A benchmark evaluating `cuco::static_set::insert` performance | ||
*/ | ||
template <typename Key, typename Dist> | ||
void static_set_insert(nvbench::state& state, nvbench::type_list<Key, Dist>) | ||
{ | ||
auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); | ||
auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); | ||
|
||
std::size_t const size = num_keys / occupancy; | ||
|
||
thrust::device_vector<Key> keys(num_keys); | ||
|
||
key_generator gen; | ||
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end()); | ||
|
||
state.add_element_count(num_keys); | ||
|
||
state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, | ||
[&](nvbench::launch& launch, auto& timer) { | ||
cuco::experimental::static_set<Key> set{ | ||
size, cuco::empty_key<Key>{-1}, {}, {}, {}, launch.get_stream()}; | ||
|
||
timer.start(); | ||
set.insert(keys.begin(), keys.end(), launch.get_stream()); | ||
timer.stop(); | ||
}); | ||
} | ||
|
||
NVBENCH_BENCH_TYPES(static_set_insert, | ||
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, | ||
nvbench::type_list<distribution::uniform>)) | ||
.set_name("static_set_insert_uniform_multiplicity") | ||
.set_type_axes_names({"Key", "Distribution"}) | ||
.set_max_noise(defaults::MAX_NOISE) | ||
.add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); | ||
|
||
NVBENCH_BENCH_TYPES(static_set_insert, | ||
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, | ||
nvbench::type_list<distribution::unique>)) | ||
.set_name("static_set_insert_unique_occupancy") | ||
.set_type_axes_names({"Key", "Distribution"}) | ||
.set_max_noise(defaults::MAX_NOISE) | ||
.add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); | ||
|
||
NVBENCH_BENCH_TYPES(static_set_insert, | ||
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, | ||
nvbench::type_list<distribution::gaussian>)) | ||
.set_name("static_set_insert_gaussian_skew") | ||
.set_type_axes_names({"Key", "Distribution"}) | ||
.set_max_noise(defaults::MAX_NOISE) | ||
.add_float64_axis("Skew", defaults::SKEW_RANGE); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
/* | ||
* Copyright (c) 2023, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <defaults.hpp> | ||
#include <utils.hpp> | ||
|
||
#include <cuco/static_set.cuh> | ||
#include <cuco/utility/key_generator.hpp> | ||
|
||
#include <nvbench/nvbench.cuh> | ||
|
||
#include <thrust/device_vector.h> | ||
|
||
using namespace cuco::benchmark; | ||
using namespace cuco::utility; | ||
|
||
/** | ||
* @brief A benchmark evaluating `cuco::static_set::size` performance | ||
*/ | ||
template <typename Key, typename Dist> | ||
void static_set_size(nvbench::state& state, nvbench::type_list<Key, Dist>) | ||
{ | ||
auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); | ||
auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); | ||
|
||
std::size_t const size = num_keys / occupancy; | ||
|
||
thrust::device_vector<Key> keys(num_keys); | ||
|
||
key_generator gen; | ||
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end()); | ||
|
||
state.add_element_count(num_keys); | ||
|
||
cuco::experimental::static_set<Key> set{size, cuco::empty_key<Key>{-1}}; | ||
|
||
set.insert(keys.begin(), keys.end()); | ||
|
||
state.exec(nvbench::exec_tag::sync, | ||
[&](nvbench::launch& launch) { auto const size = set.size(launch.get_stream()); }); | ||
} | ||
|
||
NVBENCH_BENCH_TYPES(static_set_size, | ||
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, | ||
nvbench::type_list<distribution::unique>)) | ||
.set_name("static_set_size_unique_occupancy") | ||
.set_type_axes_names({"Key", "Distribution"}) | ||
.set_max_noise(defaults::MAX_NOISE) | ||
.add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
/* | ||
* Copyright (c) 2022-2023, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <cuco/static_set.cuh> | ||
|
||
#include <thrust/device_vector.h> | ||
#include <thrust/functional.h> | ||
#include <thrust/logical.h> | ||
#include <thrust/sequence.h> | ||
|
||
#include <cooperative_groups.h> | ||
|
||
#include <cstddef> | ||
#include <iostream> | ||
|
||
// insert a set of keys into a hash set using one cooperative group for each task | ||
template <typename SetRef, typename InputIterator> | ||
__global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::size_t n) | ||
{ | ||
namespace cg = cooperative_groups; | ||
|
||
constexpr auto cg_size = SetRef::cg_size; | ||
|
||
auto tile = cg::tiled_partition<cg_size>(cg::this_thread_block()); | ||
|
||
int64_t const loop_stride = gridDim.x * blockDim.x / cg_size; | ||
int64_t idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; | ||
|
||
while (idx < n) { | ||
set.insert(tile, *(keys + idx)); | ||
idx += loop_stride; | ||
} | ||
} | ||
|
||
template <typename SetRef, typename InputIterator, typename OutputIterator> | ||
__global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, OutputIterator found) | ||
{ | ||
int64_t const loop_stride = gridDim.x * blockDim.x; | ||
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; | ||
|
||
while (idx < n) { | ||
found[idx] = set.contains(*(keys + idx)); | ||
idx += loop_stride; | ||
} | ||
} | ||
|
||
/** | ||
* @file device_reference_example.cu | ||
* @brief Demonstrates usage of the static_set device-side APIs. | ||
* | ||
* static_set provides a non-owning reference which can be used to interact with | ||
* the container from within device code. | ||
* | ||
*/ | ||
int main(void) | ||
{ | ||
using Key = int; | ||
|
||
// Empty slots are represented by reserved "sentinel" values. These values should be selected such | ||
// that they never occur in your input data. | ||
Key constexpr empty_key_sentinel = -1; | ||
|
||
// Number of keys to be inserted | ||
std::size_t constexpr num_keys = 50'000; | ||
|
||
// Compute capacity based on a 50% load factor | ||
auto constexpr load_factor = 0.5; | ||
std::size_t const capacity = std::ceil(num_keys / load_factor); | ||
|
||
using set_type = cuco::experimental::static_set<Key>; | ||
|
||
// Constructs a hash set with at least "capacity" slots using -1 as the empty key sentinel. | ||
set_type set{capacity, cuco::empty_key{empty_key_sentinel}}; | ||
|
||
// Create a sequence of keys {0, 1, 2, .., i} | ||
thrust::device_vector<Key> keys(num_keys); | ||
thrust::sequence(keys.begin(), keys.end(), 0); | ||
|
||
// Insert the first half of the keys into the set | ||
set.insert(keys.begin(), keys.begin() + num_keys / 2); | ||
|
||
// Insert the second half of keys using a custom CUDA kernel. | ||
custom_cooperative_insert<<<128, 128>>>( | ||
set.ref(cuco::experimental::insert), keys.begin() + num_keys / 2, num_keys / 2); | ||
|
||
// Storage for result | ||
thrust::device_vector<bool> found(num_keys); | ||
|
||
// Check if all keys are now contained in the set. Note that we pass a reference that already has | ||
// the `contains` operator. | ||
// In general, using two or more reference objects to the same container but with | ||
// a different set of operators concurrently is undefined behavior. | ||
// This does not apply here since the two kernels do not overlap. | ||
custom_contains<<<128, 128>>>( | ||
set.ref(cuco::experimental::contains), keys.begin(), num_keys, found.begin()); | ||
|
||
// Verify that all keys have been found | ||
bool const all_keys_found = thrust::all_of(found.begin(), found.end(), thrust::identity<bool>()); | ||
|
||
if (all_keys_found) { std::cout << "Success! Found all keys.\n"; } | ||
|
||
return 0; | ||
} |
Oops, something went wrong.