Skip to content

Commit

Permalink
Add data structure base classes and cuco::static_set (#278)
Browse files Browse the repository at this point in the history
This is the first PR related to #110.

It introduces the concept of:

- New probing scheme via probing iterator
- Array of Windows storage instead of flat storage to better deal with
memory bandwidth-bound workload when hash collisions are present
- Dynamic and static extent type for efficient probing
- Mixin to encode concurrent device operators
- Synchronous and asynchronous host bulk APIs

This PR also adds `cuco::static_set` to evaluate the new design. For
now, only 2 basic operations, `insert` and `contains`, are supported.

---------

Co-authored-by: Daniel Juenger <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Apr 6, 2023
1 parent 9287726 commit ed620ab
Show file tree
Hide file tree
Showing 39 changed files with 3,768 additions and 7 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,14 @@ class example_class {

We plan to add many GPU-accelerated, concurrent data structures to `cuCollections`. As of now, the two flagships are variants of hash tables.

### `static_set`

`cuco::static_set` is a fixed-size container that stores unique elements in no particular order. See the Doxygen documentation in `static_set.cuh` for more detailed information.

#### Examples:
- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/jnjcdG16c))
- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/EGMj6qx73))

### `static_map`

`cuco::static_map` is a fixed-size hash table using open addressing with linear probing. See the Doxygen documentation in `static_map.cuh` for more detailed information.
Expand Down
7 changes: 7 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ endfunction(ConfigureBench)
### benchmark sources #############################################################################
###################################################################################################

###################################################################################################
# - static_set benchmarks -------------------------------------------------------------------------
ConfigureBench(STATIC_SET_BENCH
hash_table/static_set/contains_bench.cu
hash_table/static_set/insert_bench.cu
hash_table/static_set/size_bench.cu)

###################################################################################################
# - static_map benchmarks -------------------------------------------------------------------------
ConfigureBench(STATIC_MAP_BENCH
Expand Down
75 changes: 75 additions & 0 deletions benchmarks/hash_table/static_set/contains_bench.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <defaults.hpp>
#include <utils.hpp>

#include <cuco/static_set.cuh>
#include <cuco/utility/key_generator.hpp>

#include <nvbench/nvbench.cuh>

#include <thrust/device_vector.h>

using namespace cuco::benchmark;
using namespace cuco::utility;

/**
* @brief A benchmark evaluating `cuco::static_set::contains` performance
*/
template <typename Key, typename Dist>
void static_set_contains(nvbench::state& state, nvbench::type_list<Key, Dist>)
{
auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N);
auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY);
auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE);

std::size_t const size = num_keys / occupancy;

thrust::device_vector<Key> keys(num_keys);

key_generator gen;
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());

cuco::experimental::static_set<Key> set{size, cuco::empty_key<Key>{-1}};
set.insert(keys.begin(), keys.end());

gen.dropout(keys.begin(), keys.end(), matching_rate);

thrust::device_vector<bool> result(num_keys);

state.add_element_count(num_keys);

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
set.contains(keys.begin(), keys.end(), result.begin(), launch.get_stream());
});
}

NVBENCH_BENCH_TYPES(static_set_contains,
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
nvbench::type_list<distribution::unique>))
.set_name("static_set_contains_unique_occupancy")
.set_type_axes_names({"Key", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);

NVBENCH_BENCH_TYPES(static_set_contains,
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
nvbench::type_list<distribution::unique>))
.set_name("static_set_contains_unique_matching_rate")
.set_type_axes_names({"Key", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE);
81 changes: 81 additions & 0 deletions benchmarks/hash_table/static_set/insert_bench.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <defaults.hpp>
#include <utils.hpp>

#include <cuco/static_set.cuh>
#include <cuco/utility/key_generator.hpp>

#include <nvbench/nvbench.cuh>

#include <thrust/device_vector.h>

using namespace cuco::benchmark;
using namespace cuco::utility;

/**
* @brief A benchmark evaluating `cuco::static_set::insert` performance
*/
template <typename Key, typename Dist>
void static_set_insert(nvbench::state& state, nvbench::type_list<Key, Dist>)
{
auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N);
auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY);

std::size_t const size = num_keys / occupancy;

thrust::device_vector<Key> keys(num_keys);

key_generator gen;
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());

state.add_element_count(num_keys);

state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
[&](nvbench::launch& launch, auto& timer) {
cuco::experimental::static_set<Key> set{
size, cuco::empty_key<Key>{-1}, {}, {}, {}, launch.get_stream()};

timer.start();
set.insert(keys.begin(), keys.end(), launch.get_stream());
timer.stop();
});
}

NVBENCH_BENCH_TYPES(static_set_insert,
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
nvbench::type_list<distribution::uniform>))
.set_name("static_set_insert_uniform_multiplicity")
.set_type_axes_names({"Key", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE);

NVBENCH_BENCH_TYPES(static_set_insert,
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
nvbench::type_list<distribution::unique>))
.set_name("static_set_insert_unique_occupancy")
.set_type_axes_names({"Key", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);

NVBENCH_BENCH_TYPES(static_set_insert,
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
nvbench::type_list<distribution::gaussian>))
.set_name("static_set_insert_gaussian_skew")
.set_type_axes_names({"Key", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_float64_axis("Skew", defaults::SKEW_RANGE);
62 changes: 62 additions & 0 deletions benchmarks/hash_table/static_set/size_bench.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <defaults.hpp>
#include <utils.hpp>

#include <cuco/static_set.cuh>
#include <cuco/utility/key_generator.hpp>

#include <nvbench/nvbench.cuh>

#include <thrust/device_vector.h>

using namespace cuco::benchmark;
using namespace cuco::utility;

/**
* @brief A benchmark evaluating `cuco::static_set::size` performance
*/
template <typename Key, typename Dist>
void static_set_size(nvbench::state& state, nvbench::type_list<Key, Dist>)
{
auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N);
auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY);

std::size_t const size = num_keys / occupancy;

thrust::device_vector<Key> keys(num_keys);

key_generator gen;
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());

state.add_element_count(num_keys);

cuco::experimental::static_set<Key> set{size, cuco::empty_key<Key>{-1}};

set.insert(keys.begin(), keys.end());

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto const size = set.size(launch.get_stream()); });
}

NVBENCH_BENCH_TYPES(static_set_size,
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
nvbench::type_list<distribution::unique>))
.set_name("static_set_size_unique_occupancy")
.set_type_axes_names({"Key", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);
2 changes: 2 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ endfunction(ConfigureExample)
### Example sources ###############################################################################
###################################################################################################

ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu")
ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu")
ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu")
ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu")
ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")
Expand Down
116 changes: 116 additions & 0 deletions examples/static_set/device_ref_example.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cuco/static_set.cuh>

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/logical.h>
#include <thrust/sequence.h>

#include <cooperative_groups.h>

#include <cstddef>
#include <iostream>

// insert a set of keys into a hash set using one cooperative group for each task
template <typename SetRef, typename InputIterator>
__global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::size_t n)
{
namespace cg = cooperative_groups;

constexpr auto cg_size = SetRef::cg_size;

auto tile = cg::tiled_partition<cg_size>(cg::this_thread_block());

int64_t const loop_stride = gridDim.x * blockDim.x / cg_size;
int64_t idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;

while (idx < n) {
set.insert(tile, *(keys + idx));
idx += loop_stride;
}
}

template <typename SetRef, typename InputIterator, typename OutputIterator>
__global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, OutputIterator found)
{
int64_t const loop_stride = gridDim.x * blockDim.x;
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;

while (idx < n) {
found[idx] = set.contains(*(keys + idx));
idx += loop_stride;
}
}

/**
* @file device_reference_example.cu
* @brief Demonstrates usage of the static_set device-side APIs.
*
* static_set provides a non-owning reference which can be used to interact with
* the container from within device code.
*
*/
int main(void)
{
using Key = int;

// Empty slots are represented by reserved "sentinel" values. These values should be selected such
// that they never occur in your input data.
Key constexpr empty_key_sentinel = -1;

// Number of keys to be inserted
std::size_t constexpr num_keys = 50'000;

// Compute capacity based on a 50% load factor
auto constexpr load_factor = 0.5;
std::size_t const capacity = std::ceil(num_keys / load_factor);

using set_type = cuco::experimental::static_set<Key>;

// Constructs a hash set with at least "capacity" slots using -1 as the empty key sentinel.
set_type set{capacity, cuco::empty_key{empty_key_sentinel}};

// Create a sequence of keys {0, 1, 2, .., i}
thrust::device_vector<Key> keys(num_keys);
thrust::sequence(keys.begin(), keys.end(), 0);

// Insert the first half of the keys into the set
set.insert(keys.begin(), keys.begin() + num_keys / 2);

// Insert the second half of keys using a custom CUDA kernel.
custom_cooperative_insert<<<128, 128>>>(
set.ref(cuco::experimental::insert), keys.begin() + num_keys / 2, num_keys / 2);

// Storage for result
thrust::device_vector<bool> found(num_keys);

// Check if all keys are now contained in the set. Note that we pass a reference that already has
// the `contains` operator.
// In general, using two or more reference objects to the same container but with
// a different set of operators concurrently is undefined behavior.
// This does not apply here since the two kernels do not overlap.
custom_contains<<<128, 128>>>(
set.ref(cuco::experimental::contains), keys.begin(), num_keys, found.begin());

// Verify that all keys have been found
bool const all_keys_found = thrust::all_of(found.begin(), found.end(), thrust::identity<bool>());

if (all_keys_found) { std::cout << "Success! Found all keys.\n"; }

return 0;
}
Loading

0 comments on commit ed620ab

Please sign in to comment.