Skip to content

Commit

Permalink
Merge branch 'dev' into for_each
Browse files Browse the repository at this point in the history
  • Loading branch information
srinivasyadav18 authored Jul 31, 2024
2 parents eeecedb + d92aefb commit c500e5a
Show file tree
Hide file tree
Showing 6 changed files with 409 additions and 2 deletions.
180 changes: 180 additions & 0 deletions include/cuco/detail/dynamic_map/dynamic_map.inl
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cuco/detail/bitwise_compare.cuh>
#include <cuco/detail/static_map/kernels.cuh>
#include <cuco/detail/utility/cuda.hpp>
#include <cuco/detail/utils.hpp>
#include <cuco/operator.hpp>
#include <cuco/static_map_ref.cuh>

#include <cuda/stream_ref>

#include <algorithm>
#include <cstddef>

namespace cuco {
namespace experimental {

template <typename Key,
typename T,
typename Extent,
cuda::thread_scope Scope,
typename KeyEqual,
typename ProbingScheme,
typename Allocator,
typename Storage>
constexpr dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
dynamic_map(Extent initial_capacity,
empty_key<Key> empty_key_sentinel,
empty_value<T> empty_value_sentinel,
KeyEqual const& pred,
ProbingScheme const& probing_scheme,
cuda_thread_scope<Scope> scope,
Storage storage,
Allocator const& alloc,
cuda::stream_ref stream)
: size_{0},
capacity_{initial_capacity},
min_insert_size_{static_cast<size_type>(1E4)},
max_load_factor_{0.60},
alloc_{alloc}
{
submaps_.push_back(
std::make_unique<
cuco::static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>>(
initial_capacity,
empty_key_sentinel,
empty_value_sentinel,
pred,
probing_scheme,
scope,
storage,
alloc,
stream));
}

template <typename Key,
typename T,
typename Extent,
cuda::thread_scope Scope,
typename KeyEqual,
typename ProbingScheme,
typename Allocator,
typename Storage>
template <typename InputIt>
void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert(
InputIt first, InputIt last, cuda::stream_ref stream)
{
auto num_to_insert = cuco::detail::distance(first, last);
this->reserve(size_ + num_to_insert, stream);

uint32_t submap_idx = 0;
while (num_to_insert > 0) {
auto& cur = submaps_[submap_idx];

auto capacity_remaining = max_load_factor_ * cur->capacity() - cur->size();
// If we are tying to insert some of the remaining keys into this submap, we can insert
// only if we meet the minimum insert size.
if (capacity_remaining >= min_insert_size_) {
auto const n = std::min(static_cast<detail::index_type>(capacity_remaining), num_to_insert);

std::size_t h_num_successes = cur->insert(first, first + n, stream);

size_ += h_num_successes;
first += n;
num_to_insert -= n;
}
submap_idx++;
}
}

template <typename Key,
typename T,
typename Extent,
cuda::thread_scope Scope,
typename KeyEqual,
typename ProbingScheme,
typename Allocator,
typename Storage>
void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::reserve(
size_type n, cuda::stream_ref stream)
{
size_type num_elements_remaining = n;
uint32_t submap_idx = 0;
while (num_elements_remaining > 0) {
std::size_t submap_capacity;

// if the submap already exists
if (submap_idx < submaps_.size()) {
submap_capacity = submaps_[submap_idx]->capacity();
}
// if the submap does not exist yet, create it
else {
empty_key<Key> empty_key_sentinel{submaps_.front()->empty_key_sentinel()};
empty_value<T> empty_value_sentinel{submaps_.front()->empty_value_sentinel()};

submap_capacity = capacity_;
submaps_.push_back(std::make_unique<map_type>(submap_capacity,
empty_key_sentinel,
empty_value_sentinel,
KeyEqual{},
ProbingScheme{},
cuda_thread_scope<Scope>{},
Storage{},
alloc_,
stream));
capacity_ *= 2;
}

num_elements_remaining -= max_load_factor_ * submap_capacity - min_insert_size_;
submap_idx++;
}
}

template <typename Key,
typename T,
typename Extent,
cuda::thread_scope Scope,
typename KeyEqual,
typename ProbingScheme,
typename Allocator,
typename Storage>
template <typename InputIt, typename OutputIt>
void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::contains(
InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const
{
auto num_keys = cuco::detail::distance(first, last);
std::size_t traversed = 0;
uint32_t submap_idx = 0;
while (num_keys > 0 && submap_idx < submaps_.size()) {
const auto& cur = submaps_[submap_idx];
const size_t cur_size = cur->size();
const size_t num_keys_to_process =
std::min(static_cast<detail::index_type>(cur_size), num_keys);
CUCO_CUDA_TRY(cudaStreamSynchronize(stream.get()));

cur->contains(first, first + num_keys_to_process, output_begin + traversed, stream);

traversed += num_keys_to_process;
num_keys -= num_keys_to_process;
submap_idx++;
first += num_keys_to_process;
}
}

} // namespace experimental
} // namespace cuco
4 changes: 2 additions & 2 deletions include/cuco/detail/static_map/static_map.inl
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ template <typename InputIt>
void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
insert_or_assign(InputIt first, InputIt last, cuda::stream_ref stream)
{
return this->insert_or_assign_async(first, last, stream);
this->insert_or_assign_async(first, last, stream);
stream.wait();
}

Expand Down Expand Up @@ -363,7 +363,7 @@ template <typename InputIt, typename Op>
void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
insert_or_apply(InputIt first, InputIt last, Op op, cuda::stream_ref stream)
{
return this->insert_or_apply_async(first, last, op, stream);
this->insert_or_apply_async(first, last, op, stream);
stream.wait();
}

Expand Down
138 changes: 138 additions & 0 deletions include/cuco/dynamic_map.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,143 @@

namespace cuco {

namespace experimental {
/**
* @brief A GPU-accelerated, unordered, associative container of key-value
* pairs with unique keys.
*
* This container automatically grows its capacity as necessary until device memory runs out.
*
* @tparam Key The type of the keys.
* @tparam T The type of the mapped values.
* @tparam Extent The type representing the extent of the container.
* @tparam Scope The thread scope for the container's operations.
* @tparam KeyEqual The equality comparison function for keys.
* @tparam ProbingScheme The probing scheme for resolving hash collisions.
* @tparam Allocator The allocator used for memory management.
* @tparam Storage The storage policy for the container.
*/
template <class Key,
class T,
class Extent = cuco::extent<std::size_t>,
cuda::thread_scope Scope = cuda::thread_scope_device,
class KeyEqual = thrust::equal_to<Key>,
class ProbingScheme = cuco::linear_probing<4, // CG size
cuco::default_hash_function<Key>>,
class Allocator = cuco::cuda_allocator<cuco::pair<Key, T>>,
class Storage = cuco::storage<1>>
class dynamic_map {
using map_type = static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>;

public:
static constexpr auto thread_scope = map_type::thread_scope; ///< CUDA thread scope

using key_type = typename map_type::key_type; ///< Key type
using value_type = typename map_type::value_type; ///< Key-value pair type
using size_type = typename map_type::size_type; ///< Size type
using key_equal = typename map_type::key_equal; ///< Key equality comparator type
using mapped_type = T; ///< Payload type

dynamic_map(dynamic_map const&) = delete;
dynamic_map& operator=(dynamic_map const&) = delete;

dynamic_map(dynamic_map&&) = default; ///< Move constructor

/**
* @brief Replaces the contents of the container with another container.
*
* @return Reference of the current map object
*/
dynamic_map& operator=(dynamic_map&&) = default;
~dynamic_map() = default;

/**
* @brief Constructs a dynamically-sized map with erase capability.
*
* The capacity of the map will automatically increase as the user adds key/value pairs using
* `insert`.
*
* Capacity increases by a factor of growth_factor each time the size of the map exceeds a
* threshold occupancy. The performance of `find` and `contains` gradually decreases each time the
* map's capacity grows.
*
* @param initial_capacity The initial number of slots in the map
* @param empty_key_sentinel The reserved key value for empty slots
* @param empty_value_sentinel The reserved mapped value for empty slots
* @param pred Key equality binary predicate
* @param probing_scheme Probing scheme
* @param scope The scope in which operations will be performed
* @param storage Kind of storage to use
* @param alloc Allocator used for allocating device storage
* @param stream CUDA stream used to initialize the map
*/
constexpr dynamic_map(Extent initial_capacity,
empty_key<Key> empty_key_sentinel,
empty_value<T> empty_value_sentinel,
KeyEqual const& pred = {},
ProbingScheme const& probing_scheme = {},
cuda_thread_scope<Scope> scope = {},
Storage storage = {},
Allocator const& alloc = {},
cuda::stream_ref stream = {});

/**
* @brief Grows the capacity of the map so there is enough space for `n` key/value pairs.
*
* If there is already enough space for `n` key/value pairs, the capacity remains the same.
*
* @param n The number of key value pairs for which there must be space
* @param stream Stream used for executing the kernels
*/
void reserve(size_type n, cuda::stream_ref stream);

/**
* @brief Inserts all key/value pairs in the range `[first, last)`.
*
* If multiple keys in `[first, last)` compare equal, it is unspecified which
* element is inserted.
*
* @tparam InputIt Device accessible input iterator whose `value_type` is
* convertible to the map's `value_type`
* @param first Beginning of the sequence of key/value pairs
* @param last End of the sequence of key/value pairs
* @param stream Stream used for executing the kernels
*/
template <typename InputIt>
void insert(InputIt first, InputIt last, cuda::stream_ref stream = {});

/**
* @brief Indicates whether the keys in the range `[first, last)` are contained in the map.
*
* Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map.
*
* @tparam InputIt Device accessible input iterator
* @tparam OutputIt Device accessible output iterator whose `value_type` is
* convertible to the map's `mapped_type`
*
* @param first Beginning of the sequence of keys
* @param last End of the sequence of keys
* @param output_begin Beginning of the sequence of booleans for the presence of each key
* @param stream Stream used for executing the kernels
*/
template <typename InputIt, typename OutputIt>
void contains(InputIt first,
InputIt last,
OutputIt output_begin,
cuda::stream_ref stream = {}) const;

private:
size_type size_{}; ///< Number of keys in the map
size_type capacity_{}; ///< Maximum number of keys that can be inserted

std::vector<std::unique_ptr<map_type>> submaps_; ///< vector of pointers to each submap
size_type min_insert_size_{}; ///< min remaining capacity of submap for insert
float max_load_factor_{}; ///< Maximum load factor
Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage
};

} // namespace experimental

/**
* @brief A GPU-accelerated, unordered, associative container of key-value
* pairs with unique keys
Expand Down Expand Up @@ -361,3 +498,4 @@ class dynamic_map {
} // namespace cuco

#include <cuco/detail/dynamic_map.inl>
#include <cuco/detail/dynamic_map/dynamic_map.inl>
12 changes: 12 additions & 0 deletions include/cuco/static_map.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -912,6 +912,18 @@ class static_map {
mapped_type empty_value_sentinel_; ///< Sentinel value that indicates an empty payload
};

namespace experimental {
template <class Key,
class T,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
class dynamic_map;
}

template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
class dynamic_map;

Expand Down
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ ConfigureTest(STATIC_MAP_TEST
# - dynamic_map tests -----------------------------------------------------------------------------
ConfigureTest(DYNAMIC_MAP_TEST
dynamic_map/unique_sequence_test.cu
dynamic_map/unique_sequence_test_experimental.cu
dynamic_map/erase_test.cu)

###################################################################################################
Expand Down
Loading

0 comments on commit c500e5a

Please sign in to comment.