Skip to content

Commit

Permalink
Enable fast_int functionality for dynamic extent (#315)
Browse files Browse the repository at this point in the history
Adds `fast_int` functionality to `cuco::extent`.

Related to #284
  • Loading branch information
sleeepyjack authored Aug 1, 2023
1 parent fd7263c commit 3fe5704
Show file tree
Hide file tree
Showing 15 changed files with 309 additions and 195 deletions.
126 changes: 126 additions & 0 deletions include/cuco/detail/extent/extent.inl
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuco/detail/error.hpp>
#include <cuco/detail/prime.hpp> // TODO move to detail/extent/
#include <cuco/detail/utils.hpp>
#include <cuco/utility/fast_int.cuh>

#include <type_traits>

namespace cuco {
namespace experimental {

template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N = dynamic_extent>
struct window_extent {
using value_type = SizeType; ///< Extent value type

static auto constexpr cg_size = CGSize;
static auto constexpr window_size = WindowSize;

__host__ __device__ constexpr value_type value() const noexcept { return N; }
__host__ __device__ explicit constexpr operator value_type() const noexcept { return value(); }

private:
__host__ __device__ explicit constexpr window_extent() noexcept {}
__host__ __device__ explicit constexpr window_extent(SizeType) noexcept {}

template <int32_t CGSize_, int32_t WindowSize_, typename SizeType_, std::size_t N_>
friend auto constexpr make_window_extent(extent<SizeType_, N_> ext);
};

template <int32_t CGSize, int32_t WindowSize, typename SizeType>
struct window_extent<CGSize, WindowSize, SizeType, dynamic_extent>
: cuco::utility::fast_int<SizeType> {
using value_type =
typename cuco::utility::fast_int<SizeType>::fast_int::value_type; ///< Extent value type

static auto constexpr cg_size = CGSize;
static auto constexpr window_size = WindowSize;

private:
using cuco::utility::fast_int<SizeType>::fast_int;

template <int32_t CGSize_, int32_t WindowSize_, typename SizeType_, std::size_t N_>
friend auto constexpr make_window_extent(extent<SizeType_, N_> ext);
};

template <typename Container, typename SizeType, std::size_t N>
[[nodiscard]] auto constexpr make_window_extent(extent<SizeType, N> ext)
{
return make_window_extent<Container::cg_size, Container::window_size>(ext);
}

template <typename Container>
[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size)
{
return make_window_extent<Container::cg_size, Container::window_size>(size);
}

template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
[[nodiscard]] auto constexpr make_window_extent(extent<SizeType, N> ext)
{
auto constexpr max_prime = cuco::detail::primes.back();
auto constexpr max_value =
(static_cast<uint64_t>(std::numeric_limits<SizeType>::max()) < max_prime)
? std::numeric_limits<SizeType>::max()
: static_cast<SizeType>(max_prime);
auto const size = SDIV(ext, CGSize * WindowSize);
if (size <= 0 or size > max_value) { CUCO_FAIL("Invalid input extent"); }

if constexpr (N == dynamic_extent) {
return window_extent<CGSize, WindowSize, SizeType>{static_cast<SizeType>(
*cuco::detail::lower_bound(
cuco::detail::primes.begin(), cuco::detail::primes.end(), static_cast<uint64_t>(size)) *
CGSize)};
}
if constexpr (N != dynamic_extent) {
return window_extent<CGSize,
WindowSize,
SizeType,
static_cast<std::size_t>(
*cuco::detail::lower_bound(cuco::detail::primes.begin(),
cuco::detail::primes.end(),
static_cast<uint64_t>(size)) *
CGSize)>{};
}
}

template <int32_t CGSize, int32_t WindowSize>
[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size)
{
return static_cast<std::size_t>(make_window_extent<CGSize, WindowSize>(extent{size}));
}

namespace detail {

template <typename...>
struct is_window_extent : std::false_type {
};

template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
struct is_window_extent<window_extent<CGSize, WindowSize, SizeType, N>> : std::true_type {
};

template <typename T>
inline constexpr bool is_window_extent_v = is_window_extent<T>::value;

} // namespace detail

} // namespace experimental
} // namespace cuco
6 changes: 3 additions & 3 deletions include/cuco/detail/open_addressing_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ class open_addressing_impl {
using key_type = Key; ///< Key type
using value_type = Value; ///< The storage value type, NOT payload type
/// Extent type
using extent_type = decltype(make_valid_extent<cg_size, window_size>(std::declval<Extent>()));
using extent_type = decltype(make_window_extent<open_addressing_impl>(std::declval<Extent>()));
using size_type = typename extent_type::value_type; ///< Size type
using key_equal = KeyEqual; ///< Key equality comparator type
using storage_type =
Expand All @@ -103,7 +103,7 @@ class open_addressing_impl {
* capacity, sentinel values and CUDA stream.
*
* @note The actual capacity depends on the given `capacity`, the probing scheme, CG size, and the
* window size and it's computed via `make_valid_extent` factory. Insert operations will not
* window size and it is computed via the `make_window_extent` factory. Insert operations will not
* automatically grow the container. Attempting to insert more unique keys than the capacity of
* the container results in undefined behavior.
* @note Any `*_sentinel`s are reserved and behavior is undefined when attempting to insert
Expand All @@ -130,7 +130,7 @@ class open_addressing_impl {
empty_slot_sentinel_{empty_slot_sentinel},
predicate_{pred},
probing_scheme_{probing_scheme},
storage_{make_valid_extent<cg_size, window_size>(capacity), alloc}
storage_{make_window_extent<open_addressing_impl>(capacity), alloc}
{
this->clear_async(stream);
}
Expand Down
24 changes: 16 additions & 8 deletions include/cuco/detail/open_addressing_ref_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#pragma once

#include <cuco/detail/equal_wrapper.cuh>
#include <cuco/extent.cuh>
#include <cuco/pair.cuh>

#include <thrust/distance.h>
Expand Down Expand Up @@ -62,6 +63,13 @@ class open_addressing_ref_impl {
ProbingScheme>,
"ProbingScheme must inherit from cuco::detail::probing_scheme_base");

static_assert(is_window_extent_v<typename StorageRef::extent_type>,
"Extent is not a valid cuco::window_extent");
static_assert(ProbingScheme::cg_size == StorageRef::extent_type::cg_size,
"Extent has incompatible CG size");
static_assert(StorageRef::window_size == StorageRef::extent_type::window_size,
"Extent has incompatible window size");

public:
using key_type = Key; ///< Key type
using probing_scheme_type = ProbingScheme; ///< Type of probing scheme
Expand Down Expand Up @@ -138,7 +146,7 @@ class open_addressing_ref_impl {
Predicate const& predicate) noexcept
{
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
auto probing_iter = probing_scheme_(key, storage_ref_.num_windows());
auto probing_iter = probing_scheme_(key, storage_ref_.window_extent());

while (true) {
auto const window_slots = storage_ref_[*probing_iter];
Expand Down Expand Up @@ -180,7 +188,7 @@ class open_addressing_ref_impl {
value_type const& value,
Predicate const& predicate) noexcept
{
auto probing_iter = probing_scheme_(group, key, storage_ref_.num_windows());
auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent());

while (true) {
auto const window_slots = storage_ref_[*probing_iter];
Expand Down Expand Up @@ -244,7 +252,7 @@ class open_addressing_ref_impl {
Predicate const& predicate) noexcept
{
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
auto probing_iter = probing_scheme_(key, storage_ref_.num_windows());
auto probing_iter = probing_scheme_(key, storage_ref_.window_extent());

while (true) {
auto const window_slots = storage_ref_[*probing_iter];
Expand Down Expand Up @@ -301,7 +309,7 @@ class open_addressing_ref_impl {
value_type const& value,
Predicate const& predicate) noexcept
{
auto probing_iter = probing_scheme_(group, key, storage_ref_.num_windows());
auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent());

while (true) {
auto const window_slots = storage_ref_[*probing_iter];
Expand Down Expand Up @@ -375,7 +383,7 @@ class open_addressing_ref_impl {
Predicate const& predicate) const noexcept
{
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
auto probing_iter = probing_scheme_(key, storage_ref_.num_windows());
auto probing_iter = probing_scheme_(key, storage_ref_.window_extent());

while (true) {
// TODO atomic_ref::load if insert operator is present
Expand Down Expand Up @@ -413,7 +421,7 @@ class open_addressing_ref_impl {
ProbeKey const& key,
Predicate const& predicate) const noexcept
{
auto probing_iter = probing_scheme_(group, key, storage_ref_.num_windows());
auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent());

while (true) {
auto const window_slots = storage_ref_[*probing_iter];
Expand Down Expand Up @@ -455,7 +463,7 @@ class open_addressing_ref_impl {
Predicate const& predicate) const noexcept
{
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
auto probing_iter = probing_scheme_(key, storage_ref_.num_windows());
auto probing_iter = probing_scheme_(key, storage_ref_.window_extent());

while (true) {
// TODO atomic_ref::load if insert operator is present
Expand Down Expand Up @@ -497,7 +505,7 @@ class open_addressing_ref_impl {
ProbeKey const& key,
Predicate const& predicate) const noexcept
{
auto probing_iter = probing_scheme_(group, key, storage_ref_.num_windows());
auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent());

while (true) {
auto const window_slots = storage_ref_[*probing_iter];
Expand Down
27 changes: 17 additions & 10 deletions include/cuco/detail/probing_scheme_impl.inl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

#pragma once

#include <cuco/detail/utils.cuh>

namespace cuco {
namespace experimental {
namespace detail {
Expand Down Expand Up @@ -97,9 +99,10 @@ __host__ __device__ constexpr auto linear_probing<CGSize, Hash>::operator()(
ProbeKey const& probe_key, Extent upper_bound) const noexcept
{
using size_type = typename Extent::value_type;
return detail::probing_iterator<Extent>{static_cast<size_type>(hash_(probe_key) % upper_bound),
1, // step size is 1
upper_bound};
return detail::probing_iterator<Extent>{
cuco::detail::sanitize_hash<size_type>(hash_(probe_key)) % upper_bound,
1, // step size is 1
upper_bound};
}

template <int32_t CGSize, typename Hash>
Expand All @@ -111,7 +114,7 @@ __host__ __device__ constexpr auto linear_probing<CGSize, Hash>::operator()(
{
using size_type = typename Extent::value_type;
return detail::probing_iterator<Extent>{
static_cast<size_type>((hash_(probe_key) + g.thread_rank()) % upper_bound),
cuco::detail::sanitize_hash<size_type>(hash_(probe_key) + g.thread_rank()) % upper_bound,
cg_size,
upper_bound};
}
Expand All @@ -130,9 +133,10 @@ __host__ __device__ constexpr auto double_hashing<CGSize, Hash1, Hash2>::operato
{
using size_type = typename Extent::value_type;
return detail::probing_iterator<Extent>{
static_cast<size_type>(hash1_(probe_key) % upper_bound),
static_cast<size_type>(hash2_(probe_key) % (upper_bound - 1) +
1), // step size in range [1, prime - 1]
cuco::detail::sanitize_hash<size_type>(hash1_(probe_key)) % upper_bound,
max(size_type{1},
cuco::detail::sanitize_hash<size_type>(hash2_(probe_key)) %
upper_bound), // step size in range [1, prime - 1]
upper_bound};
}

Expand All @@ -145,9 +149,12 @@ __host__ __device__ constexpr auto double_hashing<CGSize, Hash1, Hash2>::operato
{
using size_type = typename Extent::value_type;
return detail::probing_iterator<Extent>{
static_cast<size_type>((hash1_(probe_key) + g.thread_rank()) % upper_bound),
static_cast<size_type>((hash2_(probe_key) % (upper_bound / cg_size - 1) + 1) * cg_size),
upper_bound};
cuco::detail::sanitize_hash<size_type>(hash1_(probe_key) + g.thread_rank()) % upper_bound,
static_cast<size_type>((cuco::detail::sanitize_hash<size_type>(hash2_(probe_key)) %
(upper_bound.value() / cg_size - 1) +
1) *
cg_size),
upper_bound}; // TODO use fast_int operator
}
} // namespace experimental
} // namespace cuco
20 changes: 15 additions & 5 deletions include/cuco/detail/storage/aow_storage.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ class aow_storage_base : public storage_base<Extent> {
*
* @return The total number of slot windows
*/
[[nodiscard]] __host__ __device__ constexpr extent_type num_windows() const noexcept
[[nodiscard]] __host__ __device__ constexpr size_type num_windows() const noexcept
{
return storage_base<Extent>::capacity();
}
Expand All @@ -77,9 +77,19 @@ class aow_storage_base : public storage_base<Extent> {
*
* @return The total number of slots
*/
[[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept
[[nodiscard]] __host__ __device__ constexpr size_type capacity() const noexcept
{
return storage_base<Extent>::capacity().template multiply<window_size>();
return storage_base<Extent>::capacity() * window_size;
}

/**
* @brief Gets the window extent of the current storage.
*
* @return The window extent.
*/
[[nodiscard]] __host__ __device__ constexpr extent_type window_extent() const noexcept
{
return storage_base<Extent>::extent();
}
};

Expand Down Expand Up @@ -278,7 +288,7 @@ class aow_storage : public aow_storage_base<WindowSize, T, Extent> {
* @brief Constructor of AoW storage.
*
* @note The input `size` should be exclusively determined by the return value of
* `make_valid_extent` since it depends on the requested low-bound value, the probing scheme, and
* `make_window_extent` since it depends on the requested low-bound value, the probing scheme, and
* the storage.
*
* @param size Number of windows to (de)allocate
Expand Down Expand Up @@ -325,7 +335,7 @@ class aow_storage : public aow_storage_base<WindowSize, T, Extent> {
*/
[[nodiscard]] constexpr ref_type ref() const noexcept
{
return ref_type{this->num_windows(), this->data()};
return ref_type{this->window_extent(), this->data()};
}

/**
Expand Down
6 changes: 3 additions & 3 deletions include/cuco/detail/storage/counter_storage.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ namespace detail {
template <typename SizeType, cuda::thread_scope Scope, typename Allocator>
class counter_storage : public storage_base<cuco::experimental::extent<SizeType, 1>> {
public:
using storage_base<cuco::experimental::extent<SizeType, 1>>::capacity_; ///< Storage size
using storage_base<cuco::experimental::extent<SizeType, 1>>::capacity; ///< Storage size

using size_type = SizeType; ///< Size type
using value_type = cuda::atomic<size_type, Scope>; ///< Type of the counter
Expand All @@ -56,8 +56,8 @@ class counter_storage : public storage_base<cuco::experimental::extent<SizeType,
: storage_base<cuco::experimental::extent<SizeType, 1>>{cuco::experimental::extent<size_type,
1>{}},
allocator_{allocator},
counter_deleter_{capacity_, allocator_},
counter_{allocator_.allocate(capacity_), counter_deleter_}
counter_deleter_{this->capacity(), allocator_},
counter_{allocator_.allocate(this->capacity()), counter_deleter_}
{
}

Expand Down
Loading

0 comments on commit 3fe5704

Please sign in to comment.