From ed620ab4b885676991699c5823a802fd0ad90f99 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 5 Apr 2023 17:45:56 -0700 Subject: [PATCH] Add data structure base classes and `cuco::static_set` (#278) This is the first PR related to #110. It introduces the concept of: - New probing scheme via probing iterator - Array of Windows storage instead of flat storage to better deal with memory bandwidth-bound workload when hash collisions are present - Dynamic and static extent type for efficient probing - Mixin to encode concurrent device operators - Synchronous and asynchronous host bulk APIs This PR also adds `cuco::static_set` to evaluate the new design. For now, only 2 basic operations, `insert` and `contains`, are supported. --------- Co-authored-by: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- README.md | 8 + benchmarks/CMakeLists.txt | 7 + .../hash_table/static_set/contains_bench.cu | 75 ++++ .../hash_table/static_set/insert_bench.cu | 81 ++++ .../hash_table/static_set/size_bench.cu | 62 ++++ examples/CMakeLists.txt | 2 + examples/static_set/device_ref_example.cu | 116 ++++++ examples/static_set/host_bulk_example.cu | 72 ++++ include/cuco/detail/equal_wrapper.cuh | 95 +++++ include/cuco/detail/operator.inl | 59 +++ include/cuco/detail/prime.hpp | 1 + include/cuco/detail/probing_scheme_base.cuh | 42 +++ include/cuco/detail/probing_scheme_impl.inl | 146 ++++++++ include/cuco/detail/static_set/kernels.cuh | 287 +++++++++++++++ include/cuco/detail/static_set/static_set.inl | 249 +++++++++++++ .../cuco/detail/static_set/static_set_ref.inl | 348 ++++++++++++++++++ include/cuco/detail/storage/aow_storage.cuh | 247 +++++++++++++ .../cuco/detail/storage/counter_storage.cuh | 112 ++++++ include/cuco/detail/storage/kernels.cuh | 55 +++ include/cuco/detail/storage/storage.cuh | 62 ++++ include/cuco/detail/storage/storage_base.cuh | 91 +++++ 
include/cuco/detail/tuning.cuh | 27 ++ include/cuco/detail/utils.cuh | 36 +- include/cuco/detail/utils.hpp | 43 ++- include/cuco/extent.cuh | 149 ++++++++ include/cuco/operator.hpp | 40 ++ include/cuco/probing_scheme.cuh | 153 ++++++++ include/cuco/static_set.cuh | 289 +++++++++++++++ include/cuco/static_set_ref.cuh | 131 +++++++ include/cuco/storage.cuh | 47 +++ include/cuco/utility/traits.hpp | 8 +- tests/CMakeLists.txt | 17 +- tests/static_set/capacity_test.cu | 112 ++++++ tests/static_set/heterogeneous_lookup_test.cu | 120 ++++++ tests/static_set/large_input_test.cu | 88 +++++ tests/static_set/size_test.cu | 42 +++ tests/static_set/unique_sequence_test.cu | 110 ++++++ tests/utility/extent_test.cu | 56 +++ tests/utility/storage_test.cu | 90 +++++ 39 files changed, 3768 insertions(+), 7 deletions(-) create mode 100644 benchmarks/hash_table/static_set/contains_bench.cu create mode 100644 benchmarks/hash_table/static_set/insert_bench.cu create mode 100644 benchmarks/hash_table/static_set/size_bench.cu create mode 100644 examples/static_set/device_ref_example.cu create mode 100644 examples/static_set/host_bulk_example.cu create mode 100644 include/cuco/detail/equal_wrapper.cuh create mode 100644 include/cuco/detail/operator.inl create mode 100644 include/cuco/detail/probing_scheme_base.cuh create mode 100644 include/cuco/detail/probing_scheme_impl.inl create mode 100644 include/cuco/detail/static_set/kernels.cuh create mode 100644 include/cuco/detail/static_set/static_set.inl create mode 100644 include/cuco/detail/static_set/static_set_ref.inl create mode 100644 include/cuco/detail/storage/aow_storage.cuh create mode 100644 include/cuco/detail/storage/counter_storage.cuh create mode 100644 include/cuco/detail/storage/kernels.cuh create mode 100644 include/cuco/detail/storage/storage.cuh create mode 100644 include/cuco/detail/storage/storage_base.cuh create mode 100644 include/cuco/detail/tuning.cuh create mode 100644 include/cuco/extent.cuh create mode 100644 
include/cuco/operator.hpp create mode 100644 include/cuco/probing_scheme.cuh create mode 100644 include/cuco/static_set.cuh create mode 100644 include/cuco/static_set_ref.cuh create mode 100644 include/cuco/storage.cuh create mode 100644 tests/static_set/capacity_test.cu create mode 100644 tests/static_set/heterogeneous_lookup_test.cu create mode 100644 tests/static_set/large_input_test.cu create mode 100644 tests/static_set/size_test.cu create mode 100644 tests/static_set/unique_sequence_test.cu create mode 100644 tests/utility/extent_test.cu create mode 100644 tests/utility/storage_test.cu diff --git a/README.md b/README.md index 9f3984bc5..ef294f838 100644 --- a/README.md +++ b/README.md @@ -181,6 +181,14 @@ class example_class { We plan to add many GPU-accelerated, concurrent data structures to `cuCollections`. As of now, the two flagships are variants of hash tables. +### `static_set` + +`cuco::static_set` is a fixed-size container that stores unique elements in no particular order. See the Doxygen documentation in `static_set.cuh` for more detailed information. + +#### Examples: +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/jnjcdG16c)) +- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/EGMj6qx73)) + ### `static_map` `cuco::static_map` is a fixed-size hash table using open addressing with linear probing. See the Doxygen documentation in `static_map.cuh` for more detailed information. 
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 13f1e3be0..750c9be86 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -46,6 +46,13 @@ endfunction(ConfigureBench) ### benchmark sources ############################################################################# ################################################################################################### +################################################################################################### +# - static_set benchmarks ------------------------------------------------------------------------- +ConfigureBench(STATIC_SET_BENCH + hash_table/static_set/contains_bench.cu + hash_table/static_set/insert_bench.cu + hash_table/static_set/size_bench.cu) + ################################################################################################### # - static_map benchmarks ------------------------------------------------------------------------- ConfigureBench(STATIC_MAP_BENCH diff --git a/benchmarks/hash_table/static_set/contains_bench.cu b/benchmarks/hash_table/static_set/contains_bench.cu new file mode 100644 index 000000000..b0c0f34f4 --- /dev/null +++ b/benchmarks/hash_table/static_set/contains_bench.cu @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::contains` performance + */ +template +void static_set_contains(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + set.insert(keys.begin(), keys.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + set.contains(keys.begin(), keys.end(), result.begin(), launch.get_stream()); + }); +} + +NVBENCH_BENCH_TYPES(static_set_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_contains_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_contains_unique_matching_rate") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/static_set/insert_bench.cu b/benchmarks/hash_table/static_set/insert_bench.cu new file mode 100644 index 000000000..cb5dcf1f8 --- /dev/null +++ 
b/benchmarks/hash_table/static_set/insert_bench.cu @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::insert` performance + */ +template +void static_set_insert(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + cuco::experimental::static_set set{ + size, cuco::empty_key{-1}, {}, {}, {}, launch.get_stream()}; + + timer.start(); + set.insert(keys.begin(), keys.end(), launch.get_stream()); + timer.stop(); + }); +} + +NVBENCH_BENCH_TYPES(static_set_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_uniform_multiplicity") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", 
defaults::MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_gaussian_skew") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Skew", defaults::SKEW_RANGE); diff --git a/benchmarks/hash_table/static_set/size_bench.cu b/benchmarks/hash_table/static_set/size_bench.cu new file mode 100644 index 000000000..ded20fe04 --- /dev/null +++ b/benchmarks/hash_table/static_set/size_bench.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::size` performance + */ +template +void static_set_size(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + state.add_element_count(num_keys); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + + set.insert(keys.begin(), keys.end()); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto const size = set.size(launch.get_stream()); }); +} + +NVBENCH_BENCH_TYPES(static_set_size, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_size_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9e02d62ba..d78627eee 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -33,6 +33,8 @@ endfunction(ConfigureExample) ### Example sources ############################################################################### ################################################################################################### +ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu") +ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu") ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu") 
ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu") ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") diff --git a/examples/static_set/device_ref_example.cu b/examples/static_set/device_ref_example.cu new file mode 100644 index 000000000..0179baa83 --- /dev/null +++ b/examples/static_set/device_ref_example.cu @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +#include + +#include +#include + +// insert a set of keys into a hash set using one cooperative group for each task +template +__global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::size_t n) +{ + namespace cg = cooperative_groups; + + constexpr auto cg_size = SetRef::cg_size; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + + int64_t const loop_stride = gridDim.x * blockDim.x / cg_size; + int64_t idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; + + while (idx < n) { + set.insert(tile, *(keys + idx)); + idx += loop_stride; + } +} + +template +__global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, OutputIterator found) +{ + int64_t const loop_stride = gridDim.x * blockDim.x; + int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; + + while (idx < n) { + found[idx] = set.contains(*(keys + idx)); + idx += loop_stride; + } +} + +/** + * @file device_reference_example.cu + * @brief Demonstrates usage of the static_set device-side APIs. + * + * static_set provides a non-owning reference which can be used to interact with + * the container from within device code. + * + */ +int main(void) +{ + using Key = int; + + // Empty slots are represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + Key constexpr empty_key_sentinel = -1; + + // Number of keys to be inserted + std::size_t constexpr num_keys = 50'000; + + // Compute capacity based on a 50% load factor + auto constexpr load_factor = 0.5; + std::size_t const capacity = std::ceil(num_keys / load_factor); + + using set_type = cuco::experimental::static_set; + + // Constructs a hash set with at least "capacity" slots using -1 as the empty key sentinel. 
+ set_type set{capacity, cuco::empty_key{empty_key_sentinel}}; + + // Create a sequence of keys {0, 1, 2, .., i} + thrust::device_vector keys(num_keys); + thrust::sequence(keys.begin(), keys.end(), 0); + + // Insert the first half of the keys into the set + set.insert(keys.begin(), keys.begin() + num_keys / 2); + + // Insert the second half of keys using a custom CUDA kernel. + custom_cooperative_insert<<<128, 128>>>( + set.ref(cuco::experimental::insert), keys.begin() + num_keys / 2, num_keys / 2); + + // Storage for result + thrust::device_vector found(num_keys); + + // Check if all keys are now contained in the set. Note that we pass a reference that already has + // the `contains` operator. + // In general, using two or more reference objects to the same container but with + // a different set of operators concurrently is undefined behavior. + // This does not apply here since the two kernels do not overlap. + custom_contains<<<128, 128>>>( + set.ref(cuco::experimental::contains), keys.begin(), num_keys, found.begin()); + + // Verify that all keys have been found + bool const all_keys_found = thrust::all_of(found.begin(), found.end(), thrust::identity()); + + if (all_keys_found) { std::cout << "Success! Found all keys.\n"; } + + return 0; +} diff --git a/examples/static_set/host_bulk_example.cu b/examples/static_set/host_bulk_example.cu new file mode 100644 index 000000000..3b8c4deb4 --- /dev/null +++ b/examples/static_set/host_bulk_example.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include + +/** + * @file host_bulk_example.cu + * @brief Demonstrates usage of the static_set "bulk" host APIs. + * + * The bulk APIs are only invocable from the host and are used for doing operations like `insert` or + * `contains` on a set of keys. + * + */ +int main(void) +{ + using Key = int; + + // Empty slots are represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + Key constexpr empty_key_sentinel = -1; + + // Number of keys to be inserted + std::size_t constexpr num_keys = 50'000; + + // Compute capacity based on a 50% load factor + auto constexpr load_factor = 0.5; + std::size_t const capacity = std::ceil(num_keys / load_factor); + + // Constructs a set with at least `capacity` slots using -1 as the empty keys sentinel. + cuco::experimental::static_set set{capacity, cuco::empty_key{empty_key_sentinel}}; + + // Create a sequence of keys {0, 1, 2, .., i} + thrust::device_vector keys(num_keys); + thrust::sequence(keys.begin(), keys.end(), 0); + + // Inserts all keys into the hash set + set.insert(keys.begin(), keys.end()); + + // Storage for result + thrust::device_vector found(num_keys); + + // Check if all keys are contained in the set + set.contains(keys.begin(), keys.end(), found.begin()); + + // Verify that all keys have been found + bool const all_keys_found = thrust::all_of(found.begin(), found.end(), thrust::identity()); + + if (all_keys_found) { std::cout << "Success! 
Found all keys.\n"; } + + return 0; +} diff --git a/include/cuco/detail/equal_wrapper.cuh b/include/cuco/detail/equal_wrapper.cuh new file mode 100644 index 000000000..1774e0bf3 --- /dev/null +++ b/include/cuco/detail/equal_wrapper.cuh @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Enum of equality comparison results. + */ +enum class equal_result : int32_t { UNEQUAL = 0, EMPTY = 1, EQUAL = 2 }; + +/** + * @brief Equality wrapper. + * + * User-provided equality binary callable cannot be used to compare against sentinel value. + * + * @tparam T Right-hand side Element type + * @tparam Equal Type of user-provided equality binary callable + */ +template +struct equal_wrapper { + T sentinel_; ///< Sentinel value + Equal equal_; ///< Custom equality callable + + /** + * @brief Equality wrapper ctor. + * + * @param sentinel Sentinel value + * @param equal Equality binary callable + */ + __host__ __device__ constexpr equal_wrapper(T sentinel, Equal const& equal) + : sentinel_{sentinel}, equal_{equal} + { + } + + /** + * @brief Equality check with the given equality callable. 
+ * + * @tparam LHS Left-hand side Element type + * @tparam RHS Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * @return Three way equality comparison result + */ + template + __device__ constexpr equal_result equal_to(LHS const& lhs, RHS const& rhs) const noexcept + { + return equal_(lhs, rhs) ? equal_result::EQUAL : equal_result::UNEQUAL; + } + + /** + * @brief Order-sensitive equality operator. + * + * This function always compares the left-hand side element against `sentinel_` value first + * then perform a equality check with the given `equal_` callable, i.e., `equal_(lhs, rhs)`. + * + * @note Container (like set or map) keys MUST be always on the left-hand side. + * + * @tparam U Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * @return Three way equality comparison result + */ + template + __device__ constexpr equal_result operator()(T const& lhs, U const& rhs) const noexcept + { + return cuco::detail::bitwise_compare(lhs, sentinel_) ? equal_result::EMPTY + : this->equal_to(lhs, rhs); + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/operator.inl b/include/cuco/detail/operator.inl new file mode 100644 index 000000000..fdd5884e8 --- /dev/null +++ b/include/cuco/detail/operator.inl @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief CRTP mixin which augments a given `Reference` with an `Operator`. + * + * @throw If the operator is not defined in `include/cuco/operator.hpp` + * + * @tparam Operator Operator type, i.e., `cuco::op::*_tag` + * @tparam Reference The reference type. + * + * @note This primary template should never be instantiated. + */ +template +class operator_impl { + static_assert(cuco::dependent_false, + "Operator type is not supported by reference type."); +}; + +/** + * @brief Checks if the given `Operator` is contained in a list of `Operators`. + * + * @tparam Operator Operator type, i.e., `cuco::op::*_tag` + * @tparam Operators List of operators to search in + * + * @return `true` if `Operator` is contained in `Operators`, `false` otherwise. 
+ */ +template +static constexpr bool has_operator() +{ + return ((std::is_same_v) || ...); +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index 1180035ae..186a29257 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace cuco { namespace detail { diff --git a/include/cuco/detail/probing_scheme_base.cuh b/include/cuco/detail/probing_scheme_base.cuh new file mode 100644 index 000000000..03f712155 --- /dev/null +++ b/include/cuco/detail/probing_scheme_base.cuh @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Base class of public probing scheme. + * + * This class should not be used directly. + * + * @tparam CGSize Size of CUDA Cooperative Groups + */ +template +class probing_scheme_base { + public: + /** + * @brief The size of the CUDA cooperative thread group. 
+ */ + static constexpr int32_t cg_size = CGSize; +}; +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme_impl.inl new file mode 100644 index 000000000..4f7e904a1 --- /dev/null +++ b/include/cuco/detail/probing_scheme_impl.inl @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Probing iterator class. 
+ * + * @tparam Extent Type of Extent + */ +template +class probing_iterator { + public: + using extent_type = Extent; ///< Extent type + using size_type = typename extent_type::value_type; ///< Size type + + /** + * @brief Constructs an probing iterator + * + * @param start Iteration starting point + * @param step_size Double hashing step size + * @param upper_bound Upper bound of the iteration + */ + __host__ __device__ constexpr probing_iterator(size_type start, + size_type step_size, + extent_type upper_bound) noexcept + : curr_index_{start}, step_size_{step_size}, upper_bound_{upper_bound} + { + // TODO: revise this API when introducing quadratic probing into cuco + } + + /** + * @brief Dereference operator + * + * @return Current slot ndex + */ + __host__ __device__ constexpr auto operator*() const noexcept { return curr_index_; } + + /** + * @brief Prefix increment operator + * + * @return Current iterator + */ + __host__ __device__ constexpr auto operator++() noexcept + { + // TODO: step_size_ can be a build time constant (e.g. linear probing) + // Worth passing another extent type? 
+ curr_index_ = (curr_index_ + step_size_) % upper_bound_; + return *this; + } + + /** + * @brief Postfix increment operator + * + * @return Old iterator before increment + */ + __host__ __device__ constexpr auto operator++(int32_t) noexcept + { + auto temp = *this; + ++(*this); + return temp; + } + + private: + size_type curr_index_; + size_type step_size_; + extent_type upper_bound_; +}; +} // namespace detail + +template +__host__ __device__ constexpr linear_probing::linear_probing(Hash const& hash) + : hash_{hash} +{ +} + +template +template +__host__ __device__ constexpr auto linear_probing::operator()( + ProbeKey const& probe_key, Extent upper_bound) const noexcept +{ + return detail::probing_iterator{hash_(probe_key) % upper_bound, + 1, // step size is 1 + upper_bound}; +} + +template +template +__host__ __device__ constexpr auto linear_probing::operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept +{ + return detail::probing_iterator{ + (hash_(probe_key) + g.thread_rank()) % upper_bound, cg_size, upper_bound}; +} + +template +__host__ __device__ constexpr double_hashing::double_hashing( + Hash1 const& hash1, Hash2 const& hash2) + : hash1_{hash1}, hash2_{hash2} +{ +} + +template +template +__host__ __device__ constexpr auto double_hashing::operator()( + ProbeKey const& probe_key, Extent upper_bound) const noexcept +{ + return detail::probing_iterator{ + hash1_(probe_key) % upper_bound, + hash2_(probe_key) % (upper_bound - 1) + 1, // step size in range [1, prime - 1] + upper_bound}; +} + +template +template +__host__ __device__ constexpr auto double_hashing::operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept +{ + return detail::probing_iterator{ + (hash1_(probe_key) + g.thread_rank()) % upper_bound, + (hash2_(probe_key) % (upper_bound / cg_size - 1) + 1) * cg_size, + upper_bound}; +} +} // namespace 
experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/kernels.cuh b/include/cuco/detail/static_set/kernels.cuh new file mode 100644 index 000000000..e7d52ae27 --- /dev/null +++ b/include/cuco/detail/static_set/kernels.cuh @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Inserts all elements in the range `[first, first + n)` and returns the number of + * successful insertions. + * + * If multiple elements in `[first, first + size)` compare equal, it is unspecified which + * element is inserted. 
+ * + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam AtomicT Atomic counter type + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param num_successes Number of successful inserted elements + * @param ref Non-owing set device ref used to access the slot storage + */ +template +__global__ void insert(InputIterator first, + cuco::detail::index_type n, + AtomicT* num_successes, + Ref ref) +{ + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + typename Ref::size_type thread_num_successes = 0; + + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; + cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; + + while (idx < n) { + typename Ref::value_type const insert_pair{*(first + idx)}; + if (ref.insert(insert_pair)) { thread_num_successes++; }; + idx += loop_stride; + } + + // compute number of successfully inserted elements for each block + // and atomically add to the grand total + typename Ref::size_type block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } +} + +/** + * @brief Inserts all elements in the range `[first, first + n)`. + * + * If multiple elements in `[first, first + n)` compare equal, it is unspecified which + * element is inserted. 
+ * + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param ref Non-owning set device ref used to access the slot storage + */ +template +__global__ void insert_async(InputIterator first, cuco::detail::index_type n, Ref ref) +{ + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; + cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; + + while (idx < n) { + typename Ref::value_type const insert_pair{*(first + idx)}; + ref.insert(insert_pair); + idx += loop_stride; + } +} + +/** + * @brief Inserts all elements in the range `[first, first + n)` and returns the number of + * successful insertions. + * + * If multiple elements in `[first, first + n)` compare equal, it is unspecified which + * element is inserted. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam AtomicT Atomic counter type + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param num_successes Number of successful inserted elements + * @param ref Non-owing set device ref used to access the slot storage + */ +template +__global__ void insert(InputIterator first, + cuco::detail::index_type n, + AtomicT* num_successes, + Ref ref) +{ + namespace cg = cooperative_groups; + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + typename Ref::size_type thread_num_successes = 0; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; + cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + + while (idx < n) { + typename Ref::value_type const insert_pair{*(first + idx)}; + if (ref.insert(tile, insert_pair) && tile.thread_rank() == 0) { thread_num_successes++; }; + idx += loop_stride; + } + + // compute number of successfully inserted elements for each block + // and atomically add to the grand total + typename Ref::size_type block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } +} + +/** + * @brief Inserts all elements in the range `[first, first + n)`. + * + * If multiple elements in `[first, first + n)` compare equal, it is unspecified which + * element is inserted. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param ref Non-owing set device ref used to access the slot storage + */ +template +__global__ void insert_async(InputIterator first, cuco::detail::index_type n, Ref ref) +{ + namespace cg = cooperative_groups; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; + cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + + while (idx < n) { + typename Ref::value_type const insert_pair{*(first + idx)}; + ref.insert(tile, insert_pair); + idx += loop_stride; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data + * structure. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the data + * structure. 
+ * + * @tparam BlockSize The size of the thread block + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param n Number of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param ref Non-owing set device ref used to access the slot storage + */ +template +__global__ void contains(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) +{ + namespace cg = cooperative_groups; + + auto const block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; + cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; + __shared__ bool output_buffer[BlockSize]; + + while (idx - thread_idx < n) { // the whole thread block falls into the same iteration + if (idx < n) { + auto const key = *(first + idx); + /* + * The ld.relaxed.gpu instruction used in this operation causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. + */ + output_buffer[thread_idx] = ref.contains(key); + } + + block.sync(); + if (idx < n) { *(output_begin + idx) = output_buffer[thread_idx]; } + idx += loop_stride; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data + * structure. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the data + * structure. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize The size of the thread block + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param n Number of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param ref Non-owing set device ref used to access the slot storage + */ +template +__global__ void contains(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) +{ + namespace cg = cooperative_groups; + + auto block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + + auto tile = cg::tiled_partition(cg::this_thread_block()); + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; + cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + + __shared__ bool output_buffer[BlockSize / CGSize]; + auto const tile_idx = thread_idx / CGSize; + + while (idx - thread_idx < n) { // the whole thread block falls into the same iteration + if (idx < n) { + auto const key = *(first + idx); + auto const found = ref.contains(tile, key); + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. 
+ */ + if (tile.thread_rank() == 0) { output_buffer[tile_idx] = found; } + } + + block.sync(); + if (idx < n and tile.thread_rank() == 0) { *(output_begin + idx) = output_buffer[tile_idx]; } + idx += loop_stride; + } +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl new file mode 100644 index 000000000..0443c1184 --- /dev/null +++ b/include/cuco/detail/static_set/static_set.inl @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cuco { +namespace experimental { + +template +constexpr static_set::static_set( + Extent capacity, + empty_key empty_key_sentinel, + KeyEqual pred, + ProbingScheme const& probing_scheme, + Allocator const& alloc, + cudaStream_t stream) + : empty_key_sentinel_{empty_key_sentinel}, + predicate_{pred}, + probing_scheme_{probing_scheme}, + allocator_{alloc}, + storage_{make_valid_extent(capacity), allocator_} +{ + storage_.initialize(empty_key_sentinel_, stream); +} + +template +template +static_set::size_type +static_set::insert( + InputIt first, InputIt last, cudaStream_t stream) +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return 0; } + + auto counter = detail::counter_storage{allocator_}; + counter.reset(stream); + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + if constexpr (cg_size == 1) { + detail::insert + <<>>( + first, num_keys, counter.data(), ref(op::insert)); + } else { + detail::insert + <<>>( + first, num_keys, counter.data(), ref(op::insert)); + } + + return counter.load_to_host(stream); +} + +template +template +void static_set::insert_async( + InputIt first, InputIt last, cudaStream_t stream) +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + if constexpr (cg_size == 1) { + detail::insert_async + <<>>(first, num_keys, ref(op::insert)); + } else { + detail::insert_async + <<>>(first, num_keys, ref(op::insert)); + } +} + +template +template +void static_set::contains( + InputIt first, InputIt last, OutputIt 
output_begin, cudaStream_t stream) const +{ + contains_async(first, last, output_begin, stream); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); +} + +template +template +void static_set::contains_async( + InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream) const +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + if constexpr (cg_size == 1) { + detail::contains + <<>>( + first, num_keys, output_begin, ref(op::contains)); + } else { + detail::contains + <<>>( + first, num_keys, output_begin, ref(op::contains)); + } +} + +template +static_set::size_type +static_set::size( + cudaStream_t stream) const +{ + auto const begin = thrust::make_transform_iterator( + storage_.data(), + cuco::detail::elements_per_window{empty_key_sentinel_}); + + std::size_t temp_storage_bytes = 0; + using temp_allocator_type = typename std::allocator_traits::rebind_alloc; + auto temp_allocator = temp_allocator_type{allocator_}; + auto d_size = reinterpret_cast( + std::allocator_traits::allocate(temp_allocator, sizeof(size_type))); + cub::DeviceReduce::Sum( + nullptr, temp_storage_bytes, begin, d_size, storage_.num_windows(), stream); + + auto d_temp_storage = + std::allocator_traits::allocate(temp_allocator, temp_storage_bytes); + + cub::DeviceReduce::Sum( + d_temp_storage, temp_storage_bytes, begin, d_size, storage_.num_windows(), stream); + + size_type h_size; + CUCO_CUDA_TRY( + cudaMemcpyAsync(&h_size, d_size, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); + std::allocator_traits::deallocate( + temp_allocator, reinterpret_cast(d_size), sizeof(size_type)); + std::allocator_traits::deallocate( + temp_allocator, d_temp_storage, temp_storage_bytes); + + return h_size; +} + +template 
+constexpr auto +static_set::capacity() + const noexcept +{ + return storage_.capacity(); +} + +template +constexpr static_set::key_type +static_set::empty_key_sentinel() + const noexcept +{ + return empty_key_sentinel_; +} + +template +template +auto static_set::ref( + Operators...) const noexcept +{ + static_assert(sizeof...(Operators), "No operators specified"); + return ref_type{ + cuco::empty_key(empty_key_sentinel_), predicate_, probing_scheme_, storage_.ref()}; +} +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/static_set_ref.inl b/include/cuco/detail/static_set/static_set_ref.inl new file mode 100644 index 000000000..51099243f --- /dev/null +++ b/include/cuco/detail/static_set/static_set_ref.inl @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace cuco { +namespace experimental { + +template +__host__ __device__ constexpr static_set_ref< + Key, + Scope, + KeyEqual, + ProbingScheme, + StorageRef, + Operators...>::static_set_ref(cuco::empty_key empty_key_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + StorageRef storage_ref) noexcept + : empty_key_sentinel_{empty_key_sentinel}, + predicate_{empty_key_sentinel, predicate}, + probing_scheme_{probing_scheme}, + storage_ref_{storage_ref} +{ +} + +template +__host__ __device__ constexpr auto +static_set_ref::capacity() + const noexcept +{ + return storage_ref_.capacity(); +} + +template +__host__ __device__ constexpr Key +static_set_ref::empty_key_sentinel() + const noexcept +{ + return empty_key_sentinel_; +} + +namespace detail { + +template +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + static constexpr auto thread_scope = base_type::thread_scope; + + public: + /** + * @brief Inserts an element. 
+ * + * @param value The element to insert + * @return True if the given element is successfully inserted + */ + __device__ bool insert(value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto probing_iter = ref_.probing_scheme_(value, ref_.storage_ref_.num_windows()); + + while (true) { + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + // TODO: perf gain with #pragma unroll since num_windows is build time constant + for (auto& slot_content : window_slots) { + auto const eq_res = ref_.predicate_(slot_content, value); + + // If the key is already in the container, return false + if (eq_res == detail::equal_result::EQUAL) { return false; } + if (eq_res == detail::equal_result::EMPTY) { + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + switch (attempt_insert( + (ref_.storage_ref_.data() + *probing_iter)->data() + intra_window_index, value)) { + case insert_result::CONTINUE: continue; + case insert_result::SUCCESS: return true; + case insert_result::DUPLICATE: return false; + } + } + } + ++probing_iter; + } + } + + /** + * @brief Inserts an element. 
+ * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * @return True if the given element is successfully inserted + */ + __device__ bool insert(cooperative_groups::thread_block_tile group, + value_type const& value) noexcept + { + auto& ref_ = static_cast(*this); + auto probing_iter = ref_.probing_scheme_(group, value, ref_.storage_ref_.num_windows()); + + while (true) { + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (ref_.predicate_(window_slots[i], value)) { + case detail::equal_result::EMPTY: return cuco::pair{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: return cuco::pair{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return cuco::pair{detail::equal_result::UNEQUAL, -1}; + }(); + + // If the key is already in the container, return false + if (group.any(state == detail::equal_result::EQUAL)) { return false; } + + auto const group_contains_empty = group.ballot(state == detail::equal_result::EMPTY); + + if (group_contains_empty) { + auto const src_lane = __ffs(group_contains_empty) - 1; + auto const status = + (group.thread_rank() == src_lane) + ? attempt_insert( + (ref_.storage_ref_.data() + *probing_iter)->data() + intra_window_index, value) + : insert_result::CONTINUE; + + switch (group.shfl(status, src_lane)) { + case insert_result::SUCCESS: return true; + case insert_result::DUPLICATE: return false; + default: continue; + } + } else { + ++probing_iter; + } + } + } + + private: + // TODO: this should be a common enum for all data structures + enum class insert_result : int32_t { CONTINUE = 0, SUCCESS = 1, DUPLICATE = 2 }; + + /** + * @brief Attempts to insert an element into a slot. 
+ * + * @note Dispatches the correct implementation depending on the container + * type and presence of other operator mixins. + * + * @param slot Pointer to the slot in memory + * @param value Element to insert + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + [[nodiscard]] __device__ insert_result attempt_insert(value_type* slot, value_type const& value) + { + auto& ref_ = static_cast(*this); + + // temporary workaround due to performance regression + // https://github.com/NVIDIA/libcudacxx/issues/366 + value_type const old = [&]() { + value_type expected = ref_.empty_key_sentinel_.value; + value_type val = value; + if constexpr (sizeof(value_type) == sizeof(uint32_t)) { + auto* expected_ptr = reinterpret_cast(&expected); + auto* value_ptr = reinterpret_cast(&val); + if constexpr (thread_scope == cuda::thread_scope_system) { + return atomicCAS_system(reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (thread_scope == cuda::thread_scope_device) { + return atomicCAS(reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (thread_scope == cuda::thread_scope_block) { + return atomicCAS_block(reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + if constexpr (sizeof(value_type) == sizeof(uint64_t)) { + auto* expected_ptr = reinterpret_cast(&expected); + auto* value_ptr = reinterpret_cast(&val); + if constexpr (thread_scope == cuda::thread_scope_system) { + return atomicCAS_system( + reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (thread_scope == cuda::thread_scope_device) { + return atomicCAS( + reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (thread_scope == cuda::thread_scope_block) { + return atomicCAS_block( + reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } 
+ }(); + if (*slot == old) { + // Shouldn't use `predicate_` operator directly since it includes a redundant bitwise compare + return ref_.predicate_.equal_to(old, value) == detail::equal_result::EQUAL + ? insert_result::DUPLICATE + : insert_result::CONTINUE; + } else { + return insert_result::SUCCESS; + } + } +}; + +template +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * If the probe key `key` was inserted into the container, returns + * true. Otherwise, returns false. + * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(ProbeKey const& key) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + + auto probing_iter = ref_.probing_scheme_(key, ref_.storage_ref_.num_windows()); + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + for (auto& slot_content : window_slots) { + switch (ref_.predicate_(slot_content, key)) { + case detail::equal_result::UNEQUAL: continue; + case detail::equal_result::EMPTY: return false; + case detail::equal_result::EQUAL: return true; + } + } + ++probing_iter; + } + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * If the probe key `key` was inserted into the container, returns + * true. Otherwise, returns false. 
+ * + * @tparam ProbeKey Probe key type + * + * @param g The Cooperative Group used to perform group contains + * @param key The key to search for + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(cooperative_groups::thread_block_tile const& g, + ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + + auto probing_iter = ref_.probing_scheme_(g, key, ref_.storage_ref_.num_windows()); + + while (true) { + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + auto const state = [&]() { + for (auto& slot : window_slots) { + switch (ref_.predicate_(slot, key)) { + case detail::equal_result::EMPTY: return detail::equal_result::EMPTY; + case detail::equal_result::EQUAL: return detail::equal_result::EQUAL; + default: continue; + } + } + return detail::equal_result::UNEQUAL; + }(); + + if (g.any(state == detail::equal_result::EQUAL)) { return true; } + if (g.any(state == detail::equal_result::EMPTY)) { return false; } + + ++probing_iter; + } + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/aow_storage.cuh b/include/cuco/detail/storage/aow_storage.cuh new file mode 100644 index 000000000..316f7fbe5 --- /dev/null +++ b/include/cuco/detail/storage/aow_storage.cuh @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Base class of array of slot windows open addressing storage. + * + * This should NOT be used directly. + * + * @tparam WindowSize Number of elements in each window + * @tparam T Element type + * @tparam Extent Type of extent denoting the number of windows + */ +template +class aow_storage_base : public storage_base { + public: + /** + * @brief The number of elements (slots) processed per window. + */ + static constexpr int32_t window_size = WindowSize; + + using extent_type = typename storage_base::extent_type; ///< Storage extent type + using size_type = typename storage_base::size_type; ///< Storage size type + + using value_type = T; ///< Slot type + using window_type = cuda::std::array; ///< Slot window type + + /** + * @brief Constructor of AoW base storage. + * + * @param size Number of windows to store + */ + explicit constexpr aow_storage_base(Extent size) : storage_base{size} {} + + /** + * @brief Gets the total number of slot windows in the current storage. + * + * @return The total number of slot windows + */ + [[nodiscard]] __host__ __device__ constexpr extent_type num_windows() const noexcept + { + return storage_base::capacity(); + } + + /** + * @brief Gets the total number of slots in the current storage. + * + * @return The total number of slots + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept + { + return storage_base::capacity().template multiply(); + } +}; + +/** + * @brief Non-owning AoW storage reference type. 
+ * + * @tparam WindowSize Number of slots in each window + * @tparam T Storage element type + * @tparam Extent Type of extent denoting storage capacity + */ +template +class aow_storage_ref : public aow_storage_base { + public: + using base_type = aow_storage_base; ///< AoW base class type + + using base_type::window_size; ///< Number of elements processed per window + + using extent_type = typename base_type::extent_type; ///< Storage extent type + using size_type = typename base_type::size_type; ///< Storage size type + using value_type = typename base_type::value_type; ///< Slot type + using window_type = typename base_type::window_type; ///< Slot window type + + using base_type::capacity; + using base_type::num_windows; + + /** + * @brief Constructor of AoW storage ref. + * + * @param num_windows Number of windows + * @param windows Pointer to the windows array + */ + explicit constexpr aow_storage_ref(Extent num_windows, window_type* windows) noexcept + : aow_storage_base{num_windows}, windows_{windows} + { + } + + /** + * @brief Gets windows array. + * + * @return Pointer to the first window + */ + [[nodiscard]] __device__ constexpr window_type* data() noexcept { return windows_; } + + /** + * @brief Gets windows array. + * + * @return Pointer to the first window + */ + [[nodiscard]] __device__ constexpr window_type* data() const noexcept { return windows_; } + + /** + * @brief Returns an array of slots (or a window) for a given index. + * + * @param index Index of the window + * @return An array of slots + */ + [[nodiscard]] __device__ constexpr window_type operator[](size_type index) const noexcept + { + return *reinterpret_cast( + __builtin_assume_aligned(this->data() + index, sizeof(value_type) * window_size)); + } + + private: + window_type* windows_; ///< Pointer to the windows array +}; + +/** + * @brief Array of slot Window open addressing storage class. 
+ * + * @tparam WindowSize Number of slots in each window + * @tparam T Slot type + * @tparam Extent Type of extent denoting number of windows + * @tparam Allocator Type of allocator used for device storage (de)allocation + */ +template +class aow_storage : public aow_storage_base { + public: + using base_type = aow_storage_base; ///< AoW base class type + + using base_type::window_size; ///< Number of elements processed per window + + using extent_type = typename base_type::extent_type; ///< Storage extent type + using size_type = typename base_type::size_type; ///< Storage size type + using value_type = typename base_type::value_type; ///< Slot type + using window_type = typename base_type::window_type; ///< Slot window type + + using base_type::capacity; + using base_type::num_windows; + + using allocator_type = + typename std::allocator_traits::rebind_alloc; ///< Type of the + ///< allocator to + ///< (de)allocate windows + using window_deleter_type = custom_deleter; ///< Type of window deleter + using ref_type = aow_storage_ref; ///< Storage ref type + + /** + * @brief Constructor of AoW storage. + * + * @note The input `size` should be exclusively determined by the return value of + * `make_valid_extent` since it depends on the requested low-bound value, the probing scheme, and + * the storage. + * + * @param size Number of windows to (de)allocate + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr aow_storage(Extent size, Allocator const& allocator) + : aow_storage_base{size}, + allocator_{allocator}, + window_deleter_{capacity(), allocator_}, + windows_{allocator_.allocate(capacity()), window_deleter_} + { + } + + aow_storage(aow_storage&&) = default; ///< Move constructor + /** + * @brief Replaces the contents of the storage with another storage. 
+ * + * @return Reference of the current storage object + */ + aow_storage& operator=(aow_storage&&) = default; + ~aow_storage() = default; ///< Destructor + + aow_storage(aow_storage const&) = delete; + aow_storage& operator=(aow_storage const&) = delete; + + /** + * @brief Gets windows array. + * + * @return Pointer to the first window + */ + [[nodiscard]] constexpr window_type* data() const noexcept { return windows_.get(); } + + /** + * @brief Gets window storage reference. + * + * @return Reference of window storage + */ + [[nodiscard]] constexpr ref_type ref() const noexcept + { + return ref_type{this->num_windows(), this->data()}; + } + + /** + * @brief Initializes each slot in the AoW storage to contain `key`. + * + * @param key Key to which all keys in `slots` are initialized + * @param stream Stream used for executing the kernel + */ + void initialize(value_type key, cudaStream_t stream) noexcept + { + auto constexpr stride = 4; + auto const grid_size = (this->num_windows() + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (stride * detail::CUCO_DEFAULT_BLOCK_SIZE); + + detail::initialize<<>>( + this->data(), this->num_windows(), key); + } + + private: + allocator_type allocator_; ///< Allocator used to (de)allocate windows + window_deleter_type window_deleter_; ///< Custom windows deleter + std::unique_ptr windows_; ///< Pointer to AoW storage +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/counter_storage.cuh b/include/cuco/detail/storage/counter_storage.cuh new file mode 100644 index 000000000..bf87357a3 --- /dev/null +++ b/include/cuco/detail/storage/counter_storage.cuh @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Device atomic counter storage class. + * + * @tparam SizeType Type of storage size + * @tparam Scope The scope in which the counter will be used by individual threads + * @tparam Allocator Type of allocator used for device storage + */ +template +class counter_storage : public storage_base> { + public: + using storage_base>::capacity_; ///< Storage size + + using size_type = SizeType; ///< Size type + using value_type = cuda::atomic; ///< Type of the counter + using allocator_type = typename std::allocator_traits::rebind_alloc< + value_type>; ///< Type of the allocator to (de)allocate counter + using counter_deleter_type = custom_deleter; ///< Type of counter deleter + + /** + * @brief Constructor of counter storage. + * + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr counter_storage(Allocator const& allocator) + : storage_base>{cuco::experimental::extent{}}, + allocator_{allocator}, + counter_deleter_{capacity_, allocator_}, + counter_{allocator_.allocate(capacity_), counter_deleter_} + { + } + + /** + * @brief Asynchronously resets counter to zero. + * + * @param stream CUDA stream used to reset + */ + void reset(cudaStream_t stream) + { + static_assert(sizeof(size_type) == sizeof(value_type)); + CUCO_CUDA_TRY(cudaMemsetAsync(this->data(), 0, sizeof(value_type), stream)); + } + + /** + * @brief Gets device atomic counter pointer. 
+ * + * @return Pointer to the device atomic counter + */ + [[nodiscard]] constexpr value_type* data() noexcept { return counter_.get(); } + + /** + * @brief Gets device atomic counter pointer. + * + * @return Pointer to the device atomic counter + */ + [[nodiscard]] constexpr value_type* data() const noexcept { return counter_.get(); } + + /** + * @brief Atomically obtains the value of the device atomic counter and copies it to the host. + * + * @note This API synchronizes the given `stream`. + * + * @param stream CUDA stream used to copy device value to the host + * @return Value of the atomic counter + */ + [[nodiscard]] constexpr size_type load_to_host(cudaStream_t stream) const + { + size_type h_count; + CUCO_CUDA_TRY( + cudaMemcpyAsync(&h_count, this->data(), sizeof(size_type), cudaMemcpyDeviceToHost, stream)); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); + return h_count; + } + + private: + allocator_type allocator_; ///< Allocator used to (de)allocate counter + counter_deleter_type counter_deleter_; ///< Custom counter deleter + std::unique_ptr counter_; ///< Pointer to counter storage +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/kernels.cuh b/include/cuco/detail/storage/kernels.cuh new file mode 100644 index 000000000..546c58daa --- /dev/null +++ b/include/cuco/detail/storage/kernels.cuh @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Initializes each slot in the window storage to contain `value`. + * + * @tparam WindowT Window type + * + * @param windows Pointer to flat storage for windows + * @param n Number of input windows + * @param value Value to which all values in `slots` are initialized + */ +template +__global__ void initialize(WindowT* windows, + cuco::detail::index_type n, + typename WindowT::value_type value) +{ + cuco::detail::index_type const loop_stride = gridDim.x * blockDim.x; + cuco::detail::index_type idx = blockDim.x * blockIdx.x + threadIdx.x; + + while (idx < n) { + auto& window_slots = *(windows + idx); +#pragma unroll + for (auto& slot : window_slots) { + slot = value; + } + idx += loop_stride; + } +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/storage.cuh b/include/cuco/detail/storage/storage.cuh new file mode 100644 index 000000000..b4fc86890 --- /dev/null +++ b/include/cuco/detail/storage/storage.cuh @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Intermediate class internally used by data structures + * + * @tparam StorageImpl Storage implementation class + * @tparam T Storage element type + * @tparam Extent Type of extent denoting number of windows + * @tparam Allocator Type of allocator used for device storage + */ +template +class storage : StorageImpl::template impl { + public: + /// Storage implementation type + using impl_type = typename StorageImpl::template impl; + using ref_type = typename impl_type::ref_type; ///< Storage ref type + using value_type = typename impl_type::value_type; ///< Storage value type + + /// Number of elements per window + static constexpr int window_size = impl_type::window_size; + + using impl_type::capacity; + using impl_type::data; + using impl_type::initialize; + using impl_type::num_windows; + using impl_type::ref; + + /** + * @brief Constructs storage. + * + * @param size Number of slots to (de)allocate + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr storage(Extent size, Allocator const& allocator) : impl_type{size, allocator} + { + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/storage_base.cuh b/include/cuco/detail/storage/storage_base.cuh new file mode 100644 index 000000000..dec443dce --- /dev/null +++ b/include/cuco/detail/storage/storage_base.cuh @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Custom deleter for unique pointer. + * + * @tparam Allocator Type of allocator used for device storage + */ +template +struct custom_deleter { + using pointer = typename Allocator::value_type*; ///< Value pointer type + + /** + * @brief Constructor of custom deleter. + * + * @param size Number of values to deallocate + * @param allocator Allocator used for deallocating device storage + */ + explicit constexpr custom_deleter(std::size_t size, Allocator& allocator) + : size_{size}, allocator_{allocator} + { + } + + /** + * @brief Operator for deallocation + * + * @param ptr Pointer to the first value for deallocation + */ + void operator()(pointer ptr) { allocator_.deallocate(ptr, size_); } + + std::size_t size_; ///< Number of values to delete + Allocator& allocator_; ///< Allocator used deallocating values +}; + +/** + * @brief Base class of open addressing storage. + * + * This class should not be used directly. + * + * @tparam Extent Type of extent denoting storage capacity + */ +template +class storage_base { + public: + using extent_type = Extent; ///< Storage extent type + using size_type = typename extent_type::value_type; ///< Storage size type + + /** + * @brief Constructor of base storage. + * + * @param size Number of elements to (de)allocate + */ + explicit constexpr storage_base(Extent size) : capacity_{size} {} + + /** + * @brief Gets the total number of elements in the current storage. 
+ * + * @return The total number of elements + */ + [[nodiscard]] __host__ __device__ constexpr extent_type capacity() const noexcept + { + return capacity_; + } + + protected: + extent_type capacity_; ///< Total number of elements +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/tuning.cuh b/include/cuco/detail/tuning.cuh new file mode 100644 index 000000000..035b60cc5 --- /dev/null +++ b/include/cuco/detail/tuning.cuh @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +namespace cuco { +namespace experimental { +namespace detail { + +static constexpr int CUCO_DEFAULT_BLOCK_SIZE = 128; +static constexpr int CUCO_DEFAULT_STRIDE = 1; + +} // namespace detail +} // namespace experimental +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh index 3aadbb848..ae55f7830 100644 --- a/include/cuco/detail/utils.cuh +++ b/include/cuco/detail/utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,8 @@ #pragma once +#include + #include namespace cuco { @@ -59,7 +61,7 @@ struct slot_to_tuple { */ template struct slot_is_filled { - Key empty_key_sentinel; ///< The value of the empty key sentinel + Key empty_key_sentinel_; ///< The value of the empty key sentinel /** * @brief Indicates if the target slot `s` is filled. @@ -72,7 +74,35 @@ struct slot_is_filled { template __device__ bool operator()(S const& s) { - return thrust::get<0>(s) != empty_key_sentinel; + return not cuco::detail::bitwise_compare(thrust::get<0>(s), empty_key_sentinel_); + } +}; + +/** + * @brief Device functor returning the number of filled elements per window. + * + * @tparam Sentinel Empty sentinel type + */ +template +struct elements_per_window { + Sentinel empty_key_sentinel_; ///< The value of the empty key sentinel + + /** + * @brief Computes the number of filled elements per window. + * + * @tparam Window Window storage type + * + * @param window The window to query + * @return Number of filled elements per window + */ + template + __device__ inline int32_t operator()(Window const& window) const + { + int32_t num = 0; + for (auto const& element : window) { + num += not cuco::detail::bitwise_compare(element, empty_key_sentinel_); + } + return num; } }; diff --git a/include/cuco/detail/utils.hpp b/include/cuco/detail/utils.hpp index d06216c54..513ccd559 100644 --- a/include/cuco/detail/utils.hpp +++ b/include/cuco/detail/utils.hpp @@ -23,6 +23,8 @@ namespace cuco { namespace detail { +using index_type = int64_t; ///< index type for internal use + /** * @brief Compute the number of bits of a simple type. 
* @@ -56,13 +58,50 @@ auto get_grid_size(Kernel kernel, std::size_t block_size, std::size_t dynamic_sm } template -constexpr inline int64_t distance(Iterator begin, Iterator end) +constexpr inline index_type distance(Iterator begin, Iterator end) { using category = typename std::iterator_traits::iterator_category; static_assert(std::is_base_of_v, "Input iterator should be a random access iterator."); // `int64_t` instead of arch-dependant `long int` - return static_cast(std::distance(begin, end)); + return static_cast(std::distance(begin, end)); +} + +/** + * @brief C++17 constexpr backport of `std::lower_bound`. + * + * @tparam ForwardIt Type of input iterator + * @tparam T Type of `value` + * + * @param first Iterator defining the start of the range to examine + * @param last Iterator defining the start of the range to examine + * @param value Value to compare the elements to + * + * @return Iterator pointing to the first element in the range [first, last) that does not satisfy + * element < value + */ +template +constexpr ForwardIt lower_bound(ForwardIt first, ForwardIt last, const T& value) +{ + using diff_type = typename std::iterator_traits::difference_type; + + ForwardIt it{}; + diff_type count = std::distance(first, last); + diff_type step{}; + + while (count > 0) { + it = first; + step = count / 2; + std::advance(it, step); + + if (static_cast(*it) < value) { + first = ++it; + count -= step + 1; + } else + count = step; + } + + return first; } } // namespace detail diff --git a/include/cuco/extent.cuh b/include/cuco/extent.cuh new file mode 100644 index 000000000..b825188ed --- /dev/null +++ b/include/cuco/extent.cuh @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace cuco { +namespace experimental { +static constexpr std::size_t dynamic_extent = static_cast(-1); + +/** + * @brief Static extent class. + * + * @tparam SizeType Size type + * @tparam N Extent + */ +template +struct extent { + using value_type = SizeType; ///< Extent value type + + constexpr extent() = default; + + /// Constructs from `SizeType` + __host__ __device__ constexpr explicit extent(SizeType) noexcept {} + + /** + * @brief Conversion to value_type. + * + * @return Extent size + */ + __host__ __device__ constexpr operator value_type() const noexcept { return N; } + + /** + * @brief Multiplies the current extent with the given `Value`. + * + * @tparam Value The input value to multiply with + * + * @return Resulting static extent + */ + template + __host__ __device__ constexpr auto multiply() const noexcept + { + return extent{}; + } +}; + +/** + * @brief Dynamic extent class. + * + * @tparam SizeType Size type + */ +template +struct extent { + using value_type = SizeType; ///< Extent value type + + /** + * @brief Constructs extent from a given `size`. + * + * @param size The extent size + */ + __host__ __device__ constexpr extent(SizeType size) noexcept : value_{size} {} + + /** + * @brief Conversion to value_type. + * + * @return Extent size + */ + __host__ __device__ constexpr operator value_type() const noexcept { return value_; } + + /** + * @brief Multiplies the current extent with the given `Value`. 
+ * + * @tparam Value The input value to multiply with + * + * @return Resulting extent + */ + template + __host__ __device__ constexpr auto multiply() const noexcept + { + return extent{Value * value_}; + } + + private: + value_type value_; ///< Extent value +}; + +/** + * @brief Computes valid extent based on given parameters. + * + * @note The actual capacity of a container (map/set) should be exclusively determined by the return + * value of this utility since the output depends on the requested low-bound size, the probing + * scheme, and the storage. This utility is used internally during container constructions while for + * container ref constructions, it would be users' responsibility to use this function to determine + * the input size of the ref. + * + * @tparam CGSize Number of elements handled per CG + * @tparam WindowSize Number of elements handled per Window + * @tparam SizeType Size type + * @tparam N Extent + * + * @throw If the input extent is invalid + * + * @return Resulting valid extent + */ +template +[[nodiscard]] auto constexpr make_valid_extent(extent ext) +{ + auto constexpr max_prime = cuco::detail::primes.back(); + auto constexpr max_value = + (static_cast(std::numeric_limits::max()) < max_prime) + ? 
std::numeric_limits::max() + : static_cast(max_prime); + auto const size = SDIV(ext, CGSize * WindowSize); + if (size <= 0 or size > max_value) { CUCO_FAIL("Invalid input extent"); } + + if constexpr (N == dynamic_extent) { + return extent{static_cast( + *cuco::detail::lower_bound( + cuco::detail::primes.begin(), cuco::detail::primes.end(), static_cast(size)) * + CGSize)}; + } + if constexpr (N != dynamic_extent) { + return extent(*cuco::detail::lower_bound(cuco::detail::primes.begin(), + cuco::detail::primes.end(), + static_cast(size)) * + CGSize)>{}; + } +} + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp new file mode 100644 index 000000000..abf66e0bd --- /dev/null +++ b/include/cuco/operator.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +namespace cuco { +namespace experimental { +inline namespace op { +// TODO enum class of int32_t instead of struct +// https://github.com/NVIDIA/cuCollections/issues/239 +/** + * @brief `insert` operator tag + */ +struct insert_tag { +} inline constexpr insert; + +/** + * @brief `contains` operator tag + */ +struct contains_tag { +} inline constexpr contains; + +} // namespace op +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh new file mode 100644 index 000000000..0880ee97b --- /dev/null +++ b/include/cuco/probing_scheme.cuh @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +/** + * @brief Public linear probing scheme class. + * + * @note Linear probing is efficient when few collisions are present, e.g., low occupancy or low + * multiplicity. + * + * @note `Hash` should be callable object type. 
+ * + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash Unary callable type + */ +template +class linear_probing : private detail::probing_scheme_base { + public: + using probing_scheme_base_type = + detail::probing_scheme_base; ///< The base probe scheme type + using probing_scheme_base_type::cg_size; + + /** + *@brief Constructs linear probing scheme with the hasher callable. + * + * @param hash Hasher + */ + __host__ __device__ constexpr linear_probing(Hash const& hash = {}); + + /** + * @brief Operator to return a probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()(ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + /** + * @brief Operator to return a CG-based probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param g the Cooperative Group to generate probing iterator + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + private: + Hash hash_; +}; + +/** + * @brief Public double hashing scheme class. + * + * @note Default probing scheme for cuco data structures. It shows superior performance over linear + * probing especially when dealing with high multiplicty and/or high occupancy use cases. + * + * @note `Hash1` and `Hash2` should be callable object type. + * + * @note `Hash2` needs to be able to construct from an integer value to avoid secondary clustering. 
+ * + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash1 Unary callable type + * @tparam Hash2 Unary callable type + */ +template +class double_hashing : private detail::probing_scheme_base { + public: + using probing_scheme_base_type = + detail::probing_scheme_base; ///< The base probe scheme type + using probing_scheme_base_type::cg_size; + + /** + *@brief Constructs double hashing probing scheme with the two hasher callables. + * + * @param hash1 First hasher + * @param hash2 Second hasher + */ + __host__ __device__ constexpr double_hashing(Hash1 const& hash1 = {}, Hash2 const& hash2 = {1}); + + /** + * @brief Operator to return a probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()(ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + /** + * @brief Operator to return a CG-based probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param g the Cooperative Group to generate probing iterator + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + private: + Hash1 hash1_; + Hash2 hash2_; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh new file mode 100644 index 000000000..1e6a3f589 --- /dev/null +++ b/include/cuco/static_set.cuh @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#if defined(CUCO_HAS_CUDA_BARRIER) +#include +#endif + +#include +#include + +namespace cuco { +namespace experimental { +/** + * @brief A GPU-accelerated, unordered, associative container of unique keys. + * + * The `static_set` supports two types of operations: + * - Host-side "bulk" operations + * - Device-side "singular" operations + * + * The host-side bulk operations include `insert`, `contains`, etc. These APIs should be used when + * there are a large number of keys to modify or lookup. For example, given a range of keys + * specified by device-accessible iterators, the bulk `insert` function will insert all keys into + * the set. + * + * The singular device-side operations allow individual threads (or cooperative groups) to perform + * independent modify or lookup operations from device code. These operations are accessed through + * non-owning, trivially copyable reference types (or "ref"). User can combine any arbitrary + * operators (see options in `include/cuco/operator.hpp`) when creating the ref. Concurrent modify + * and lookup will be supported if both kinds of operators are specified during the ref + * construction. + * + * @note Allows constant time concurrent modify or lookup operations from threads in device code. 
+ * @note cuCollections data stuctures always place the slot keys on the left-hand side when invoking + * the key comparison predicate, i.e., `pred(slot_key, query_key)`. Order-sensitive `KeyEqual` + * should be used with caution. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` + * @tparam Extent Data structure size type + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for choices) + * @tparam Allocator Type of allocator used for device storage + * @tparam Storage Slot window storage type + */ + +template , + cuda::thread_scope Scope = cuda::thread_scope_device, + class KeyEqual = thrust::equal_to, + class ProbingScheme = experimental::double_hashing<1, // CG size + cuco::murmurhash3_32, + cuco::murmurhash3_32>, + class Allocator = cuco::cuda_allocator, + class Storage = cuco::experimental::aow_storage<2>> +class static_set { + static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + + static_assert( + cuco::is_bitwise_comparable_v, + "Key type must have unique object representations or have been explicitly declared as safe for " + "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + + static_assert( + std::is_base_of_v, + ProbingScheme>, + "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); + + 
 public:
+ static constexpr auto cg_size = ProbingScheme::cg_size; ///< CG size used for probing
+ static constexpr auto window_size = Storage::window_size; ///< Window size used for probing
+ static constexpr auto thread_scope = Scope; ///< CUDA thread scope
+
+ using key_type = Key; ///< Key type
+ using value_type = Key; ///< Key type
+ /// Extent type
+ using extent_type = decltype(make_valid_extent(std::declval()));
+ using size_type = typename extent_type::value_type; ///< Size type
+ using key_equal = KeyEqual; ///< Key equality comparator type
+ using allocator_type = Allocator; ///< Allocator type
+ using storage_type =
+ detail::storage; ///< Storage type
+
+ using storage_ref_type = typename storage_type::ref_type; ///< Non-owning window storage ref type
+ using probing_scheme_type = ProbingScheme; ///< Probe scheme type
+ template
+ using ref_type =
+ cuco::experimental::static_set_ref; ///< Non-owning container ref type
+
+ static_set(static_set const&) = delete;
+ static_set& operator=(static_set const&) = delete;
+
+ static_set(static_set&&) = default; ///< Move constructor
+
+ /**
+ * @brief Replaces the contents of the container with another container.
+ *
+ * @return Reference of the current set object
+ */
+ static_set& operator=(static_set&&) = default;
+ ~static_set() = default;
+
+ /**
+ * @brief Constructs a statically-sized set with the specified initial capacity, sentinel values
+ * and CUDA stream.
+ *
+ * The actual set capacity depends on the given `capacity`, the probing scheme, CG size, and the
+ * window size and it's computed via `make_valid_extent` factory. Insert operations will not
+ * automatically grow the set. Attempting to insert more unique keys than the capacity of the set
+ * results in undefined behavior.
+ *
+ * The `empty_key_sentinel` is reserved and behavior is undefined when attempting to insert
+ * this sentinel value.
+ * + * @param capacity The requested lower-bound set size + * @param empty_key_sentinel The reserved key value for empty slots + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the map + */ + constexpr static_set(Extent capacity, + empty_key empty_key_sentinel, + KeyEqual pred = {}, + ProbingScheme const& probing_scheme = {}, + Allocator const& alloc = {}, + cudaStream_t stream = nullptr); + + /** + * @brief Inserts all keys in the range `[first, last)` and returns the number of successful + * insertions. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `insert_async`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_set::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + * + * @return Number of successfully inserted keys + */ + template + size_type insert(InputIt first, InputIt last, cudaStream_t stream = nullptr); + + /** + * @brief Asynchonously inserts all keys in the range `[first, last)`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_set::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_async(InputIt first, InputIt last, cudaStream_t stream = nullptr); + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the set. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_async`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains(InputIt first, + InputIt last, + OutputIt output_begin, + cudaStream_t stream = nullptr) const; + + /** + * @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in + * the set. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_async(InputIt first, + InputIt last, + OutputIt output_begin, + cudaStream_t stream = nullptr) const; + + /** + * @brief Gets the number of elements in the container. + * + * @note This function synchronizes the given stream. + * + * @param stream CUDA stream used to get the number of inserted elements + * @return The number of elements in the container + */ + [[nodiscard]] size_type size(cudaStream_t stream = nullptr) const; + + /** + * @brief Gets the maximum number of elements the hash map can hold. + * + * @return The maximum number of elements the hash map can hold + */ + [[nodiscard]] constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Get device ref with operators. 
+ * + * @tparam Operators Set of `cuco::op` to be provided by the ref + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return Device ref of the current `static_set` object + */ + template + [[nodiscard]] auto ref(Operators... ops) const noexcept; + + private: + key_type empty_key_sentinel_; ///< Key value that represents an empty slot + key_equal predicate_; ///< Key equality binary predicate + probing_scheme_type probing_scheme_; ///< Probing scheme + allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage + storage_type storage_; ///< Slot window storage +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh new file mode 100644 index 000000000..7c5ae13dc --- /dev/null +++ b/include/cuco/static_set_ref.cuh @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +namespace cuco { +namespace experimental { + +/** + * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary + * operations defined in `include/cuco/operator.hpp` + * + * @note Concurrent modify and lookup will be supported if both kinds of operators are specified + * during the ref construction. 
+ * @note cuCollections data stuctures always place the slot keys on the left-hand + * side when invoking the key comparison predicate. + * @note Ref types are trivially-copyable and are intended to be passed by value. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for options) + * @tparam StorageRef Storage ref type + * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` + */ +template +class static_set_ref + : public detail::operator_impl< + Operators, + static_set_ref>... 
{ + static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + + static_assert( + cuco::is_bitwise_comparable_v, + "Key type must have unique object representations or have been explicitly declared as safe for " + "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + + static_assert( + std::is_base_of_v, + ProbingScheme>, + "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); + + public: + using key_type = Key; ///< Key Type + using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using storage_ref_type = StorageRef; ///< Type of storage ref + using window_type = typename storage_ref_type::window_type; ///< Window type + using value_type = typename storage_ref_type::value_type; ///< Storage element type + using extent_type = typename storage_ref_type::extent_type; ///< Extent type + using size_type = typename storage_ref_type::size_type; ///< Probing scheme size type + using key_equal = KeyEqual; ///< Type of key equality binary callable + + static constexpr auto cg_size = probing_scheme_type::cg_size; ///< Cooperative group size + static constexpr auto window_size = + storage_ref_type::window_size; ///< Number of elements handled per window + static constexpr auto thread_scope = Scope; ///< Thread scope + + /** + * @brief Constructs static_set_ref. + * + * @param empty_key_sentinel Sentinel indicating empty key + * @param predicate Key equality binary callable + * @param probing_scheme Probing scheme + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr static_set_ref( + cuco::empty_key empty_key_sentinel, + key_equal const& predicate, + probing_scheme_type const& probing_scheme, + storage_ref_type storage_ref) noexcept; + + /** + * @brief Gets the maximum number of elements the container can hold. 
+ * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] __host__ __device__ constexpr key_type empty_key_sentinel() const noexcept; + + private: + cuco::empty_key empty_key_sentinel_; ///< Empty key sentinel + detail::equal_wrapper predicate_; ///< Key equality binary callable + probing_scheme_type probing_scheme_; ///< Probing scheme + storage_ref_type storage_ref_; ///< Slot storage ref + + // Mixins need to be friends with this class in order to access private members + template + friend class detail::operator_impl; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/storage.cuh b/include/cuco/storage.cuh new file mode 100644 index 000000000..969b49f37 --- /dev/null +++ b/include/cuco/storage.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +/** + * @brief Public Array of slot Windows storage class. + * + * The window size defines the workload granularity for each CUDA thread, i.e., how many slots a + * thread would concurrently operate on when performing modify or lookup operations. 
cuCollections + * uses the AoW storage to supersede the raw flat slot storage due to its superior granularity + * control: When window size equals one, AoW performs the same as the flat storage. If the + * underlying operation is more memory bandwidth bound, e.g., high occupancy multimap operations, a + * larger window size can reduce the length of probing sequences thus improve runtime performance. + * + * @tparam WindowSize Number of elements per window storage + */ +template +class aow_storage { + public: + /// Number of elements per window storage + static constexpr int32_t window_size = WindowSize; + + /// Type of implementation details + template + using impl = detail::aow_storage; +}; + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/utility/traits.hpp b/include/cuco/utility/traits.hpp index 445a40daf..78e8dabcb 100644 --- a/include/cuco/utility/traits.hpp +++ b/include/cuco/utility/traits.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -58,4 +58,10 @@ inline constexpr bool is_bitwise_comparable_v = is_bitwise_comparable::value; }; \ } +template +inline constexpr bool dependent_bool_value = value; + +template +inline constexpr bool dependent_false = dependent_bool_value; + } // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 52c4cd9db..16f6abacf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -40,13 +40,28 @@ function(ConfigureTest TEST_NAME) RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests") target_compile_options(${TEST_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) - catch_discover_tests(${TEST_NAME}) + catch_discover_tests(${TEST_NAME} EXTRA_ARGS --allow-running-no-tests) endfunction(ConfigureTest) ################################################################################################### ### test sources ################################################################################## ################################################################################################### +################################################################################################### +# - utility tests --------------------------------------------------------------------------------- +ConfigureTest(UTILITY_TEST + utility/extent_test.cu + utility/storage_test.cu) + +################################################################################################### +# - static_set tests ------------------------------------------------------------------------------ +ConfigureTest(STATIC_SET_TEST + static_set/capacity_test.cu + static_set/heterogeneous_lookup_test.cu + static_set/large_input_test.cu + static_set/size_test.cu + static_set/unique_sequence_test.cu) + ################################################################################################### # - static_map tests 
------------------------------------------------------------------------------ ConfigureTest(STATIC_MAP_TEST diff --git a/tests/static_set/capacity_test.cu b/tests/static_set/capacity_test.cu new file mode 100644 index 000000000..e4d3e146a --- /dev/null +++ b/tests/static_set/capacity_test.cu @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +TEST_CASE("Static set capacity", "") +{ + constexpr std::size_t num_keys{400}; + using Key = int32_t; + using ProbeT = + cuco::experimental::double_hashing<1, cuco::murmurhash3_32, cuco::murmurhash3_32>; + using Equal = thrust::equal_to; + using AllocatorT = cuco::cuda_allocator; + using StorageT = cuco::experimental::aow_storage<2>; + + SECTION("Static extent must be evaluated at compile time.") + { + auto constexpr gold_capacity = 422; // 211 x 2 + + using extent_type = cuco::experimental::extent; + cuco::experimental:: + static_set + set{extent_type{}, + cuco::empty_key{-1}, + {}, + ProbeT{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}, + {}}; + auto const capacity = set.capacity(); + STATIC_REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + STATIC_REQUIRE(ref_capacity == gold_capacity); + } + + SECTION("Dynamic extent is evaluated at run time.") + { + auto constexpr gold_capacity = 422; // 211 x 2 + + using extent_type = 
cuco::experimental::extent; + cuco::experimental:: + static_set + set{num_keys, + cuco::empty_key{-1}, + {}, + ProbeT{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}, + {}}; + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } + + SECTION("Static extent must be evaluated at compile time.") + { + auto constexpr gold_capacity = 412; // 103 x 2 x 2 + + using extent_type = cuco::experimental::extent; + using probe = cuco::experimental::linear_probing<2, cuco::murmurhash3_32>; + auto set = cuco::experimental:: + static_set{ + extent_type{}, cuco::empty_key{-1}, {}, probe{cuco::murmurhash3_32{}}, {}}; + + REQUIRE(set.capacity() == gold_capacity); + + auto const capacity = set.capacity(); + STATIC_REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + STATIC_REQUIRE(ref_capacity == gold_capacity); + } + + SECTION("Dynamic extent is evaluated at run time.") + { + auto constexpr gold_capacity = 412; // 103 x 2 x 2 + + using probe = cuco::experimental::linear_probing<2, cuco::murmurhash3_32>; + auto set = cuco::experimental::static_set, + cuda::thread_scope_device, + Equal, + probe, + AllocatorT, + StorageT>{ + num_keys, cuco::empty_key{-1}, {}, probe{cuco::murmurhash3_32{}}, {}}; + + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } +} diff --git a/tests/static_set/heterogeneous_lookup_test.cu b/tests/static_set/heterogeneous_lookup_test.cu new file mode 100644 index 000000000..cbc0efac3 --- /dev/null +++ b/tests/static_set/heterogeneous_lookup_test.cu @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +// insert key type +template +struct key_pair { + T a; + T b; + + __host__ __device__ key_pair() {} + __host__ __device__ key_pair(T x) : a{x}, b{x} {} + + // Device equality operator is mandatory due to libcudacxx bug: + // https://github.com/NVIDIA/libcudacxx/issues/223 + __device__ bool operator==(key_pair const& other) const { return a == other.a and b == other.b; } +}; + +// probe key type +template +struct key_triplet { + T a; + T b; + T c; + + __host__ __device__ key_triplet() {} + __host__ __device__ key_triplet(T x) : a{x}, b{x}, c{x} {} + + // Device equality operator is mandatory due to libcudacxx bug: + // https://github.com/NVIDIA/libcudacxx/issues/223 + __device__ bool operator==(key_triplet const& other) const + { + return a == other.a and b == other.b and c == other.c; + } +}; + +// User-defined device hasher +struct custom_hasher { + template + __device__ uint32_t operator()(CustomKey const& k) const + { + return thrust::raw_reference_cast(k).a; + }; +}; + +// User-defined device key equality +struct custom_key_equal { + template + __device__ bool operator()(LHS const& lhs, RHS const& rhs) const + { + return thrust::raw_reference_cast(lhs).a == thrust::raw_reference_cast(rhs).a; + } +}; + +TEMPLATE_TEST_CASE_SIG( + "Heterogeneous lookup", "", ((typename T, int 
CGSize), T, CGSize), (int32_t, 1), (int32_t, 2)) +{ + using Key = key_pair; + using ProbeKey = key_triplet; + using probe_type = cuco::experimental::double_hashing; + + auto const sentinel_key = Key{-1}; + + constexpr std::size_t num = 100; + constexpr std::size_t capacity = num * 2; + auto const probe = probe_type{custom_hasher{}, custom_hasher{}}; + auto my_set = cuco::experimental::static_set, + cuda::thread_scope_device, + custom_key_equal, + probe_type>{ + capacity, cuco::empty_key{sentinel_key}, custom_key_equal{}, probe}; + + auto insert_pairs = thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return Key{i}; }); + auto probe_keys = thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return ProbeKey(i); }); + + SECTION("All inserted keys should be contained") + { + thrust::device_vector contained(num); + my_set.insert(insert_pairs, insert_pairs + num); + my_set.contains(probe_keys, probe_keys + num, contained.begin()); + REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), thrust::identity{})); + } + + SECTION("Non-inserted keys should not be contained") + { + thrust::device_vector contained(num); + my_set.contains(probe_keys, probe_keys + num, contained.begin()); + REQUIRE(cuco::test::none_of(contained.begin(), contained.end(), thrust::identity{})); + } +} diff --git a/tests/static_set/large_input_test.cu b/tests/static_set/large_input_test.cu new file mode 100644 index 000000000..6f4e5803b --- /dev/null +++ b/tests/static_set/large_input_test.cu @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +template +__inline__ void test_unique_sequence(Set& set, bool* res_begin, std::size_t num_keys) +{ + using Key = typename Set::key_type; + + auto const keys_begin = thrust::counting_iterator(0); + auto const keys_end = thrust::counting_iterator(num_keys); + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(set.size() == 0); + + set.contains(keys_begin, keys_end, res_begin); + REQUIRE(cuco::test::none_of(res_begin, res_begin + num_keys, thrust::identity{})); + } + + set.insert(keys_begin, keys_end); + REQUIRE(set.size() == num_keys); + + SECTION("All inserted key/value pairs should be contained.") + { + set.contains(keys_begin, keys_end, res_begin); + REQUIRE(cuco::test::all_of(res_begin, res_begin + num_keys, thrust::identity{})); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Large input", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2)) +{ + constexpr std::size_t num_keys{1'200'000'000}; + + using extent_type = cuco::experimental::extent; + using probe = cuco::experimental:: + double_hashing, cuco::murmurhash3_32>; + + try { + auto set = cuco::experimental:: + static_set, probe>{ + num_keys * 2, + cuco::empty_key{-1}, + 
thrust::equal_to{}, + probe{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}}; + + thrust::device_vector d_contained(num_keys); + test_unique_sequence(set, d_contained.data().get(), num_keys); + } catch (cuco::cuda_error&) { + SKIP("Out of memory"); + } catch (std::bad_alloc&) { + SKIP("Out of memory"); + } +} diff --git a/tests/static_set/size_test.cu b/tests/static_set/size_test.cu new file mode 100644 index 000000000..05b1f4f8b --- /dev/null +++ b/tests/static_set/size_test.cu @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +TEST_CASE("Size computation", "") +{ + constexpr std::size_t num_keys{400}; + + cuco::experimental::static_set set{cuco::experimental::extent{400}, + cuco::empty_key{-1}}; + + thrust::device_vector d_keys(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + auto const num_successes = set.insert(d_keys.begin(), d_keys.end()); + + auto const size = set.size(); + + REQUIRE(size == num_keys); + REQUIRE(num_successes == num_keys); +} diff --git a/tests/static_set/unique_sequence_test.cu b/tests/static_set/unique_sequence_test.cu new file mode 100644 index 000000000..c2eab7eff --- /dev/null +++ b/tests/static_set/unique_sequence_test.cu @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +template +__inline__ void test_unique_sequence(Set& set, std::size_t num_keys) +{ + using Key = typename Set::key_type; + + thrust::device_vector d_keys(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + auto key_begin = d_keys.begin(); + thrust::device_vector d_contained(num_keys); + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(set.size() == 0); + + set.contains(key_begin, key_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } + + set.insert(key_begin, key_begin + num_keys); + REQUIRE(set.size() == num_keys); + + SECTION("All inserted key/value pairs should be contained.") + { + set.contains(key_begin, key_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Unique sequence", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, 
cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr std::size_t num_keys{400}; + auto constexpr gold_capacity = CGSize == 1 ? 422 // 211 x 1 x 2 + : 412 // 103 x 2 x 2 + ; + + using extent_type = cuco::experimental::extent; + + if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { + using probe = cuco::experimental::linear_probing>; + auto set = cuco::experimental:: + static_set, probe>{ + num_keys, + cuco::empty_key{-1}, + thrust::equal_to{}, + probe{cuco::murmurhash3_32{}}}; + + REQUIRE(set.capacity() == gold_capacity); + + test_unique_sequence(set, num_keys); + } + + if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { + using probe = cuco::experimental:: + double_hashing, cuco::murmurhash3_32>; + auto set = cuco::experimental:: + static_set, probe>{ + num_keys, + cuco::empty_key{-1}, + thrust::equal_to{}, + probe{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}}; + + REQUIRE(set.capacity() == gold_capacity); + + test_unique_sequence(set, num_keys); + } +} diff --git a/tests/utility/extent_test.cu b/tests/utility/extent_test.cu new file mode 100644 index 000000000..2623a8ae5 --- /dev/null +++ b/tests/utility/extent_test.cu @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include + +TEMPLATE_TEST_CASE_SIG( + "Extent tests", "", ((typename SizeType), SizeType), (int32_t), (int64_t), (std::size_t)) +{ + SizeType constexpr num = 1234; + SizeType constexpr gold_reference = 314; // 157 x 2 + auto constexpr cg_size = 2; + auto constexpr window_size = 4; + + SECTION("Static extent must be evaluated at compile time.") + { + auto const size = cuco::experimental::extent{}; + STATIC_REQUIRE(num == size); + } + + SECTION("Dynamic extent is evaluated at run time.") + { + auto const size = cuco::experimental::extent(num); + REQUIRE(size == num); + } + + SECTION("Compute static valid extent at compile time.") + { + auto constexpr size = cuco::experimental::extent{}; + auto constexpr res = cuco::experimental::make_valid_extent(size); + STATIC_REQUIRE(gold_reference == res); + } + + SECTION("Compute dynamic valid extent at run time.") + { + auto const size = cuco::experimental::extent{num}; + auto const res = cuco::experimental::make_valid_extent(size); + REQUIRE(gold_reference == res); + } +} diff --git a/tests/utility/storage_test.cu b/tests/utility/storage_test.cu new file mode 100644 index 000000000..c82b5ab44 --- /dev/null +++ b/tests/utility/storage_test.cu @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +#include + +TEMPLATE_TEST_CASE_SIG("Storage tests", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t), + (int32_t, int64_t), + (int64_t, int64_t)) +{ + constexpr std::size_t size{1'000}; + constexpr int window_size{2}; + constexpr std::size_t gold_capacity{2'000}; + + using allocator_type = cuco::cuda_allocator; + auto allocator = allocator_type{}; + + SECTION("Allocate array of pairs with AoS storage.") + { + auto s = cuco::experimental::detail::aow_storage, + cuco::experimental::extent, + allocator_type>( + cuco::experimental::extent{size}, allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + REQUIRE(num_windows == size); + REQUIRE(capacity == gold_capacity); + } + + SECTION("Allocate array of pairs with AoS storage with static extent.") + { + using extent_type = cuco::experimental::extent; + auto s = cuco::experimental::detail:: + aow_storage, extent_type, allocator_type>(extent_type{}, + allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + STATIC_REQUIRE(num_windows == size); + STATIC_REQUIRE(capacity == gold_capacity); + } + + SECTION("Allocate array of keys with AoS storage.") + { + auto s = cuco::experimental::detail:: + aow_storage, allocator_type>( + cuco::experimental::extent{size}, allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + REQUIRE(num_windows == size); + REQUIRE(capacity == gold_capacity); + } + + SECTION("Allocate array of keys with AoS storage with static extent.") + { + using extent_type = cuco::experimental::extent; + auto s = cuco::experimental::detail::aow_storage( + extent_type{}, allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + STATIC_REQUIRE(num_windows == size); + STATIC_REQUIRE(capacity == gold_capacity); + } +}