From d625fca107f8250d67743b6a6a251c8c594409df Mon Sep 17 00:00:00 2001
From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com>
Date: Wed, 7 Aug 2024 16:07:05 -0400
Subject: [PATCH] Add `init` element overload to `insert_or_apply` (#555)

This PR adds a new overload to `insert_or_apply` which takes **init**
parameter that represents the identity element of the binary operator
`op` used for **apply** operation. This allows to do optimization by
skipping `wait_for_payload` in case **sentienl** value of the map equals
to the **init**.

---------

Co-authored-by: Yunsong Wang <yunsongw@nvidia.com>
---
 .../static_map/insert_or_apply_bench.cu       |   3 +-
 include/cuco/detail/static_map/helpers.cuh    | 115 +++++++
 include/cuco/detail/static_map/kernels.cuh    |  93 +++++-
 include/cuco/detail/static_map/static_map.inl | 101 +++---
 .../cuco/detail/static_map/static_map_ref.inl | 303 +++++++++++++++---
 include/cuco/operator.hpp                     |  10 -
 include/cuco/static_map.cuh                   |  55 ++++
 include/cuco/utility/reduction_functors.cuh   |  85 +++++
 tests/static_map/insert_or_apply_test.cu      | 103 +++---
 9 files changed, 714 insertions(+), 154 deletions(-)
 create mode 100644 include/cuco/detail/static_map/helpers.cuh
 create mode 100644 include/cuco/utility/reduction_functors.cuh

diff --git a/benchmarks/static_map/insert_or_apply_bench.cu b/benchmarks/static_map/insert_or_apply_bench.cu
index a9ec10662..4633a8b0f 100644
--- a/benchmarks/static_map/insert_or_apply_bench.cu
+++ b/benchmarks/static_map/insert_or_apply_bench.cu
@@ -19,6 +19,7 @@
 
 #include <cuco/static_map.cuh>
 #include <cuco/utility/key_generator.cuh>
+#include <cuco/utility/reduction_functors.cuh>
 
 #include <nvbench/nvbench.cuh>
 
@@ -60,7 +61,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_appl
   state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
     timer.start();
     map.insert_or_apply_async(
-      pairs.begin(), pairs.end(), cuco::op::reduce::sum, {launch.get_stream()});
+      pairs.begin(), pairs.end(), cuco::reduce::plus{}, {launch.get_stream()});
     timer.stop();
     map.clear_async({launch.get_stream()});
   });
diff --git a/include/cuco/detail/static_map/helpers.cuh b/include/cuco/detail/static_map/helpers.cuh
new file mode 100644
index 000000000..04d19f842
--- /dev/null
+++ b/include/cuco/detail/static_map/helpers.cuh
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuco/detail/static_map/kernels.cuh>
+#include <cuco/detail/utility/cuda.cuh>
+
+namespace cuco::static_map_ns::detail {
+
+/**
+ * @brief Dispatches to shared memory map kernel if `num_elements_per_thread > 2`, else
+ * fallbacks to global memory map kernel.
+ *
+ * @tparam HasInit Boolean to dispatch based on init parameter
+ * @tparam CGSize Number of threads in each CG
+ * @tparam Allocator Allocator type used to created shared_memory map
+ * @tparam InputIt Device accessible input iterator whose `value_type` is
+ * convertible to the `value_type` of the data structure
+ * @tparam Init Type of init value convertible to payload type
+ * @tparam Op Callable type used to peform `apply` operation.
+ * @tparam Ref Type of non-owning device ref allowing access to storage
+ *
+ * @param first Beginning of the sequence of input elements
+ * @param last End of the sequence of input elements
+ * @param init The init value of the `op`
+ * @param op Callable object to perform apply operation.
+ * @param ref Non-owning container device ref used to access the slot storage
+ * @param stream CUDA stream used for insert_or_apply operation
+ */
+template <bool HasInit,
+          int32_t CGSize,
+          typename Allocator,
+          typename InputIt,
+          typename Init,
+          typename Op,
+          typename Ref>
+void dispatch_insert_or_apply(
+  InputIt first, InputIt last, Init init, Op op, Ref ref, cuda::stream_ref stream)
+{
+  auto const num = cuco::detail::distance(first, last);
+  if (num == 0) { return; }
+
+  int32_t const default_grid_size = cuco::detail::grid_size(num, CGSize);
+
+  if constexpr (CGSize == 1) {
+    using shmem_size_type = int32_t;
+
+    int32_t constexpr shmem_block_size                = 1024;
+    shmem_size_type constexpr cardinality_threshold   = shmem_block_size;
+    shmem_size_type constexpr shared_map_num_elements = cardinality_threshold + shmem_block_size;
+    float constexpr load_factor                       = 0.7;
+    shmem_size_type constexpr shared_map_size =
+      static_cast<shmem_size_type>((1.0 / load_factor) * shared_map_num_elements);
+
+    using extent_type     = cuco::extent<shmem_size_type, shared_map_size>;
+    using shared_map_type = cuco::static_map<typename Ref::key_type,
+                                             typename Ref::mapped_type,
+                                             extent_type,
+                                             cuda::thread_scope_block,
+                                             typename Ref::key_equal,
+                                             typename Ref::probing_scheme_type,
+                                             Allocator,
+                                             cuco::storage<1>>;
+
+    using shared_map_ref_type    = typename shared_map_type::ref_type<>;
+    auto constexpr window_extent = cuco::make_window_extent<shared_map_ref_type>(extent_type{});
+
+    auto insert_or_apply_shmem_fn_ptr = insert_or_apply_shmem<HasInit,
+                                                              CGSize,
+                                                              shmem_block_size,
+                                                              shared_map_ref_type,
+                                                              InputIt,
+                                                              Init,
+                                                              Op,
+                                                              Ref>;
+
+    int32_t const max_op_grid_size =
+      cuco::detail::max_occupancy_grid_size(shmem_block_size, insert_or_apply_shmem_fn_ptr);
+
+    int32_t const shmem_default_grid_size =
+      cuco::detail::grid_size(num, CGSize, cuco::detail::default_stride(), shmem_block_size);
+
+    auto const shmem_grid_size         = std::min(shmem_default_grid_size, max_op_grid_size);
+    auto const num_elements_per_thread = num / (shmem_grid_size * shmem_block_size);
+
+    // use shared_memory only if each thread has atleast 3 elements to process
+    if (num_elements_per_thread > 2) {
+      insert_or_apply_shmem<HasInit, CGSize, shmem_block_size, shared_map_ref_type>
+        <<<shmem_grid_size, shmem_block_size, 0, stream.get()>>>(
+          first, num, init, op, ref, window_extent);
+    } else {
+      insert_or_apply<HasInit, CGSize, cuco::detail::default_block_size()>
+        <<<default_grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
+          first, num, init, op, ref);
+    }
+  } else {
+    insert_or_apply<HasInit, CGSize, cuco::detail::default_block_size()>
+      <<<default_grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
+        first, num, init, op, ref);
+  }
+}
+}  // namespace cuco::static_map_ns::detail
\ No newline at end of file
diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index cbf1cef06..c05d0b28b 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -77,47 +77,101 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_assign(InputIt first,
  *
  * @note Callable object to perform binary operation should be able to invoke as
  * Op(cuda::atomic_ref<T,Scope>, T>)
+ * @note If `HasInit` is `true` and if `init == empty_sentinel_value`, we directly
+ * `apply` the `op` instead of atomic store and then waiting for the payload to get materalized.
+ * This has potential speedups when insert strategy is not `packed_cas`.
  *
+ * @tparam HasInit Boolean to dispatch based on init parameter
  * @tparam CGSize Number of threads in each CG
  * @tparam BlockSize Number of threads in each block
  * @tparam InputIt Device accessible input iterator whose `value_type` is
  * convertible to the `value_type` of the data structure
+ * @tparam Init Type of init value convertible to payload type
  * @tparam Op Callable type used to peform `apply` operation.
  * @tparam Ref Type of non-owning device ref allowing access to storage
  *
  * @param first Beginning of the sequence of input elements
  * @param n Number of input elements
+ * @param init The init value of the op
  * @param op Callable object to perform apply operation.
  * @param ref Non-owning container device ref used to access the slot storage
  */
-template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Op, typename Ref>
-__global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op, Ref ref)
+template <bool HasInit,
+          int32_t CGSize,
+          int32_t BlockSize,
+          typename InputIt,
+          typename Init,
+          typename Op,
+          typename Ref>
+__global__ void insert_or_apply(
+  InputIt first, cuco::detail::index_type n, [[maybe_unused]] Init init, Op op, Ref ref)
 {
   auto const loop_stride = cuco::detail::grid_stride() / CGSize;
   auto idx               = cuco::detail::global_thread_id() / CGSize;
 
   while (idx < n) {
-    typename std::iterator_traits<InputIt>::value_type const& insert_pair = *(first + idx);
+    using value_type              = typename std::iterator_traits<InputIt>::value_type;
+    value_type const& insert_pair = *(first + idx);
     if constexpr (CGSize == 1) {
-      ref.insert_or_apply(insert_pair, op);
+      if constexpr (HasInit) {
+        ref.insert_or_apply(insert_pair, init, op);
+      } else {
+        ref.insert_or_apply(insert_pair, op);
+      }
     } else {
       auto const tile =
         cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
-      ref.insert_or_apply(tile, insert_pair, op);
+      if constexpr (HasInit) {
+        ref.insert_or_apply(tile, insert_pair, init, op);
+      } else {
+        ref.insert_or_apply(tile, insert_pair, op);
+      }
     }
     idx += loop_stride;
   }
 }
 
-template <int32_t CGSize,
+/**
+ * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to
+ * `k` already exists in the container, then binary operation is applied using `op` callable object
+ * on the existing value at slot and the element to insert. If the key does not exist, inserts the
+ * pair as if by insert.
+ *
+ * @note Callable object to perform binary operation should be able to invoke as
+ * Op(cuda::atomic_ref<T,Scope>, T>)
+ * @note If `HasInit` is `true` and if `init == empty_sentinel_value`, we directly
+ * `apply` the `op` instead of atomic store and then waiting for the payload to get materalized.
+ * This has potential speedups when insert strategy is not `packed_cas`.
+ *
+ * @tparam HasInit Boolean to dispatch based on init parameter
+ * @tparam CGSize Number of threads in each CG
+ * @tparam BlockSize Number of threads in each block
+ * @tparam SharedMapRefType The Shared Memory Map Ref Type
+ * @tparam InputIt Device accessible input iterator whose `value_type` is
+ * convertible to the `value_type` of the data structure
+ * @tparam Init Type of init value convertible to payload type
+ * @tparam Op Callable type used to peform `apply` operation.
+ * @tparam Ref Type of non-owning device ref allowing access to storage
+ *
+ * @param first Beginning of the sequence of input elements
+ * @param n Number of input elements
+ * @param init The init value of the op
+ * @param op Callable object to perform apply operation.
+ * @param ref Non-owning container device ref used to access the slot storage
+ * @param window_extent Window Extent used for shared memory map slot storage
+ */
+template <bool HasInit,
+          int32_t CGSize,
           int32_t BlockSize,
           class SharedMapRefType,
           class InputIt,
+          class Init,
           class Op,
           class Ref>
 CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
   InputIt first,
   cuco::detail::index_type n,
+  [[maybe_unused]] Init init,
   Op op,
   Ref ref,
   typename SharedMapRefType::extent_type window_extent)
@@ -157,18 +211,22 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
   block.sync();
 
   while ((idx - thread_idx / CGSize) < n) {
-    int32_t inserted          = 0;
-    int32_t local_cardinality = 0;
+    int32_t inserted         = 0;
+    int32_t warp_cardinality = 0;
     // insert-or-apply into the shared map first
     if (idx < n) {
       value_type const& insert_pair = *(first + idx);
-      inserted                      = shared_map_ref.insert_or_apply(insert_pair, op);
+      if constexpr (HasInit) {
+        inserted = shared_map_ref.insert_or_apply(insert_pair, init, op);
+      } else {
+        inserted = shared_map_ref.insert_or_apply(insert_pair, op);
+      }
     }
     if (idx - warp_thread_idx < n) {  // all threads in warp particpate
-      local_cardinality = cg::reduce(warp, inserted, cg::plus<int32_t>());
+      warp_cardinality = cg::reduce(warp, inserted, cg::plus<int32_t>());
     }
     if (warp_thread_idx == 0) {
-      block_cardinality.fetch_add(local_cardinality, cuda::memory_order_relaxed);
+      block_cardinality.fetch_add(warp_cardinality, cuda::memory_order_relaxed);
     }
     block.sync();
     if (block_cardinality > BlockSize) { break; }
@@ -180,7 +238,11 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
   while (window_idx < num_windows) {
     auto const slot = storage[window_idx][0];
     if (not cuco::detail::bitwise_compare(slot.first, ref.empty_key_sentinel())) {
-      ref.insert_or_apply(slot, op);
+      if constexpr (HasInit) {
+        ref.insert_or_apply(slot, init, op);
+      } else {
+        ref.insert_or_apply(slot, op);
+      }
     }
     window_idx += BlockSize;
   }
@@ -191,10 +253,13 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
     idx += loop_stride;
     while (idx < n) {
       value_type const& insert_pair = *(first + idx);
-      ref.insert_or_apply(insert_pair, op);
+      if constexpr (HasInit) {
+        ref.insert_or_apply(insert_pair, init, op);
+      } else {
+        ref.insert_or_apply(insert_pair, op);
+      }
       idx += loop_stride;
     }
   }
 }
-
 }  // namespace cuco::static_map_ns::detail
\ No newline at end of file
diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
index 8f7b58ab1..86b75507d 100644
--- a/include/cuco/detail/static_map/static_map.inl
+++ b/include/cuco/detail/static_map/static_map.inl
@@ -15,6 +15,7 @@
  */
 
 #include <cuco/detail/bitwise_compare.cuh>
+#include <cuco/detail/static_map/helpers.cuh>
 #include <cuco/detail/static_map/kernels.cuh>
 #include <cuco/detail/utility/cuda.hpp>
 #include <cuco/detail/utils.hpp>
@@ -304,6 +305,22 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
   stream.wait();
 }
 
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+template <typename InputIt, typename Init, typename Op>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  insert_or_apply(InputIt first, InputIt last, Init init, Op op, cuda::stream_ref stream)
+{
+  this->insert_or_apply_async(first, last, init, op, stream);
+  stream.wait();
+}
+
 template <class Key,
           class T,
           class Extent,
@@ -316,62 +333,36 @@ template <typename InputIt, typename Op>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
   insert_or_apply_async(InputIt first, InputIt last, Op op, cuda::stream_ref stream) noexcept
 {
-  auto const num = cuco::detail::distance(first, last);
-  if (num == 0) { return; }
+  auto constexpr has_init = false;
+  auto const init = this->empty_value_sentinel();  // use empty_sentinel as unused init value
+  static_map_ns::detail::dispatch_insert_or_apply<has_init, cg_size, Allocator>(
+    first, last, init, op, ref(op::insert_or_apply), stream);
+}
 
-  using shmem_size_type = int32_t;
-
-  int32_t const default_grid_size = cuco::detail::grid_size(num, cg_size);
-
-  if constexpr (cg_size == 1) {
-    int32_t constexpr shmem_block_size                = 1024;
-    shmem_size_type constexpr cardinality_threshold   = shmem_block_size;
-    shmem_size_type constexpr shared_map_num_elements = cardinality_threshold + shmem_block_size;
-    float constexpr load_factor                       = 0.7;
-    shmem_size_type constexpr shared_map_size =
-      static_cast<shmem_size_type>((1.0 / load_factor) * shared_map_num_elements);
-
-    using extent_type            = cuco::extent<shmem_size_type, shared_map_size>;
-    using shared_map_type        = cuco::static_map<Key,
-                                                    T,
-                                                    extent_type,
-                                                    cuda::thread_scope_block,
-                                                    KeyEqual,
-                                                    ProbingScheme,
-                                                    Allocator,
-                                                    cuco::storage<1>>;
-    using shared_map_ref_type    = typename shared_map_type::ref_type<>;
-    auto constexpr window_extent = cuco::make_window_extent<shared_map_ref_type>(extent_type{});
-
-    using ref_type = decltype(ref(op::insert_or_apply));
-
-    auto insert_or_apply_shmem_fn_ptr = static_map_ns::detail::
-      insert_or_apply_shmem<cg_size, shmem_block_size, shared_map_ref_type, InputIt, Op, ref_type>;
-
-    int32_t const max_op_grid_size =
-      cuco::detail::max_occupancy_grid_size(shmem_block_size, insert_or_apply_shmem_fn_ptr);
-
-    int32_t const shmem_default_grid_size =
-      cuco::detail::grid_size(num, cg_size, cuco::detail::default_stride(), shmem_block_size);
-
-    auto const shmem_grid_size         = std::min(shmem_default_grid_size, max_op_grid_size);
-    auto const num_elements_per_thread = num / (shmem_grid_size * shmem_block_size);
-
-    // use shared_memory only if each thread has atleast 3 elements to process
-    if (num_elements_per_thread > 2) {
-      static_map_ns::detail::insert_or_apply_shmem<cg_size, shmem_block_size, shared_map_ref_type>
-        <<<shmem_grid_size, shmem_block_size, 0, stream.get()>>>(
-          first, num, op, ref(op::insert_or_apply), window_extent);
-    } else {
-      static_map_ns::detail::insert_or_apply<cg_size, cuco::detail::default_block_size()>
-        <<<default_grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
-          first, num, op, ref(op::insert_or_apply));
-    }
-  } else {
-    static_map_ns::detail::insert_or_apply<cg_size, cuco::detail::default_block_size()>
-      <<<default_grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
-        first, num, op, ref(op::insert_or_apply));
-  }
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+template <typename InputIt, typename Init, typename Op>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  insert_or_apply_async(
+    InputIt first, InputIt last, Init init, Op op, cuda::stream_ref stream) noexcept
+{
+  using shared_map_type   = cuco::static_map<Key,
+                                             T,
+                                             int32_t,
+                                             cuda::thread_scope_block,
+                                             KeyEqual,
+                                             ProbingScheme,
+                                             Allocator,
+                                             cuco::storage<1>>;
+  auto constexpr has_init = true;
+  static_map_ns::detail::dispatch_insert_or_apply<has_init, cg_size, Allocator>(
+    first, last, init, op, ref(op::insert_or_apply), stream);
 }
 
 template <class Key,
diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index a9a8a3e1c..39ef375bc 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -16,11 +16,13 @@
 
 #pragma once
 
+#include <cuco/detail/bitwise_compare.cuh>
 #include <cuco/detail/equal_wrapper.cuh>
 #include <cuco/operator.hpp>
 
 #include <cuda/atomic>
 #include <cuda/std/functional>
+#include <cuda/std/type_traits>
 #include <thrust/tuple.h>
 
 #include <cooperative_groups.h>
@@ -587,6 +589,190 @@ class operator_impl<
       cuda::std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
       "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref<T, Scope>, T)`");
 
+    auto& ref_ = static_cast<ref_type&>(*this);
+
+    // directly dispatch to implementation if no init element is given
+    auto constexpr use_direct_apply = false;
+    return ref_.insert_or_apply_impl<use_direct_apply>(value, op);
+  }
+
+  /**
+   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies
+   * `Op` binary function to the mapped_type corresponding to the key `k` and the value `v`.
+   *
+   * @tparam Value Input type which is implicitly convertible to 'value_type'
+   * @tparam Init Type of init value convertible to payload type
+   * @tparam Op Callable type which is used as apply operation and can be
+   *   called with arguments as Op(cuda::atomic_ref<T, Scope>, T). Op strictly must
+   *   have this signature to atomically apply the operation.
+
+   * @param value The element to insert
+   * @param init The init value of the op
+   * @param op The callable object to perform binary operation between existing value at the slot
+   *  and the element to insert.
+   *
+   * @return Returns `true` if the given `value` is inserted successfully.
+   */
+  template <typename Value,
+            typename Init,
+            typename Op,
+            typename = cuda::std::enable_if_t<std::is_convertible_v<Value, value_type>>>
+  __device__ bool insert_or_apply(Value const& value, Init init, Op op)
+  {
+    static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
+
+    static_assert(
+      cuda::std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
+      "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref<T, Scope>, T)`");
+
+    auto& ref_ = static_cast<ref_type&>(*this);
+    return ref_.dispatch_insert_or_apply(value, init, op);
+  }
+
+  /**
+   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies
+   * `Op` binary function to the mapped_type corresponding to the key `k` and the value `v`.
+   *
+   * @tparam Value Input type which is implicitly convertible to 'value_type'
+   * @tparam Op Callable type which is used as apply operation and can be
+   *   called with arguments as Op(cuda::atomic_ref<T, Scope>, T). Op strictly must
+   *   have this signature to atomically apply the operation.
+   *
+   * @param group The Cooperative Group used to perform group insert
+   * @param value The element to insert
+   * @param op The callable object to perform binary operation between existing value at the slot
+   *  and the element to insert.
+   *
+   * @return Returns `true` if the given `value` is inserted successfully.
+   */
+
+  template <typename Value, typename Op>
+  __device__ bool insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+                                  Value const& value,
+                                  Op op)
+  {
+    static_assert(
+      cuda::std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
+      "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref<T, Scope>, T)`");
+    auto& ref_ = static_cast<ref_type&>(*this);
+
+    auto constexpr use_direct_apply = false;
+    return ref_.insert_or_apply_impl<use_direct_apply>(group, value, op);
+  }
+
+  /**
+   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies
+   * `Op` binary function to the mapped_type corresponding to the key `k` and the value `v`.
+   *
+   * @tparam Value Input type which is implicitly convertible to 'value_type'
+   * @tparam Init Type of init value convertible to payload type
+   * @tparam Op Callable type which is used as apply operation and can be
+   *   called with arguments as Op(cuda::atomic_ref<T, Scope>, T). Op strictly must
+   *   have this signature to atomically apply the operation.
+   *
+   * @param group The Cooperative Group used to perform group insert
+   * @param value The element to insert
+   * @param init The init value of the op
+   * @param op The callable object to perform binary operation between existing value at the slot
+   *  and the element to insert.
+   *
+   * @return Returns `true` if the given `value` is inserted successfully.
+   */
+  template <typename Value, typename Init, typename Op>
+  __device__ bool insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+                                  Value const& value,
+                                  Init init,
+                                  Op op)
+  {
+    auto& ref_ = static_cast<ref_type&>(*this);
+    static_assert(
+      cuda::std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
+      "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref<T, Scope>, T)`");
+
+    return ref_.dispatch_insert_or_apply(group, value, init, op);
+  }
+
+ private:
+  /**
+   * @brief dispatches `insert_or_apply_impl` based on condition `init == empty_value_sentinel`.
+   *
+   * @tparam Value Input type which is implicitly convertible to 'value_type'
+   * @tparam Init Type of init value convertible to payload type
+   * @tparam Op Callable type which is used as apply operation and can be
+   *   called with arguments as Op(cuda::atomic_ref<T, Scope>, T). Op strictly must
+   *   have this signature to atomically apply the operation.
+   *
+   * @param value The element to insert
+   * @param init The init value of the op
+   * @param op The callable object to perform binary operation between existing value at the slot
+   *  and the element to insert.
+   *
+   * @return Returns `true` if the given `value` is inserted successfully.
+   */
+  template <typename Value, typename Init, typename Op>
+  __device__ bool dispatch_insert_or_apply(Value const& value, Init init, Op op)
+  {
+    ref_type& ref_ = static_cast<ref_type&>(*this);
+    // if init equals sentinel value, then we can just `apply` op instead of write
+    if (cuco::detail::bitwise_compare(init, ref_.empty_value_sentinel())) {
+      return ref_.insert_or_apply_impl<true>(value, op);
+    } else {
+      return ref_.insert_or_apply_impl<false>(value, op);
+    }
+  }
+
+  /**
+   * @brief dispatches `insert_or_apply_impl` based on condition `init == empty_value_sentinel`.
+   *
+   * @tparam Value Input type which is implicitly convertible to 'value_type'
+   * @tparam Init Type of init value convertible to payload type
+   * @tparam Op Callable type which is used as apply operation and can be
+   *   called with arguments as Op(cuda::atomic_ref<T, Scope>, T). Op strictly must
+   *   have this signature to atomically apply the operation.
+   *
+   * @param group The Cooperative Group used to perform group insert
+   * @param value The element to insert
+   * @param init The init value of the op
+   * @param op The callable object to perform binary operation between existing value at the slot
+   *  and the element to insert.
+   */
+  template <typename Value, typename Init, typename Op>
+  __device__ bool dispatch_insert_or_apply(
+    cooperative_groups::thread_block_tile<cg_size> const& group,
+    Value const& value,
+    Init init,
+    Op op)
+  {
+    ref_type& ref_ = static_cast<ref_type&>(*this);
+    // if init equals sentinel value, then we can just `apply` op instead of write
+    if (cuco::detail::bitwise_compare(init, ref_.empty_value_sentinel())) {
+      return ref_.insert_or_apply_impl<true>(group, value, op);
+    } else {
+      return ref_.insert_or_apply_impl<false>(group, value, op);
+    }
+  }
+
+  /**
+   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies
+   * `Op` binary function to the mapped_type corresponding to the key `k` and the value `v`.
+   *
+   * @tparam UseDirectApply Boolean condition which enables direct apply `op` instead of
+   * wait_for_payload with an atomic_store on an empty slot.
+   * @tparam Value Input type which is implicitly convertible to 'value_type'
+   * @tparam Init Type of init value convertible to payload type
+   * @tparam Op Callable type which is used as apply operation and can be
+   *   called with arguments as Op(cuda::atomic_ref<T, Scope>, T). Op strictly must
+   *   have this signature to atomically apply the operation.
+   *
+   * @param value The element to insert
+   * @param op The callable object to perform binary operation between existing value at the slot
+   *  and the element to insert.
+   *
+   * @return Returns `true` if the given `value` is inserted successfully.
+   */
+  template <bool UseDirectApply, typename Value, typename Op>
+  __device__ bool insert_or_apply_impl(Value const& value, Op op)
+  {
     ref_type& ref_ = static_cast<ref_type&>(*this);
 
     auto const val         = ref_.impl_.heterogeneous_value(value);
@@ -594,7 +780,10 @@ class operator_impl<
     auto& probing_scheme   = ref_.impl_.probing_scheme();
     auto storage_ref       = ref_.impl_.storage_ref();
     auto probing_iter      = probing_scheme(key, storage_ref.window_extent());
-    auto const empty_value = ref_.impl_.empty_slot_sentinel().second;
+    auto const empty_value = ref_.empty_value_sentinel();
+
+    // wait for payload only when init != sentinel and insert strategy is not `packed_cas`
+    auto constexpr wait_for_payload = (not UseDirectApply) and (sizeof(value_type) > 8);
 
     while (true) {
       auto const window_slots = storage_ref[*probing_iter];
@@ -607,17 +796,19 @@ class operator_impl<
 
         // If the key is already in the container, update the payload and return
         if (eq_res == detail::equal_result::EQUAL) {
-          if constexpr (sizeof(value_type) > 8) {
+          // wait for payload only when performing insert operation
+          if constexpr (wait_for_payload) {
             ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
           }
           op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
           return false;
         }
         if (eq_res == detail::equal_result::AVAILABLE) {
-          switch (ref_.impl_.attempt_insert_stable(slot_ptr, slot_content, val)) {
+          switch (ref_.attempt_insert_or_apply<UseDirectApply>(slot_ptr, slot_content, val, op)) {
             case insert_result::SUCCESS: return true;
             case insert_result::DUPLICATE: {
-              if constexpr (sizeof(value_type) > 8) {
+              // wait for payload only when performing insert operation
+              if constexpr (wait_for_payload) {
                 ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
               }
               op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
@@ -631,42 +822,31 @@ class operator_impl<
     }
   }
 
-  template <typename Value>
-  __device__ bool insert_or_apply(Value const& value, cuco::op::reduce::sum_tag)
-  {
-    auto& ref_ = static_cast<ref_type&>(*this);
-    return ref_.insert_or_apply(value,
-                                [](cuda::atomic_ref<T, Scope> payload_ref, T const& payload) {
-                                  payload_ref.fetch_add(payload, cuda::memory_order_relaxed);
-                                });
-  }
-
   /**
    * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies
    * `Op` binary function to the mapped_type corresponding to the key `k` and the value `v`.
    *
+   * @tparam UseDirectApply Boolean condition which enables direct apply `op` instead of
+   * wait_for_payload with an atomic_store on an empty slot.
    * @tparam Value Input type which is implicitly convertible to 'value_type'
+   * @tparam Init Type of init value convertible to payload type
    * @tparam Op Callable type which is used as apply operation and can be
    *   called with arguments as Op(cuda::atomic_ref<T, Scope>, T). Op strictly must
    *   have this signature to atomically apply the operation.
    *
    * @param group The Cooperative Group used to perform group insert
    * @param value The element to insert
+   * @param init The init value of the op
    * @param op The callable object to perform binary operation between existing value at the slot
    *  and the element to insert.
    *
    * @return Returns `true` if the given `value` is inserted successfully.
    */
-
-  template <typename Value, typename Op>
-  __device__ bool insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
-                                  Value const& value,
-                                  Op op)
+  template <bool UseDirectApply, typename Value, typename Op>
+  __device__ bool insert_or_apply_impl(cooperative_groups::thread_block_tile<cg_size> const& group,
+                                       Value const& value,
+                                       Op op)
   {
-    static_assert(
-      cuda::std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
-      "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref<T, Scope>, T)`");
-
     ref_type& ref_ = static_cast<ref_type&>(*this);
 
     auto const val         = ref_.impl_.heterogeneous_value(value);
@@ -674,7 +854,10 @@ class operator_impl<
     auto& probing_scheme   = ref_.impl_.probing_scheme();
     auto storage_ref       = ref_.impl_.storage_ref();
     auto probing_iter      = probing_scheme(group, key, storage_ref.window_extent());
-    auto const empty_value = ref_.impl_.empty_slot_sentinel().second;
+    auto const empty_value = ref_.empty_value_sentinel();
+
+    // wait for payload only when init != sentinel and insert strategy is not `packed_cas`
+    auto constexpr wait_for_payload = (not UseDirectApply) and (sizeof(value_type) > 8);
 
     while (true) {
       auto const window_slots = storage_ref[*probing_iter];
@@ -697,7 +880,7 @@ class operator_impl<
       if (group_contains_equal) {
         auto const src_lane = __ffs(group_contains_equal) - 1;
         if (group.thread_rank() == src_lane) {
-          if constexpr (sizeof(value_type) > 8) {
+          if constexpr (wait_for_payload) {
             ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
           }
           op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
@@ -710,14 +893,15 @@ class operator_impl<
         auto const src_lane = __ffs(group_contains_available) - 1;
         auto const status   = [&, target_idx = intra_window_index]() {
           if (group.thread_rank() != src_lane) { return insert_result::CONTINUE; }
-          return ref_.impl_.attempt_insert_stable(slot_ptr, window_slots[target_idx], val);
+          return ref_.attempt_insert_or_apply<UseDirectApply>(
+            slot_ptr, window_slots[target_idx], val, op);
         }();
 
         switch (group.shfl(status, src_lane)) {
           case insert_result::SUCCESS: return true;
           case insert_result::DUPLICATE: {
             if (group.thread_rank() == src_lane) {
-              if constexpr (sizeof(value_type) > 8) {
+              if constexpr (wait_for_payload) {
                 ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
               }
               op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
@@ -732,16 +916,63 @@ class operator_impl<
     }
   }
 
-  template <typename Value>
-  __device__ bool insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
-                                  Value const& value,
-                                  cuco::op::reduce::sum_tag)
+  /**
+   * @brief Attempts to insert a key-value pair `{k, v}` if it's not present in the map. Otherwise,
+   * applies `Op` binary function to the mapped_type corresponding to the key `k` and the value `v`.
+   *
+   * @tparam UseDirectApply Boolean condition which enables direct apply `op` instead of
+   * wait_for_payload with an atomic_store on an empty slot when insert strategy is not
+   * `packed_cas`.
+   * @tparam Value Input type which is implicitly convertible to 'value_type'
+   * @tparam Op Callable type which is used as apply operation and can be
+   *   called with arguments as Op(cuda::atomic_ref<T, Scope>, T). Op strictly must
+   *   have this signature to atomically apply the operation.
+   *
+   * @param address Pointer to the slot in memory
+   * @param expected Element to compare against
+   * @param desired Element to insert
+   * @param op The callable object to perform binary operation between existing value at the slot
+   *  and the element to insert.
+   */
+  template <bool UseDirectApply, typename Value, typename Op>
+  [[nodiscard]] __device__ insert_result attempt_insert_or_apply(value_type* address,
+                                                                 value_type const& expected,
+                                                                 Value const& desired,
+                                                                 Op op) noexcept
   {
-    auto& ref_ = static_cast<ref_type&>(*this);
-    return ref_.insert_or_apply(
-      group, value, [](cuda::atomic_ref<T, Scope> payload_ref, T const& payload) {
-        payload_ref.fetch_add(payload, cuda::memory_order_relaxed);
-      });
+    ref_type& ref_ = static_cast<ref_type&>(*this);
+
+    if constexpr (sizeof(value_type) <= 8) {
+      return ref_.impl_.packed_cas(address, expected, desired);  // no need to wait for payload
+    } else {
+      using mapped_type = T;
+
+      cuda::atomic_ref<key_type, Scope> key_ref(address->first);
+      auto expected_key  = expected.first;
+      auto const success = key_ref.compare_exchange_strong(
+        expected_key, static_cast<key_type>(desired.first), cuda::memory_order_relaxed);
+
+      // if key success
+      if (success) {
+        cuda::atomic_ref<mapped_type, Scope> payload_ref(address->second);
+        // if init values == sentinel then directly apply the `op`
+        if constexpr (UseDirectApply) {
+          op(payload_ref, desired.second);
+        } else {
+          payload_ref.store(desired.second, cuda::memory_order_relaxed);
+        }
+        return insert_result::SUCCESS;
+      }
+
+      // Our key was already present in the slot, so our key is a duplicate
+      // Shouldn't use `predicate` operator directly since it includes a redundant bitwise compare
+      if (ref_.impl_.predicate_.equal_to(desired.first, expected_key) ==
+          detail::equal_result::EQUAL) {
+        return insert_result::DUPLICATE;
+      }
+
+      return insert_result::CONTINUE;
+    }
   }
 };
 
diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp
index cb4cfd4de..303124ec4 100644
--- a/include/cuco/operator.hpp
+++ b/include/cuco/operator.hpp
@@ -74,16 +74,6 @@ struct find_tag {
 struct for_each_tag {
 } inline constexpr for_each;  ///< `cuco::for_each` operator
 
-namespace reduce {
-
-/**
- * @brief `sum` reduction operator tag
- */
-struct sum_tag {
-} inline constexpr sum;
-
-}  // namespace reduce
-
 }  // namespace op
 }  // namespace cuco
 
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index 2559f6427..9c87e45a9 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -488,6 +488,34 @@ class static_map {
   template <typename InputIt, typename Op>
   void insert_or_apply(InputIt first, InputIt last, Op op, cuda::stream_ref stream = {});
 
+  /**
+   * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent
+   * to `k` already exists in the container, then binary operation is applied using `op` callable
+   * object on the existing value at slot and the element to insert. If the key does not exist,
+   * inserts the pair as if by insert.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `insert_or_apply_async`.
+   * @note Callable object to perform binary operation should be able to invoke as
+   *  Op(cuda::atomic_ref<T, Scope>, T>)
+   * @note There could be performance improvements if `init` value passed here equals to the
+   *  `sentinel value` of the map.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * static_map<K, V>::value_type></tt> is `true`
+   * @tparam Init Type of init value convertible to payload type
+   * @tparam Op Callable type used to peform `apply` operation.
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   * @param init The identity value of the op
+   * @param op Callable object to perform apply operation.
+   * @param stream CUDA stream used for insert
+   */
+  template <typename InputIt, typename Init, typename Op>
+  void insert_or_apply(InputIt first, InputIt last, Init init, Op op, cuda::stream_ref stream = {});
+
   /**
    * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent
    * to `k` already exists in the container, then binary operation is applied using `op` callable
@@ -513,6 +541,33 @@ class static_map {
                              Op op,
                              cuda::stream_ref stream = {}) noexcept;
 
+  /**
+   * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent
+   * to `k` already exists in the container, then binary operation is applied using `op` callable
+   * object on the existing value at slot and the element to insert. If the key does not exist,
+   * inserts the pair as if by insert.
+   *
+   * @note Callable object to perform binary operation should be able to invoke as
+   *  Op(cuda::atomic_ref<T, Scope>, T>)
+   * @note There could be performance improvements if `init` value passed here equals to the
+   *  `sentinel value` of the map.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * static_map<K, V>::value_type></tt> is `true`
+   * @tparam Init Type of init value convertible to payload type
+   * @tparam Op Callable type used to peform `apply` operation.
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   * @param init The identity value of the op
+   * @param op Callable object to perform apply operation.
+   * @param stream CUDA stream used for insert
+   */
+  template <typename InputIt, typename Init, typename Op>
+  void insert_or_apply_async(
+    InputIt first, InputIt last, Init init, Op op, cuda::stream_ref stream = {}) noexcept;
+
   /**
    * @brief Erases keys in the range `[first, last)`.
    *
diff --git a/include/cuco/utility/reduction_functors.cuh b/include/cuco/utility/reduction_functors.cuh
new file mode 100644
index 000000000..eaf6b03a0
--- /dev/null
+++ b/include/cuco/utility/reduction_functors.cuh
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuda/atomic>
+
+namespace cuco::reduce {
+
+/**
+ * @brief Device functor performing sum reduction, used with `insert-or-apply`
+ */
+struct plus {
+  /**
+   * @brief Performs atomic fetch_add on payload and the new value to be inserted
+   *
+   * @tparam T The payload type
+   * @tparam Scope The cuda::thread_scope used for atomic_ref
+   *
+   * @param payload_ref The atomic_ref pointing to payload part of the slot
+   * @param val The new value to be applied as reduction to the current value
+   * in the payload.
+   */
+  template <typename T, cuda::thread_scope Scope>
+  __device__ void operator()(cuda::atomic_ref<T, Scope> payload_ref, const T& val)
+  {
+    payload_ref.fetch_add(val, cuda::memory_order_relaxed);
+  }
+};
+
+/**
+ * @brief Device functor performing max reduction, used with `insert-or-apply`
+ */
+struct max {
+  /**
+   * @brief Performs atomic fetch_max on payload and the new value to be inserted
+   *
+   * @tparam T The payload type
+   * @tparam Scope The cuda::thread_scope used for atomic_ref
+   *
+   * @param payload_ref The atomic_ref pointing to payload part of the slot
+   * @param val The new value to be applied as reduction to the current value
+   * in the payload.
+   */
+  template <typename T, cuda::thread_scope Scope>
+  __device__ void operator()(cuda::atomic_ref<T, Scope> payload_ref, const T& val)
+  {
+    payload_ref.fetch_max(val, cuda::memory_order_relaxed);
+  }
+};
+
+/**
+ * @brief Device functor performing min reduction, used with `insert-or-apply`
+ */
+struct min {
+  /**
+   * @brief Performs atomic fetch_min on payload and the new value to be inserted
+   *
+   * @tparam T The payload type
+   * @tparam Scope The cuda::thread_scope used for atomic_ref
+   *
+   * @param payload_ref The atomic_ref pointing to payload part of the slot
+   * @param val The new value to be applied as reduction to the current value
+   * in the payload.
+   */
+  template <typename T, cuda::thread_scope Scope>
+  __device__ void operator()(cuda::atomic_ref<T, Scope> payload_ref, const T& val)
+  {
+    payload_ref.fetch_min(val, cuda::memory_order_relaxed);
+  }
+};
+
+}  // namespace cuco::reduce
\ No newline at end of file
diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
index e27013949..1b818e38c 100644
--- a/tests/static_map/insert_or_apply_test.cu
+++ b/tests/static_map/insert_or_apply_test.cu
@@ -17,6 +17,7 @@
 #include <test_utils.hpp>
 
 #include <cuco/static_map.cuh>
+#include <cuco/utility/reduction_functors.cuh>
 
 #include <cuda/atomic>
 #include <thrust/device_vector.h>
@@ -29,20 +30,11 @@
 #include <catch2/catch_template_test_macros.hpp>
 
 #include <cstdint>
-#include <iostream>
 
 using size_type = std::size_t;
 
-struct binary_plus_op {
-  template <typename T, cuda::thread_scope Scope>
-  __device__ void operator()(cuda::atomic_ref<T, Scope> lhs, T rhs)
-  {
-    lhs.fetch_add(rhs, cuda::memory_order_relaxed);
-  }
-};
-
-template <typename Map>
-void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys)
+template <bool HasInit, typename Map, typename Init>
+void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys, Init init)
 {
   REQUIRE((num_keys % num_unique_keys) == 0);
 
@@ -56,7 +48,12 @@ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_key
       return cuco::pair<Key, Value>{i % num_unique_keys, 1};
     }));
 
-  map.insert_or_apply(pairs_begin, pairs_begin + num_keys, binary_plus_op{});
+  auto constexpr plus_op = cuco::reduce::plus{};
+  if constexpr (HasInit) {
+    map.insert_or_apply(pairs_begin, pairs_begin + num_keys, init, plus_op);
+  } else {
+    map.insert_or_apply(pairs_begin, pairs_begin + num_keys, plus_op);
+  }
 
   REQUIRE(map.size() == num_unique_keys);
 
@@ -70,8 +67,8 @@ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_key
                             thrust::equal_to<Value>{}));
 }
 
-template <typename Map>
-void test_insert_or_apply_shmem(Map& map, size_type num_keys, size_type num_unique_keys)
+template <bool HasInit, typename Map, typename Init>
+void test_insert_or_apply_shmem(Map& map, size_type num_keys, size_type num_unique_keys, Init init)
 {
   REQUIRE((num_keys % num_unique_keys) == 0);
 
@@ -118,9 +115,14 @@ void test_insert_or_apply_shmem(Map& map, size_type num_keys, size_type num_uniq
   cuda::stream_ref stream{};
 
   // launch the shmem kernel
-  cuco::static_map_ns::detail::insert_or_apply_shmem<cg_size, shmem_block_size, shared_map_ref_type>
-    <<<shmem_grid_size, shmem_block_size, 0, stream.get()>>>(
-      pairs_begin, num_keys, binary_plus_op{}, map.ref(cuco::op::insert_or_apply), window_extent);
+  cuco::static_map_ns::detail::
+    insert_or_apply_shmem<HasInit, cg_size, shmem_block_size, shared_map_ref_type>
+    <<<shmem_grid_size, shmem_block_size, 0, stream.get()>>>(pairs_begin,
+                                                             num_keys,
+                                                             init,
+                                                             cuco::reduce::plus{},
+                                                             map.ref(cuco::op::insert_or_apply),
+                                                             window_extent);
 
   REQUIRE(map.size() == num_unique_keys);
 
@@ -176,18 +178,25 @@ TEMPLATE_TEST_CASE_SIG(
                                     cuco::cuda_allocator<std::byte>,
                                     cuco::storage<2>>;
 
-  SECTION("Sentinel equals to identity")
+  SECTION("sentinel equals init; has_init = true")
   {
     auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
-
-    test_insert_or_apply(map, num_keys, num_unique_keys);
+    test_insert_or_apply<true>(map, num_keys, num_unique_keys, static_cast<Value>(0));
   }
-
-  SECTION("Sentinel not equals to identity")
+  SECTION("sentinel equals init; has_init = false")
   {
-    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
-
-    test_insert_or_apply(map, num_keys, num_unique_keys);
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+    test_insert_or_apply<false>(map, num_keys, num_unique_keys, static_cast<Value>(0));
+  }
+  SECTION("sentinel not equals init; has_init = true")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+    test_insert_or_apply<true>(map, num_keys, num_unique_keys, static_cast<Value>(-1));
+  }
+  SECTION("sentinel not equals init; has_init = false")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+    test_insert_or_apply<false>(map, num_keys, num_unique_keys, static_cast<Value>(-1));
   }
 }
 
@@ -198,17 +207,35 @@ TEMPLATE_TEST_CASE_SIG(
 
   constexpr size_type num_keys = 100;
 
-  auto map = cuco::static_map<Key,
-                              Value,
-                              cuco::extent<size_type>,
-                              cuda::thread_scope_device,
-                              thrust::equal_to<Key>,
-                              cuco::linear_probing<1, cuco::murmurhash3_32<Key>>,
-                              cuco::cuda_allocator<std::byte>,
-                              cuco::storage<2>>{
-    num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
-
-  test_insert_or_apply(map, num_keys, num_keys);
+  using map_type = cuco::static_map<Key,
+                                    Value,
+                                    cuco::extent<size_type>,
+                                    cuda::thread_scope_device,
+                                    thrust::equal_to<Key>,
+                                    cuco::linear_probing<2, cuco::murmurhash3_32<Key>>,
+                                    cuco::cuda_allocator<std::byte>,
+                                    cuco::storage<2>>;
+
+  SECTION("sentinel equals init; has_init = true")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+    test_insert_or_apply<true>(map, num_keys, num_keys, static_cast<Value>(0));
+  }
+  SECTION("sentinel equals init; has_init = false")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+    test_insert_or_apply<false>(map, num_keys, num_keys, static_cast<Value>(0));
+  }
+  SECTION("sentinel not equals init; has_init = true")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+    test_insert_or_apply<true>(map, num_keys, num_keys, static_cast<Value>(-1));
+  }
+  SECTION("sentinel not equals init; has_init = false")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+    test_insert_or_apply<false>(map, num_keys, num_keys, static_cast<Value>(-1));
+  }
 }
 
 TEMPLATE_TEST_CASE_SIG(
@@ -231,7 +258,7 @@ TEMPLATE_TEST_CASE_SIG(
     constexpr size_type num_unique_keys = 100;
 
     auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
-    test_insert_or_apply_shmem(map, num_keys, num_unique_keys);
+    test_insert_or_apply_shmem<true>(map, num_keys, num_unique_keys, static_cast<Value>(0));
   }
 
   SECTION("unique keys")
@@ -240,6 +267,6 @@ TEMPLATE_TEST_CASE_SIG(
     constexpr size_type num_unique_keys = num_keys;
 
     auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
-    test_insert_or_apply_shmem(map, num_keys, num_unique_keys);
+    test_insert_or_apply_shmem<true>(map, num_keys, num_unique_keys, static_cast<Value>(0));
   }
 }