From e94f801fc71d8083c33035be9d7b6622e6786f67 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Mon, 18 Sep 2023 19:28:44 -0700
Subject: [PATCH 1/7] fix compile errors with cccl 2.x

---
 cpp/include/cugraph/utilities/misc_utils.cuh  |  5 +-
 .../cugraph/utilities/shuffle_comm.cuh        | 50 ++++++-----
 cpp/src/community/detail/mis_impl.cuh         | 16 ++--
 cpp/src/community/detail/refine_impl.cuh      | 37 +++++----
 cpp/src/detail/collect_local_vertex_values.cu |  5 +-
 cpp/src/generators/erdos_renyi_generator.cu   | 22 +++--
 cpp/src/generators/simple_generators.cu       | 37 +++++----
 ...r_v_random_select_transform_outgoing_e.cuh | 16 ++--
 ...v_transform_reduce_incoming_outgoing_e.cuh | 25 +++---
 .../prims/update_edge_src_dst_property.cuh    | 82 +++++++++++--------
 cpp/src/sampling/random_walks.cuh             | 41 ++++++----
 cpp/src/structure/graph_view_impl.cuh         |  6 +-
 .../renumber_sampled_edgelist_test.cu         | 23 +++---
 13 files changed, 214 insertions(+), 151 deletions(-)
diff --git a/cpp/include/cugraph/utilities/misc_utils.cuh b/cpp/include/cugraph/utilities/misc_utils.cuh
index a62e8ce85ec..0dad22cdc50 100644
--- a/cpp/include/cugraph/utilities/misc_utils.cuh
+++ b/cpp/include/cugraph/utilities/misc_utils.cuh
@@ -25,6 +25,8 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/optional.h>
 
+#include <cuda/functional>
+
 #include <optional>
 #include <tuple>
 #include <vector>
@@ -43,7 +45,8 @@ std::tuple<std::vector<vertex_t>, std::vector<edge_t>> compute_offset_aligned_ed
 {
   auto search_offset_first = thrust::make_transform_iterator(
     thrust::make_counting_iterator(size_t{1}),
-    [approx_edge_chunk_size] __device__(auto i) { return i * approx_edge_chunk_size; });
+    cuda::proclaim_return_type<size_t>(
+      [approx_edge_chunk_size] __device__(auto i) { return i * approx_edge_chunk_size; }));
   auto num_chunks = (num_edges + approx_edge_chunk_size - 1) / approx_edge_chunk_size;
 
   if (num_chunks > 1) {
diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh
index 6a260144324..22bc5b6b6f1 100644
--- a/cpp/include/cugraph/utilities/shuffle_comm.cuh
+++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh
@@ -40,6 +40,8 @@
 #include <thrust/tabulate.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 #include <algorithm>
 #include <numeric>
 #include <vector>
@@ -198,12 +200,13 @@ void multi_partition(ValueIterator value_first,
     value_last,
     thrust::make_zip_iterator(
       thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())),
-    [value_to_group_id_op, group_first, counts = counts.data()] __device__(auto value) {
-      auto group_id = value_to_group_id_op(value);
-      cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
-      return thrust::make_tuple(group_id,
-                                counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
-    });
+    cuda::proclaim_return_type<thrust::tuple<int, size_t>>(
+      [value_to_group_id_op, group_first, counts = counts.data()] __device__(auto value) {
+        auto group_id = value_to_group_id_op(value);
+        cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
+        return thrust::make_tuple(group_id,
+                                  counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
+      }));
 
   rmm::device_uvector<size_t> displacements(num_groups, stream_view);
   thrust::exclusive_scan(
@@ -246,17 +249,19 @@ void multi_partition(KeyIterator key_first,
   rmm::device_uvector<int> group_ids(num_keys, stream_view);
   rmm::device_uvector<size_t> intra_partition_offsets(num_keys, stream_view);
   thrust::fill(rmm::exec_policy(stream_view), counts.begin(), counts.end(), size_t{0});
-  thrust::transform(rmm::exec_policy(stream_view),
-                    key_first,
-                    key_last,
-                    thrust::make_zip_iterator(
-                      thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())),
-                    [key_to_group_id_op, group_first, counts = counts.data()] __device__(auto key) {
-                      auto group_id = key_to_group_id_op(key);
-                      cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
-                      return thrust::make_tuple(
-                        group_id, counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
-                    });
+  thrust::transform(
+    rmm::exec_policy(stream_view),
+    key_first,
+    key_last,
+    thrust::make_zip_iterator(
+      thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())),
+    cuda::proclaim_return_type<thrust::tuple<int, size_t>>(
+      [key_to_group_id_op, group_first, counts = counts.data()] __device__(auto key) {
+        auto group_id = key_to_group_id_op(key);
+        cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
+        return thrust::make_tuple(group_id,
+                                  counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
+      }));
 
   rmm::device_uvector<size_t> displacements(num_groups, stream_view);
   thrust::exclusive_scan(
@@ -492,7 +497,7 @@ std::tuple<KeyIterator, ValueIterator> mem_frugal_partition(
     key_first,
     key_last,
     key_group_id_less_t<typename thrust::iterator_traits<KeyIterator>::value_type, KeyToGroupIdOp>{
-      key_to_group_id_op, pivot}));
+        key_to_group_id_op, pivot}));
   auto second_size  = num_elements - first_size;
 
   auto tmp_key_buffer =
@@ -762,8 +767,9 @@ rmm::device_uvector<size_t> groupby_and_count(ValueIterator tx_value_first /* [I
                              stream_view);
 
   auto group_id_first = thrust::make_transform_iterator(
-    tx_value_first,
-    [value_to_group_id_op] __device__(auto value) { return value_to_group_id_op(value); });
+    tx_value_first, cuda::proclaim_return_type<int>([value_to_group_id_op] __device__(auto value) {
+      return value_to_group_id_op(value);
+    }));
   rmm::device_uvector<int> d_tx_dst_ranks(num_groups, stream_view);
   rmm::device_uvector<size_t> d_tx_value_counts(d_tx_dst_ranks.size(), stream_view);
   auto rank_count_pair_first = thrust::make_zip_iterator(
@@ -796,7 +802,9 @@ rmm::device_uvector<size_t> groupby_and_count(VertexIterator tx_key_first /* [IN
                              stream_view);
 
   auto group_id_first = thrust::make_transform_iterator(
-    tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); });
+    tx_key_first, cuda::proclaim_return_type<int>([key_to_group_id_op] __device__(auto key) {
+      return key_to_group_id_op(key);
+    }));
   rmm::device_uvector<int> d_tx_dst_ranks(num_groups, stream_view);
   rmm::device_uvector<size_t> d_tx_value_counts(d_tx_dst_ranks.size(), stream_view);
   auto rank_count_pair_first = thrust::make_zip_iterator(
diff --git a/cpp/src/community/detail/mis_impl.cuh b/cpp/src/community/detail/mis_impl.cuh
index bcd71af5a08..2659a982183 100644
--- a/cpp/src/community/detail/mis_impl.cuh
+++ b/cpp/src/community/detail/mis_impl.cuh
@@ -37,6 +37,8 @@
 #include <thrust/transform.h>
 #include <thrust/transform_reduce.h>
 
+#include <cuda/functional>
+
 #include <cmath>
 
 namespace cugraph {
@@ -78,13 +80,13 @@ rmm::device_uvector<vertex_t> maximal_independent_set(
   thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, ranks.begin());
 
   // Set ranks of zero out-degree vetices to std::numeric_limits<vertex_t>::lowest()
-  thrust::transform_if(
-    handle.get_thrust_policy(),
-    out_degrees.begin(),
-    out_degrees.end(),
-    ranks.begin(),
-    [] __device__(auto) { return std::numeric_limits<vertex_t>::lowest(); },
-    [] __device__(auto deg) { return deg == 0; });
+  thrust::transform_if(handle.get_thrust_policy(),
+                       out_degrees.begin(),
+                       out_degrees.end(),
+                       ranks.begin(),
+                       cuda::proclaim_return_type<vertex_t>(
+                         [] __device__(auto) { return std::numeric_limits<vertex_t>::lowest(); }),
+                       [] __device__(auto deg) { return deg == 0; });
 
   out_degrees.resize(0, handle.get_stream());
   out_degrees.shrink_to_fit(handle.get_stream());
diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index e811aafc776..2e615064286 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -46,12 +46,15 @@
 #include <thrust/transform_reduce.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 CUCO_DECLARE_BITWISE_COMPARABLE(float)
 CUCO_DECLARE_BITWISE_COMPARABLE(double)
 // FIXME: a temporary workaround for a compiler error, should be deleted once cuco gets patched.
 namespace cuco {
 template <>
-struct is_bitwise_comparable<cuco::pair<int32_t, float>> : std::true_type {};
+struct is_bitwise_comparable<cuco::pair<int32_t, float>> : std::true_type {
+};
 }  // namespace cuco
 
 namespace cugraph {
@@ -236,16 +239,17 @@ refine_clustering(
                                                  weighted_degree_of_vertices.end(),
                                                  vertex_louvain_cluster_weights.end()));
 
-  thrust::transform(handle.get_thrust_policy(),
-                    wcut_deg_and_cluster_vol_triple_begin,
-                    wcut_deg_and_cluster_vol_triple_end,
-                    singleton_and_connected_flags.begin(),
-                    [resolution] __device__(auto wcut_wdeg_and_louvain_volume) {
-                      auto wcut           = thrust::get<0>(wcut_wdeg_and_louvain_volume);
-                      auto wdeg           = thrust::get<1>(wcut_wdeg_and_louvain_volume);
-                      auto louvain_volume = thrust::get<2>(wcut_wdeg_and_louvain_volume);
-                      return wcut > (resolution * wdeg * (louvain_volume - wdeg));
-                    });
+  thrust::transform(
+    handle.get_thrust_policy(),
+    wcut_deg_and_cluster_vol_triple_begin,
+    wcut_deg_and_cluster_vol_triple_end,
+    singleton_and_connected_flags.begin(),
+    cuda::proclaim_return_type<uint8_t>([resolution] __device__(auto wcut_wdeg_and_louvain_volume) {
+      auto wcut           = thrust::get<0>(wcut_wdeg_and_louvain_volume);
+      auto wdeg           = thrust::get<1>(wcut_wdeg_and_louvain_volume);
+      auto louvain_volume = thrust::get<2>(wcut_wdeg_and_louvain_volume);
+      return (wcut > (resolution * wdeg * (louvain_volume - wdeg))) ? uint8_t{1} : uint8_t{0};
+    }));
 
   edge_src_property_t<GraphViewType, weight_t> src_louvain_cluster_weight_cache(handle);
   edge_src_property_t<GraphViewType, weight_t> src_cut_to_louvain_cache(handle);
@@ -714,11 +718,12 @@ refine_clustering(
       vertices_in_mis.begin(),
       vertices_in_mis.end(),
       dst_vertices.begin(),
-      [dst_first = thrust::get<1>(gain_and_dst_first.get_iterator_tuple()),
-       v_first   = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) {
-        auto dst = *(dst_first + v - v_first);
-        return dst;
-      });
+      cuda::proclaim_return_type<vertex_t>(
+        [dst_first = thrust::get<1>(gain_and_dst_first.get_iterator_tuple()),
+         v_first   = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) {
+          auto dst = *(dst_first + v - v_first);
+          return dst;
+        }));
 
     cugraph::resize_dataframe_buffer(gain_and_dst_output_pairs, 0, handle.get_stream());
     cugraph::shrink_to_fit_dataframe_buffer(gain_and_dst_output_pairs, handle.get_stream());
diff --git a/cpp/src/detail/collect_local_vertex_values.cu b/cpp/src/detail/collect_local_vertex_values.cu
index 9d5d2cb553b..795902dfd87 100644
--- a/cpp/src/detail/collect_local_vertex_values.cu
+++ b/cpp/src/detail/collect_local_vertex_values.cu
@@ -19,6 +19,8 @@
 #include <cugraph/graph_functions.hpp>
 #include <cugraph/utilities/shuffle_comm.cuh>
 
+#include <cuda/functional>
+
 namespace cugraph {
 namespace detail {
 
@@ -64,7 +66,8 @@ rmm::device_uvector<value_t> collect_local_vertex_values_from_ext_vertex_value_p
 
   auto vertex_iterator = thrust::make_transform_iterator(
     d_vertices.begin(),
-    [local_vertex_first] __device__(vertex_t v) { return v - local_vertex_first; });
+    cuda::proclaim_return_type<vertex_t>(
+      [local_vertex_first] __device__(vertex_t v) { return v - local_vertex_first; }));
 
   d_local_values.resize(local_vertex_last - local_vertex_first, handle.get_stream());
   thrust::fill(
diff --git a/cpp/src/generators/erdos_renyi_generator.cu b/cpp/src/generators/erdos_renyi_generator.cu
index 6d847ae0bde..e4a367b46f0 100644
--- a/cpp/src/generators/erdos_renyi_generator.cu
+++ b/cpp/src/generators/erdos_renyi_generator.cu
@@ -28,6 +28,8 @@
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 namespace cugraph {
 
 template <typename vertex_t>
@@ -42,12 +44,13 @@ generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle,
                   "Implementation cannot support specified value");
 
   auto random_iterator = thrust::make_transform_iterator(
-    thrust::make_counting_iterator<size_t>(0), [seed] __device__(size_t index) {
+    thrust::make_counting_iterator<size_t>(0),
+    cuda::proclaim_return_type<float>([seed] __device__(size_t index) {
       thrust::default_random_engine rng(seed);
       thrust::uniform_real_distribution<float> dist(0.0, 1.0);
       rng.discard(index);
       return dist(rng);
-    });
+    }));
 
   size_t count = thrust::count_if(handle.get_thrust_policy(),
                                   random_iterator,
@@ -69,13 +72,14 @@ generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle,
                     indices_v.begin(),
                     indices_v.end(),
                     thrust::make_zip_iterator(thrust::make_tuple(src_v.begin(), src_v.end())),
-                    [num_vertices] __device__(size_t index) {
-                      size_t src = index / num_vertices;
-                      size_t dst = index % num_vertices;
-
-                      return thrust::make_tuple(static_cast<vertex_t>(src),
-                                                static_cast<vertex_t>(dst));
-                    });
+                    cuda::proclaim_return_type<thrust::tuple<vertex_t, vertex_t>>(
+                      [num_vertices] __device__(size_t index) {
+                        size_t src = index / num_vertices;
+                        size_t dst = index % num_vertices;
+
+                        return thrust::make_tuple(static_cast<vertex_t>(src),
+                                                  static_cast<vertex_t>(dst));
+                      }));
 
   handle.sync_stream();
 
diff --git a/cpp/src/generators/simple_generators.cu b/cpp/src/generators/simple_generators.cu
index 6dba63909c3..f2102302008 100644
--- a/cpp/src/generators/simple_generators.cu
+++ b/cpp/src/generators/simple_generators.cu
@@ -27,6 +27,8 @@
 #include <thrust/sequence.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 #include <numeric>
 
 namespace cugraph {
@@ -264,23 +266,24 @@ generate_complete_graph_edgelist(
 
     auto transform_iter = thrust::make_transform_iterator(
       thrust::make_counting_iterator<size_t>(0),
-      [base_vertex_id, num_vertices, invalid_vertex] __device__(size_t index) {
-        size_t graph_index = index / (num_vertices * num_vertices);
-        size_t local_index = index % (num_vertices * num_vertices);
-
-        vertex_t src = base_vertex_id + static_cast<vertex_t>(local_index / num_vertices);
-        vertex_t dst = base_vertex_id + static_cast<vertex_t>(local_index % num_vertices);
-
-        if (src == dst) {
-          src = invalid_vertex;
-          dst = invalid_vertex;
-        } else {
-          src += (graph_index * num_vertices);
-          dst += (graph_index * num_vertices);
-        }
-
-        return thrust::make_tuple(src, dst);
-      });
+      cuda::proclaim_return_type<thrust::tuple<vertex_t, vertex_t>>(
+        [base_vertex_id, num_vertices, invalid_vertex] __device__(size_t index) {
+          size_t graph_index = index / (num_vertices * num_vertices);
+          size_t local_index = index % (num_vertices * num_vertices);
+
+          vertex_t src = base_vertex_id + static_cast<vertex_t>(local_index / num_vertices);
+          vertex_t dst = base_vertex_id + static_cast<vertex_t>(local_index % num_vertices);
+
+          if (src == dst) {
+            src = invalid_vertex;
+            dst = invalid_vertex;
+          } else {
+            src += (graph_index * num_vertices);
+            dst += (graph_index * num_vertices);
+          }
+
+          return thrust::make_tuple(src, dst);
+        }));
 
     output_iterator = thrust::copy_if(handle.get_thrust_policy(),
                                       transform_iter,
diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
index b238b964ede..219ce924b29 100644
--- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
@@ -44,6 +44,8 @@
 #include <thrust/tuple.h>
 #include <thrust/unique.h>
 
+#include <cuda/functional>
+
 #include <optional>
 #include <tuple>
 
@@ -287,7 +289,7 @@ rmm::device_uvector<edge_t> get_sampling_index_without_replacement(
 #ifndef NO_CUGRAPH_OPS
   edge_t mid_partition_degree_range_last = static_cast<edge_t>(K * 10);  // tuning parameter
   assert(mid_partition_degree_range_last > K);
-  size_t high_partition_over_sampling_K = K * 2;                         // tuning parameter
+  size_t high_partition_over_sampling_K = K * 2;  // tuning parameter
   assert(high_partition_over_sampling_K > K);
 
   rmm::device_uvector<edge_t> sample_nbr_indices(frontier_degrees.size() * K, handle.get_stream());
@@ -596,8 +598,9 @@ rmm::device_uvector<edge_t> get_sampling_index_without_replacement(
                                         multiplier_t<size_t>{high_partition_over_sampling_K}),
         thrust::make_transform_iterator(
           thrust::make_counting_iterator(size_t{0}),
-          [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
-            size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; }),
+          cuda::proclaim_return_type<size_t>(
+            [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
+              size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; })),
         handle.get_stream());
       if (tmp_storage_bytes > d_tmp_storage.size()) {
         d_tmp_storage = rmm::device_uvector<std::byte>(tmp_storage_bytes, handle.get_stream());
@@ -615,8 +618,9 @@ rmm::device_uvector<edge_t> get_sampling_index_without_replacement(
                                         multiplier_t<size_t>{high_partition_over_sampling_K}),
         thrust::make_transform_iterator(
           thrust::make_counting_iterator(size_t{0}),
-          [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
-            size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; }),
+          cuda::proclaim_return_type<size_t>(
+            [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
+              size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; })),
         handle.get_stream());
 
       // copy the neighbor indices back to sample_nbr_indices
@@ -883,7 +887,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
     sample_nbr_indices);  // neighbor index within an edge partition (note that each vertex's
                           // neighbors are distributed in minor_comm_size partitions)
   std::optional<rmm::device_uvector<size_t>> sample_key_indices{
-    std::nullopt};        // relevant only when (minor_comm_size > 1)
+    std::nullopt};  // relevant only when (minor_comm_size > 1)
   auto local_frontier_sample_counts        = std::vector<size_t>{};
   auto local_frontier_sample_displacements = std::vector<size_t>{};
   if (minor_comm_size > 1) {
diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
index 1349454f5b6..a14a276b98e 100644
--- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
@@ -51,6 +51,8 @@
 #include <thrust/tuple.h>
 #include <thrust/type_traits/integer_sequence.h>
 
+#include <cuda/functional>
+
 #include <numeric>
 #include <type_traits>
 #include <utility>
@@ -944,16 +946,19 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
                      minor_init);
         auto value_first = thrust::make_transform_iterator(
           view.value_first(),
-          [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); });
-        thrust::scatter(
-          handle.get_thrust_policy(),
-          value_first + (*minor_key_offsets)[i],
-          value_first + (*minor_key_offsets)[i + 1],
-          thrust::make_transform_iterator(
-            (*(view.keys())).begin() + (*minor_key_offsets)[i],
-            [key_first = graph_view.vertex_partition_range_first(
-               this_segment_vertex_partition_id)] __device__(auto key) { return key - key_first; }),
-          tx_buffer_first);
+          cuda::proclaim_return_type<T>(
+            [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); }));
+        thrust::scatter(handle.get_thrust_policy(),
+                        value_first + (*minor_key_offsets)[i],
+                        value_first + (*minor_key_offsets)[i + 1],
+                        thrust::make_transform_iterator(
+                          (*(view.keys())).begin() + (*minor_key_offsets)[i],
+                          cuda::proclaim_return_type<vertex_t>(
+                            [key_first = graph_view.vertex_partition_range_first(
+                               this_segment_vertex_partition_id)] __device__(auto key) {
+                              return key - key_first;
+                            })),
+                        tx_buffer_first);
         device_reduce(major_comm,
                       tx_buffer_first,
                       vertex_value_output_first,
diff --git a/cpp/src/prims/update_edge_src_dst_property.cuh b/cpp/src/prims/update_edge_src_dst_property.cuh
index 2d72a075ca5..c1c2c15ae15 100644
--- a/cpp/src/prims/update_edge_src_dst_property.cuh
+++ b/cpp/src/prims/update_edge_src_dst_property.cuh
@@ -44,6 +44,8 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/scatter.h>
 
+#include <cuda/functional>
+
 #include <algorithm>
 #include <numeric>
 #include <type_traits>
@@ -181,13 +183,14 @@ void update_edge_major_property(raft::handle_t const& handle,
                        handle.get_stream());
           auto bool_first = thrust::make_transform_iterator(
             (*edge_partition_keys)[i].begin(),
-            [rx_value_first,
-             v_first = graph_view.vertex_partition_range_first(
-               major_range_vertex_partition_id)] __device__(auto v) {
-              auto v_offset = v - v_first;
-              return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
-                                       packed_bool_mask(v_offset));
-            });
+            cuda::proclaim_return_type<bool>(
+              [rx_value_first,
+               v_first = graph_view.vertex_partition_range_first(
+                 major_range_vertex_partition_id)] __device__(auto v) {
+                auto v_offset = v - v_first;
+                return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
+                                         packed_bool_mask(v_offset));
+              }));
           pack_bools(handle,
                      bool_first,
                      bool_first + (*edge_partition_keys)[i].size(),
@@ -202,8 +205,9 @@ void update_edge_major_property(raft::handle_t const& handle,
 
           auto v_offset_first = thrust::make_transform_iterator(
             (*edge_partition_keys)[i].begin(),
-            [v_first = graph_view.vertex_partition_range_first(
-               major_range_vertex_partition_id)] __device__(auto v) { return v - v_first; });
+            cuda::proclaim_return_type<vertex_t>(
+              [v_first = graph_view.vertex_partition_range_first(
+                 major_range_vertex_partition_id)] __device__(auto v) { return v - v_first; }));
           thrust::gather(handle.get_thrust_policy(),
                          v_offset_first,
                          v_offset_first + (*edge_partition_keys)[i].size(),
@@ -312,21 +316,24 @@ void update_edge_major_property(raft::handle_t const& handle,
             graph_view.local_vertex_partition_view());
         if constexpr (packed_bool) {
           auto bool_first = thrust::make_transform_iterator(
-            vertex_first, [vertex_property_input_first, vertex_partition] __device__(auto v) {
+            vertex_first,
+            cuda::proclaim_return_type<bool>([vertex_property_input_first,
+                                              vertex_partition] __device__(auto v) {
               auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
               return static_cast<bool>(
                 *(vertex_property_input_first + packed_bool_offset(v_offset)) &
                 packed_bool_mask(v_offset));
-            });
+            }));
           pack_bools(handle,
                      bool_first,
                      bool_first + thrust::distance(vertex_first, vertex_last),
                      rx_value_first);
         } else {
-          auto map_first =
-            thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) {
+          auto map_first = thrust::make_transform_iterator(
+            vertex_first,
+            cuda::proclaim_return_type<vertex_t>([vertex_partition] __device__(auto v) {
               return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
-            });
+            }));
           // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a
           // permutation iterator (and directly gathers to the internal buffer)
           thrust::gather(handle.get_thrust_policy(),
@@ -391,9 +398,10 @@ void update_edge_major_property(raft::handle_t const& handle,
             });
         } else {
           auto map_first = thrust::make_transform_iterator(
-            rx_vertices.begin(), [edge_partition] __device__(auto v) {
+            rx_vertices.begin(),
+            cuda::proclaim_return_type<vertex_t>([edge_partition] __device__(auto v) {
               return edge_partition.major_offset_from_major_nocheck(v);
-            });
+            }));
           // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
           // directly scatters from the internal buffer)
           thrust::scatter(handle.get_thrust_policy(),
@@ -593,13 +601,14 @@ void update_edge_minor_property(raft::handle_t const& handle,
 
                 auto bool_first = thrust::make_transform_iterator(
                   (*edge_partition_keys).begin() + key_offsets[j],
-                  [rx_value_first,
-                   v_first = graph_view.vertex_partition_range_first(
-                     minor_range_vertex_partition_id)] __device__(auto v) {
-                    auto v_offset = v - v_first;
-                    return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
-                                             packed_bool_mask(v_offset));
-                  });
+                  cuda::proclaim_return_type<bool>(
+                    [rx_value_first,
+                     v_first = graph_view.vertex_partition_range_first(
+                       minor_range_vertex_partition_id)] __device__(auto v) {
+                      auto v_offset = v - v_first;
+                      return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
+                                               packed_bool_mask(v_offset));
+                    }));
                 pack_unaligned_bools(
                   handle,
                   bool_first,
@@ -611,10 +620,10 @@ void update_edge_minor_property(raft::handle_t const& handle,
                   std::get<std::vector<size_t>>(key_offsets_or_rx_displacements);
                 auto bool_first = thrust::make_transform_iterator(
                   thrust::make_counting_iterator(vertex_t{0}),
-                  [rx_value_first] __device__(vertex_t v_offset) {
+                  cuda::proclaim_return_type<bool>([rx_value_first] __device__(vertex_t v_offset) {
                     return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
                                              packed_bool_mask(v_offset));
-                  });
+                  }));
                 pack_unaligned_bools(
                   handle,
                   bool_first,
@@ -630,8 +639,9 @@ void update_edge_minor_property(raft::handle_t const& handle,
 
               auto v_offset_first = thrust::make_transform_iterator(
                 (*edge_partition_keys).begin() + key_offsets[j],
-                [v_first = graph_view.vertex_partition_range_first(
-                   minor_range_vertex_partition_id)] __device__(auto v) { return v - v_first; });
+                cuda::proclaim_return_type<vertex_t>(
+                  [v_first = graph_view.vertex_partition_range_first(
+                     minor_range_vertex_partition_id)] __device__(auto v) { return v - v_first; }));
               thrust::gather(handle.get_thrust_policy(),
                              v_offset_first,
                              v_offset_first + (key_offsets[j + 1] - key_offsets[j]),
@@ -718,21 +728,24 @@ void update_edge_minor_property(raft::handle_t const& handle,
             graph_view.local_vertex_partition_view());
         if constexpr (packed_bool) {
           auto bool_first = thrust::make_transform_iterator(
-            vertex_first, [vertex_property_input_first, vertex_partition] __device__(auto v) {
+            vertex_first,
+            cuda::proclaim_return_type<bool>([vertex_property_input_first,
+                                              vertex_partition] __device__(auto v) {
               auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
               return static_cast<bool>(
                 *(vertex_property_input_first + packed_bool_offset(v_offset)) &
                 packed_bool_mask(v_offset));
-            });
+            }));
           pack_bools(handle,
                      bool_first,
                      bool_first + thrust::distance(vertex_first, vertex_last),
                      rx_value_first);
         } else {
-          auto map_first =
-            thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) {
+          auto map_first = thrust::make_transform_iterator(
+            vertex_first,
+            cuda::proclaim_return_type<vertex_t>([vertex_partition] __device__(auto v) {
               return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
-            });
+            }));
           // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a
           // permutation iterator (and directly gathers to the internal buffer)
           thrust::gather(handle.get_thrust_policy(),
@@ -799,9 +812,10 @@ void update_edge_minor_property(raft::handle_t const& handle,
             });
         } else {
           auto map_first = thrust::make_transform_iterator(
-            rx_vertices.begin(), [edge_partition] __device__(auto v) {
+            rx_vertices.begin(),
+            cuda::proclaim_return_type<vertex_t>([edge_partition] __device__(auto v) {
               return edge_partition.minor_offset_from_minor_nocheck(v);
-            });
+            }));
           // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
           // directly scatters from the internal buffer)
           thrust::scatter(handle.get_thrust_policy(),
diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh
index 46789c6b8bd..5a9ded02009 100644
--- a/cpp/src/sampling/random_walks.cuh
+++ b/cpp/src/sampling/random_walks.cuh
@@ -52,6 +52,8 @@
 #include <thrust/transform_scan.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 #include <cassert>
 #include <cstdlib>  // FIXME: requirement for temporary std::getenv()
 #include <limits>
@@ -197,19 +199,19 @@ struct col_indx_extract_t {
   void operator()(
     original::device_vec_t<vertex_t> const& d_coalesced_src_v,  // in: coalesced vector of vertices
     original::device_vec_t<vertex_t> const&
-      d_v_col_indx,       // in: column indices, given by stepper's random engine
+      d_v_col_indx,  // in: column indices, given by stepper's random engine
     original::device_vec_t<vertex_t>&
       d_v_next_vertices,  // out: set of destination vertices, for next step
     original::device_vec_t<weight_t>&
-      d_v_next_weights)   // out: set of weights between src and destination vertices, for next step
+      d_v_next_weights)  // out: set of weights between src and destination vertices, for next step
     const
   {
     thrust::transform_if(
       handle_.get_thrust_policy(),
       thrust::make_counting_iterator<index_t>(0),
-      thrust::make_counting_iterator<index_t>(num_paths_),                         // input1
-      d_v_col_indx.begin(),                                                        // input2
-      out_degs_,                                                                   // stencil
+      thrust::make_counting_iterator<index_t>(num_paths_),  // input1
+      d_v_col_indx.begin(),                                 // input2
+      out_degs_,                                            // stencil
       thrust::make_zip_iterator(
         thrust::make_tuple(d_v_next_vertices.begin(), d_v_next_weights.begin())),  // output
       [max_depth         = max_depth_,
@@ -378,7 +380,8 @@ struct random_walker_t {
 
     // scatter d_src_init_v to coalesced vertex vector:
     //
-    auto dlambda = [stride = max_depth_] __device__(auto indx) { return indx * stride; };
+    auto dlambda = cuda::proclaim_return_type<index_t>(
+      [stride = max_depth_] __device__(auto indx) { return indx * stride; });
 
     // use the transform iterator as map:
     //
@@ -539,10 +542,11 @@ struct random_walker_t {
 
     // delta = ptr_d_sizes[indx] - 1
     //
-    auto dlambda = [stride, ptr_d_sizes, ptr_d_coalesced] __device__(auto indx) {
-      auto delta = ptr_d_sizes[indx] - 1;
-      return ptr_d_coalesced[indx * stride + delta];
-    };
+    auto dlambda = cuda::proclaim_return_type<vertex_t>(
+      [stride, ptr_d_sizes, ptr_d_coalesced] __device__(auto indx) {
+        auto delta = ptr_d_sizes[indx] - 1;
+        return ptr_d_coalesced[indx * stride + delta];
+      });
 
     // use the transform iterator as map:
     //
@@ -575,9 +579,9 @@ struct random_walker_t {
       d_crt_out_degs,  // |current set of vertex out degrees| = nelems,
                        // to be used as stencil (don't scatter if 0)
     original::device_vec_t<index_t> const&
-      d_sizes,         // paths sizes used to provide delta in coalesced paths;
-                       // pre-condition: assumed as updated to reflect new vertex additions;
-                       // also, this is the number of _vertices_ in each path;
+      d_sizes,  // paths sizes used to provide delta in coalesced paths;
+                // pre-condition: assumed as updated to reflect new vertex additions;
+                // also, this is the number of _vertices_ in each path;
     // hence for scattering weights this needs to be adjusted; hence the `adjust` parameter
     index_t
       stride,  // stride = coalesce block size (max_depth for vertices; max_depth-1 for weights)
@@ -587,10 +591,11 @@ struct random_walker_t {
   {
     index_t const* ptr_d_sizes = original::raw_const_ptr(d_sizes);
 
-    auto dlambda = [stride, adjust, ptr_d_sizes] __device__(auto indx) {
-      auto delta = ptr_d_sizes[indx] - adjust - 1;
-      return indx * stride + delta;
-    };
+    auto dlambda =
+      cuda::proclaim_return_type<index_t>([stride, adjust, ptr_d_sizes] __device__(auto indx) {
+        auto delta = ptr_d_sizes[indx] - adjust - 1;
+        return indx * stride + delta;
+      });
 
     // use the transform iterator as map:
     //
@@ -762,7 +767,7 @@ random_walks_impl(
   // pre-allocate num_paths * max_depth;
   //
   original::device_vec_t<vertex_t> d_coalesced_v(num_paths * max_depth,
-                                                 stream);         // coalesced vertex set
+                                                 stream);  // coalesced vertex set
   original::device_vec_t<weight_t> d_coalesced_w(num_paths * (max_depth - 1),
                                                  stream);         // coalesced weight set
   original::device_vec_t<index_t> d_paths_sz(num_paths, stream);  // paths sizes
diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh
index 7626784c13c..867ec21d0a6 100644
--- a/cpp/src/structure/graph_view_impl.cuh
+++ b/cpp/src/structure/graph_view_impl.cuh
@@ -50,6 +50,8 @@
 #include <thrust/transform_reduce.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 #include <algorithm>
 #include <cstdint>
 #include <type_traits>
@@ -155,7 +157,9 @@ rmm::device_uvector<edge_t> compute_major_degrees(
                       thrust::make_counting_iterator(vertex_t{0}),
                       thrust::make_counting_iterator(major_hypersparse_first - major_range_first),
                       local_degrees.begin(),
-                      [p_offsets] __device__(auto i) { return p_offsets[i + 1] - p_offsets[i]; });
+                      cuda::proclaim_return_type<edge_t>([p_offsets] __device__(auto i) {
+                        return p_offsets[i + 1] - p_offsets[i];
+                      }));
     if (use_dcs) {
       auto p_dcs_nzd_vertices   = (*edge_partition_dcs_nzd_vertices)[i];
       auto dcs_nzd_vertex_count = (*edge_partition_dcs_nzd_vertex_counts)[i];
diff --git a/cpp/tests/sampling/renumber_sampled_edgelist_test.cu b/cpp/tests/sampling/renumber_sampled_edgelist_test.cu
index 96c8d6173e7..a1b782b34cf 100644
--- a/cpp/tests/sampling/renumber_sampled_edgelist_test.cu
+++ b/cpp/tests/sampling/renumber_sampled_edgelist_test.cu
@@ -35,6 +35,8 @@
 #include <thrust/sort.h>
 #include <thrust/unique.h>
 
+#include <cuda/functional>
+
 struct RenumberSampledEdgelist_Usecase {
   size_t num_vertices{};
   size_t num_sampled_edges{};
@@ -380,16 +382,17 @@ class Tests_RenumberSampledEdgelist
 
           auto renumbered_merged_vertex_first = thrust::make_transform_iterator(
             merged_vertices.begin(),
-            [sorted_org_vertices = raft::device_span<vertex_t const>(sorted_org_vertices.data(),
-                                                                     sorted_org_vertices.size()),
-             matching_renumbered_vertices = raft::device_span<vertex_t const>(
-               matching_renumbered_vertices.data(),
-               matching_renumbered_vertices.size())] __device__(vertex_t src) {
-              auto it = thrust::lower_bound(
-                thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), src);
-              return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(),
-                                                                   it)];
-            });
+            cuda::proclaim_return_type<vertex_t>(
+              [sorted_org_vertices = raft::device_span<vertex_t const>(sorted_org_vertices.data(),
+                                                                       sorted_org_vertices.size()),
+               matching_renumbered_vertices = raft::device_span<vertex_t const>(
+                 matching_renumbered_vertices.data(),
+                 matching_renumbered_vertices.size())] __device__(vertex_t src) {
+                auto it = thrust::lower_bound(
+                  thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), src);
+                return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(),
+                                                                     it)];
+              }));
 
           thrust::reduce_by_key(handle.get_thrust_policy(),
                                 sort_key_first,

From aa6be2848c53b1d0bc0e3885d669d5f321c0f613 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Tue, 19 Sep 2023 09:59:33 -0700
Subject: [PATCH 2/7] clang-format

---
 cpp/include/cugraph/utilities/shuffle_comm.cuh |  2 +-
 cpp/src/community/detail/refine_impl.cuh       |  3 +--
 ...er_v_random_select_transform_outgoing_e.cuh |  4 ++--
 cpp/src/sampling/random_walks.cuh              | 18 +++++++++---------
 4 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh
index 22bc5b6b6f1..895a7a350e4 100644
--- a/cpp/include/cugraph/utilities/shuffle_comm.cuh
+++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh
@@ -497,7 +497,7 @@ std::tuple<KeyIterator, ValueIterator> mem_frugal_partition(
     key_first,
     key_last,
     key_group_id_less_t<typename thrust::iterator_traits<KeyIterator>::value_type, KeyToGroupIdOp>{
-        key_to_group_id_op, pivot}));
+      key_to_group_id_op, pivot}));
   auto second_size  = num_elements - first_size;
 
   auto tmp_key_buffer =
diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index 2e615064286..c3792620a1e 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -53,8 +53,7 @@ CUCO_DECLARE_BITWISE_COMPARABLE(double)
 // FIXME: a temporary workaround for a compiler error, should be deleted once cuco gets patched.
 namespace cuco {
 template <>
-struct is_bitwise_comparable<cuco::pair<int32_t, float>> : std::true_type {
-};
+struct is_bitwise_comparable<cuco::pair<int32_t, float>> : std::true_type {};
 }  // namespace cuco
 
 namespace cugraph {
diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
index 219ce924b29..3a065e10e49 100644
--- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
@@ -289,7 +289,7 @@ rmm::device_uvector<edge_t> get_sampling_index_without_replacement(
 #ifndef NO_CUGRAPH_OPS
   edge_t mid_partition_degree_range_last = static_cast<edge_t>(K * 10);  // tuning parameter
   assert(mid_partition_degree_range_last > K);
-  size_t high_partition_over_sampling_K = K * 2;  // tuning parameter
+  size_t high_partition_over_sampling_K = K * 2;                         // tuning parameter
   assert(high_partition_over_sampling_K > K);
 
   rmm::device_uvector<edge_t> sample_nbr_indices(frontier_degrees.size() * K, handle.get_stream());
@@ -887,7 +887,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
     sample_nbr_indices);  // neighbor index within an edge partition (note that each vertex's
                           // neighbors are distributed in minor_comm_size partitions)
   std::optional<rmm::device_uvector<size_t>> sample_key_indices{
-    std::nullopt};  // relevant only when (minor_comm_size > 1)
+    std::nullopt};        // relevant only when (minor_comm_size > 1)
   auto local_frontier_sample_counts        = std::vector<size_t>{};
   auto local_frontier_sample_displacements = std::vector<size_t>{};
   if (minor_comm_size > 1) {
diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh
index 5a9ded02009..f86c5b4b66e 100644
--- a/cpp/src/sampling/random_walks.cuh
+++ b/cpp/src/sampling/random_walks.cuh
@@ -199,19 +199,19 @@ struct col_indx_extract_t {
   void operator()(
     original::device_vec_t<vertex_t> const& d_coalesced_src_v,  // in: coalesced vector of vertices
     original::device_vec_t<vertex_t> const&
-      d_v_col_indx,  // in: column indices, given by stepper's random engine
+      d_v_col_indx,       // in: column indices, given by stepper's random engine
     original::device_vec_t<vertex_t>&
       d_v_next_vertices,  // out: set of destination vertices, for next step
     original::device_vec_t<weight_t>&
-      d_v_next_weights)  // out: set of weights between src and destination vertices, for next step
+      d_v_next_weights)   // out: set of weights between src and destination vertices, for next step
     const
   {
     thrust::transform_if(
       handle_.get_thrust_policy(),
       thrust::make_counting_iterator<index_t>(0),
-      thrust::make_counting_iterator<index_t>(num_paths_),  // input1
-      d_v_col_indx.begin(),                                 // input2
-      out_degs_,                                            // stencil
+      thrust::make_counting_iterator<index_t>(num_paths_),                         // input1
+      d_v_col_indx.begin(),                                                        // input2
+      out_degs_,                                                                   // stencil
       thrust::make_zip_iterator(
         thrust::make_tuple(d_v_next_vertices.begin(), d_v_next_weights.begin())),  // output
       [max_depth         = max_depth_,
@@ -579,9 +579,9 @@ struct random_walker_t {
       d_crt_out_degs,  // |current set of vertex out degrees| = nelems,
                        // to be used as stencil (don't scatter if 0)
     original::device_vec_t<index_t> const&
-      d_sizes,  // paths sizes used to provide delta in coalesced paths;
-                // pre-condition: assumed as updated to reflect new vertex additions;
-                // also, this is the number of _vertices_ in each path;
+      d_sizes,         // paths sizes used to provide delta in coalesced paths;
+                       // pre-condition: assumed as updated to reflect new vertex additions;
+                       // also, this is the number of _vertices_ in each path;
     // hence for scattering weights this needs to be adjusted; hence the `adjust` parameter
     index_t
       stride,  // stride = coalesce block size (max_depth for vertices; max_depth-1 for weights)
@@ -767,7 +767,7 @@ random_walks_impl(
   // pre-allocate num_paths * max_depth;
   //
   original::device_vec_t<vertex_t> d_coalesced_v(num_paths * max_depth,
-                                                 stream);  // coalesced vertex set
+                                                 stream);         // coalesced vertex set
   original::device_vec_t<weight_t> d_coalesced_w(num_paths * (max_depth - 1),
                                                  stream);         // coalesced weight set
   original::device_vec_t<index_t> d_paths_sz(num_paths, stream);  // paths sizes

From a1fbc2297ccb8356f6001d05e0ae17a9f945fd48 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Tue, 19 Sep 2023 10:05:07 -0700
Subject: [PATCH 3/7] copyright

---
 cpp/src/generators/erdos_renyi_generator.cu | 2 +-
 cpp/src/generators/simple_generators.cu     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/generators/erdos_renyi_generator.cu b/cpp/src/generators/erdos_renyi_generator.cu
index e4a367b46f0..8448eeaf960 100644
--- a/cpp/src/generators/erdos_renyi_generator.cu
+++ b/cpp/src/generators/erdos_renyi_generator.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/generators/simple_generators.cu b/cpp/src/generators/simple_generators.cu
index f2102302008..65647be5de0 100644
--- a/cpp/src/generators/simple_generators.cu
+++ b/cpp/src/generators/simple_generators.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 4539da1863551ecd4300dcd3ea0212b55f8e8726 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 6 Dec 2023 20:38:48 -0600
Subject: [PATCH 4/7] clang-format.

---
 cpp/src/community/detail/refine_impl.cuh | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index e18825ff563..927f8ece9c5 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -246,12 +246,13 @@ refine_clustering(
     wcut_deg_and_cluster_vol_triple_begin,
     wcut_deg_and_cluster_vol_triple_end,
     singleton_and_connected_flags.begin(),
-    cuda::proclaim_return_type<uint8_t>([resolution, total_edge_weight] __device__(auto wcut_wdeg_and_louvain_volume) {
-      auto wcut           = thrust::get<0>(wcut_wdeg_and_louvain_volume);
-      auto wdeg           = thrust::get<1>(wcut_wdeg_and_louvain_volume);
-      auto louvain_volume = thrust::get<2>(wcut_wdeg_and_louvain_volume);
-      return wcut > (resolution * wdeg * (louvain_volume - wdeg) / total_edge_weight);
-    }));
+    cuda::proclaim_return_type<uint8_t>(
+      [resolution, total_edge_weight] __device__(auto wcut_wdeg_and_louvain_volume) {
+        auto wcut           = thrust::get<0>(wcut_wdeg_and_louvain_volume);
+        auto wdeg           = thrust::get<1>(wcut_wdeg_and_louvain_volume);
+        auto louvain_volume = thrust::get<2>(wcut_wdeg_and_louvain_volume);
+        return wcut > (resolution * wdeg * (louvain_volume - wdeg) / total_edge_weight);
+      }));
 
   edge_src_property_t<GraphViewType, weight_t> src_louvain_cluster_weight_cache(handle);
   edge_src_property_t<GraphViewType, weight_t> src_cut_to_louvain_cache(handle);

From 8ad9d3b803eebb46b5a98915c9181b8a0f758b7d Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 6 Dec 2023 21:41:24 -0600
Subject: [PATCH 5/7] Add cuda::proclaim_return_type to refine_impl.cuh and
 sampling_post_processing_impl.cuh.

---
 cpp/src/community/detail/refine_impl.cuh      | 41 ++++++++++---------
 .../sampling_post_processing_impl.cuh         | 12 ++++--
 2 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index 927f8ece9c5..eb874657f01 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -215,16 +215,17 @@ refine_clustering(
                                 : detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
                                     louvain_assignment_of_vertices.data(), vertex_t{0}),
     *edge_weight_view,
-    [] __device__(auto src, auto dst, auto src_cluster, auto dst_cluster, auto wt) {
-      weight_t weighted_cut_contribution{0};
+    cuda::proclaim_return_type<weight_t>(
+      [] __device__(auto src, auto dst, auto src_cluster, auto dst_cluster, auto wt) {
+        weight_t weighted_cut_contribution{0};
 
-      if (src == dst)  // self loop
-        weighted_cut_contribution = 0;
-      else if (src_cluster == dst_cluster)
-        weighted_cut_contribution = wt;
+        if (src == dst)  // self loop
+          weighted_cut_contribution = 0;
+        else if (src_cluster == dst_cluster)
+          weighted_cut_contribution = wt;
 
-      return weighted_cut_contribution;
-    },
+        return weighted_cut_contribution;
+      }),
     weight_t{0},
     cugraph::reduce_op::plus<weight_t>{},
     weighted_cut_of_vertices_to_louvain.begin());
@@ -241,18 +242,18 @@ refine_clustering(
                                                  weighted_degree_of_vertices.end(),
                                                  vertex_louvain_cluster_weights.end()));
 
-  thrust::transform(
-    handle.get_thrust_policy(),
-    wcut_deg_and_cluster_vol_triple_begin,
-    wcut_deg_and_cluster_vol_triple_end,
-    singleton_and_connected_flags.begin(),
-    cuda::proclaim_return_type<uint8_t>(
-      [resolution, total_edge_weight] __device__(auto wcut_wdeg_and_louvain_volume) {
-        auto wcut           = thrust::get<0>(wcut_wdeg_and_louvain_volume);
-        auto wdeg           = thrust::get<1>(wcut_wdeg_and_louvain_volume);
-        auto louvain_volume = thrust::get<2>(wcut_wdeg_and_louvain_volume);
-        return wcut > (resolution * wdeg * (louvain_volume - wdeg) / total_edge_weight);
-      }));
+  thrust::transform(handle.get_thrust_policy(),
+                    wcut_deg_and_cluster_vol_triple_begin,
+                    wcut_deg_and_cluster_vol_triple_end,
+                    singleton_and_connected_flags.begin(),
+                    cuda::proclaim_return_type<uint8_t>([resolution, total_edge_weight] __device__(
+                                                          auto wcut_wdeg_and_louvain_volume) {
+                      auto wcut           = thrust::get<0>(wcut_wdeg_and_louvain_volume);
+                      auto wdeg           = thrust::get<1>(wcut_wdeg_and_louvain_volume);
+                      auto louvain_volume = thrust::get<2>(wcut_wdeg_and_louvain_volume);
+                      return static_cast<uint8_t>(
+                        wcut > (resolution * wdeg * (louvain_volume - wdeg) / total_edge_weight));
+                    }));
 
   edge_src_property_t<GraphViewType, weight_t> src_louvain_cluster_weight_cache(handle);
   edge_src_property_t<GraphViewType, weight_t> src_cut_to_louvain_cache(handle);
diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh
index 77d4f2d865f..852d82e78ab 100644
--- a/cpp/src/sampling/sampling_post_processing_impl.cuh
+++ b/cpp/src/sampling/sampling_post_processing_impl.cuh
@@ -40,6 +40,8 @@
 #include <thrust/tuple.h>
 #include <thrust/unique.h>
 
+#include <cuda/functional>
+
 #include <optional>
 
 namespace cugraph {
@@ -1229,10 +1231,12 @@ renumber_and_compress_sampled_edgelist(
           auto pair_first       = thrust::make_zip_iterator((*compressed_label_indices).begin(),
                                                       (*compressed_hops).begin());
           auto value_pair_first = thrust::make_transform_iterator(
-            thrust::make_counting_iterator(size_t{0}), [num_hops] __device__(size_t i) {
-              return thrust::make_tuple(static_cast<label_index_t>(i / num_hops),
-                                        static_cast<int32_t>(i % num_hops));
-            });
+            thrust::make_counting_iterator(size_t{0}),
+            cuda::proclaim_return_type<thrust::tuple<label_index_t, int32_t>>(
+              [num_hops] __device__(size_t i) {
+                return thrust::make_tuple(static_cast<label_index_t>(i / num_hops),
+                                          static_cast<int32_t>(i % num_hops));
+              }));
           thrust::upper_bound(handle.get_thrust_policy(),
                               pair_first,
                               pair_first + (*compressed_label_indices).size(),

From d0507a758084b6a100790a8f7b7de814a668fbc2 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 8 Dec 2023 11:47:55 -0600
Subject: [PATCH 6/7] Apply cuda::proclaim_return_type to vertex_result.cu and
 sampling_post_processing_test.cu.

---
 cpp/src/mtmg/vertex_result.cu                       | 10 ++++++----
 cpp/tests/sampling/sampling_post_processing_test.cu | 10 ++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/cpp/src/mtmg/vertex_result.cu b/cpp/src/mtmg/vertex_result.cu
index 5b1825656ff..414f1bdfa88 100644
--- a/cpp/src/mtmg/vertex_result.cu
+++ b/cpp/src/mtmg/vertex_result.cu
@@ -21,6 +21,7 @@
 
 #include <detail/graph_partition_utils.cuh>
 
+#include <thrust/functional.h>
 #include <thrust/gather.h>
 
 namespace cugraph {
@@ -91,10 +92,11 @@ rmm::device_uvector<result_t> vertex_result_view_t<result_t>::gather(
   auto vertex_partition =
     vertex_partition_device_view_t<vertex_t, multi_gpu>(vertex_partition_view);
 
-  auto iter =
-    thrust::make_transform_iterator(local_vertices.begin(), [vertex_partition] __device__(auto v) {
+  auto iter = thrust::make_transform_iterator(
+    local_vertices.begin(),
+    cuda::proclaim_return_type<vertex_t>([vertex_partition] __device__(auto v) {
       return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
-    });
+    }));
 
   thrust::gather(handle.get_thrust_policy(),
                  iter,
@@ -111,7 +113,7 @@ rmm::device_uvector<result_t> vertex_result_view_t<result_t>::gather(
       vertex_gpu_ids.begin(),
       vertex_gpu_ids.end(),
       thrust::make_zip_iterator(local_vertices.begin(), vertex_pos.begin(), tmp_result.begin()),
-      [] __device__(int gpu) { return gpu; },
+      thrust::identity{},
       handle.get_stream());
 
   //
diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cu
index e5267d75ac2..42131f876be 100644
--- a/cpp/tests/sampling/sampling_post_processing_test.cu
+++ b/cpp/tests/sampling/sampling_post_processing_test.cu
@@ -38,6 +38,8 @@
 #include <thrust/sort.h>
 #include <thrust/unique.h>
 
+#include <cuda/functional>
+
 struct SamplingPostProcessing_Usecase {
   size_t num_labels{};
   size_t num_seeds_per_label{};
@@ -318,7 +320,7 @@ bool check_renumber_map_invariants(
 
     auto renumbered_merged_vertex_first = thrust::make_transform_iterator(
       merged_vertices.begin(),
-      [sorted_org_vertices =
+      cuda::proclaim_return_type<vertex_t>([sorted_org_vertices =
          raft::device_span<vertex_t const>(sorted_org_vertices.data(), sorted_org_vertices.size()),
        matching_renumbered_vertices = raft::device_span<vertex_t const>(
          matching_renumbered_vertices.data(),
@@ -326,7 +328,7 @@ bool check_renumber_map_invariants(
         auto it = thrust::lower_bound(
           thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major);
         return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)];
-      });
+      }));
 
     thrust::reduce_by_key(handle.get_thrust_policy(),
                           sort_key_first,
@@ -1020,7 +1022,7 @@ class Tests_SamplingPostProcessing
                  ? this_label_output_edgelist_srcs.begin()
                  : this_label_output_edgelist_dsts.begin()) +
                 old_size,
-              [offsets = raft::device_span<size_t const>(d_offsets.data(), d_offsets.size()),
+              cuda::proclaim_return_type<vertex_t>([offsets = raft::device_span<size_t const>(d_offsets.data(), d_offsets.size()),
                nzd_vertices =
                  renumbered_and_compressed_nzd_vertices
                    ? thrust::make_optional<raft::device_span<vertex_t const>>(
@@ -1036,7 +1038,7 @@ class Tests_SamplingPostProcessing
                 } else {
                   return base_v + static_cast<vertex_t>(idx);
                 }
-              });
+              }));
             thrust::copy(handle.get_thrust_policy(),
                          renumbered_and_compressed_edgelist_minors.begin() + h_offsets[0],
                          renumbered_and_compressed_edgelist_minors.begin() + h_offsets.back(),

From 31d4be8febdea3c6b001955f0f29ce4aa7ed384e Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 8 Dec 2023 12:15:57 -0600
Subject: [PATCH 7/7] clang-format.

---
 .../sampling/sampling_post_processing_test.cu | 54 ++++++++++---------
 1 file changed, 28 insertions(+), 26 deletions(-)

diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cu
index 42131f876be..6be735c3482 100644
--- a/cpp/tests/sampling/sampling_post_processing_test.cu
+++ b/cpp/tests/sampling/sampling_post_processing_test.cu
@@ -320,15 +320,16 @@ bool check_renumber_map_invariants(
 
     auto renumbered_merged_vertex_first = thrust::make_transform_iterator(
       merged_vertices.begin(),
-      cuda::proclaim_return_type<vertex_t>([sorted_org_vertices =
-         raft::device_span<vertex_t const>(sorted_org_vertices.data(), sorted_org_vertices.size()),
-       matching_renumbered_vertices = raft::device_span<vertex_t const>(
-         matching_renumbered_vertices.data(),
-         matching_renumbered_vertices.size())] __device__(vertex_t major) {
-        auto it = thrust::lower_bound(
-          thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major);
-        return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)];
-      }));
+      cuda::proclaim_return_type<vertex_t>(
+        [sorted_org_vertices = raft::device_span<vertex_t const>(sorted_org_vertices.data(),
+                                                                 sorted_org_vertices.size()),
+         matching_renumbered_vertices = raft::device_span<vertex_t const>(
+           matching_renumbered_vertices.data(),
+           matching_renumbered_vertices.size())] __device__(vertex_t major) {
+          auto it = thrust::lower_bound(
+            thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major);
+          return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)];
+        }));
 
     thrust::reduce_by_key(handle.get_thrust_policy(),
                           sort_key_first,
@@ -1022,23 +1023,24 @@ class Tests_SamplingPostProcessing
                  ? this_label_output_edgelist_srcs.begin()
                  : this_label_output_edgelist_dsts.begin()) +
                 old_size,
-              cuda::proclaim_return_type<vertex_t>([offsets = raft::device_span<size_t const>(d_offsets.data(), d_offsets.size()),
-               nzd_vertices =
-                 renumbered_and_compressed_nzd_vertices
-                   ? thrust::make_optional<raft::device_span<vertex_t const>>(
-                       (*renumbered_and_compressed_nzd_vertices).data() + offset_start_offset,
-                       (offset_end_offset - offset_start_offset) - 1)
-                   : thrust::nullopt,
-               base_v] __device__(size_t i) {
-                auto idx = static_cast<size_t>(thrust::distance(
-                  offsets.begin() + 1,
-                  thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i)));
-                if (nzd_vertices) {
-                  return (*nzd_vertices)[idx];
-                } else {
-                  return base_v + static_cast<vertex_t>(idx);
-                }
-              }));
+              cuda::proclaim_return_type<vertex_t>(
+                [offsets = raft::device_span<size_t const>(d_offsets.data(), d_offsets.size()),
+                 nzd_vertices =
+                   renumbered_and_compressed_nzd_vertices
+                     ? thrust::make_optional<raft::device_span<vertex_t const>>(
+                         (*renumbered_and_compressed_nzd_vertices).data() + offset_start_offset,
+                         (offset_end_offset - offset_start_offset) - 1)
+                     : thrust::nullopt,
+                 base_v] __device__(size_t i) {
+                  auto idx = static_cast<size_t>(thrust::distance(
+                    offsets.begin() + 1,
+                    thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i)));
+                  if (nzd_vertices) {
+                    return (*nzd_vertices)[idx];
+                  } else {
+                    return base_v + static_cast<vertex_t>(idx);
+                  }
+                }));
             thrust::copy(handle.get_thrust_policy(),
                          renumbered_and_compressed_edgelist_minors.begin() + h_offsets[0],
                          renumbered_and_compressed_edgelist_minors.begin() + h_offsets.back(),