Refactor Uniform Neighborhood Sampling #2258

Merged
Changes from 11 commits
7 changes: 5 additions & 2 deletions cpp/CMakeLists.txt
@@ -188,7 +188,11 @@ add_library(cugraph
src/sampling/neighborhood.cu
src/sampling/random_walks.cu
src/sampling/detail/gather_utils_impl.cu
src/sampling/detail/sampling_utils_mg.cu
src/sampling/detail/sampling_utils_sg.cu
src/sampling/nbr_sampling_mg.cu
src/sampling/uniform_neighbor_sampling_mg.cpp
src/sampling/uniform_neighbor_sampling_sg.cpp
src/cores/legacy/core_number.cu
src/cores/core_number_sg.cu
src/cores/core_number_mg.cu
@@ -213,8 +217,7 @@ add_library(cugraph
src/structure/relabel_sg.cu
src/structure/relabel_mg.cu
src/structure/induced_subgraph_sg.cu
## FIXME: Not currently supported
##src/structure/induced_subgraph_mg.cu
src/structure/induced_subgraph_mg.cu
src/traversal/extract_bfs_paths_sg.cu
src/traversal/extract_bfs_paths_mg.cu
src/traversal/bfs_sg.cu
27 changes: 27 additions & 0 deletions cpp/include/cugraph/algorithms.hpp
@@ -1503,6 +1503,7 @@ void core_number(raft::handle_t const& handle,

/**
* @brief Multi-GPU Uniform Neighborhood Sampling.
* @deprecated will be removed later in this release (22.06)
*
* @tparam graph_view_t Type of graph view.
* @tparam gpu_t Type of rank (GPU) indices;
@@ -1536,6 +1537,32 @@ uniform_nbr_sample(raft::handle_t const& handle,
std::vector<int> const& h_fan_out,
bool with_replacement = true);

/**
* @brief Multi-GPU Uniform Neighborhood Sampling.
Review comment (Contributor): Is this really a Multi-GPU-only thing, or is it for both SG and MG?

Reply (Collaborator, author): Both. Updated the comment.

*
* @tparam graph_view_t Type of graph view.
* @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
* handles to various CUDA libraries) to run graph algorithms.
* @param graph_view Graph View object to generate NBR Sampling on.
* @param d_starting_vertices Device span of starting vertex IDs for the NBR Sampling.
Review comment (Contributor): d_starting_vertices => starting_vertices, since we renamed the input parameters.

* @param h_fan_out Host span defining branching out (fan-out) degree per source vertex for each
* level
Review comment (Contributor): h_fan_out => fan_out.

* @param with_replacement boolean flag specifying if random sampling is done with replacement
* (true); or, without replacement (false); default = true;
* @return tuple of tuple of device vectors and counts:
* ((vertex_t source_vertex, vertex_t destination_vertex, int rank, edge_t index), rx_counts)
Review comment (Contributor): I guess this comment is an outdated copy-and-paste from the previous implementation. I assume we are returning a tuple of edge source, edge destination, and edge weight vectors (the last might actually be edge IDs right at this moment?).

Reply (Collaborator, author): Fixed in the next push.

*/
template <typename graph_view_t>
Review comment (Contributor): Yeah... and we are sort of mixing

template <typename graph_view_t> with using typename graph_view_t::vertex_type, ...

and

template <typename vertex_t, typename edge_t, typename weight_t, bool store_transposed, bool multi_gpu> with using graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>.

I think we'd better be consistent. Any preference for one over the other?

Reply (Collaborator, author): No strong preference for me.

There is, I think, an advantage to the template <typename graph_view_t> approach: if we change the implementation of graph_view (adding or removing a template parameter), the API works without modification as long as typename graph_view_t::vertex_type is still defined. I believe Andrei copied this from my Louvain definition, which uses this approach. I implemented Louvain this way so that I could support both the legacy graph and graph_t with the same API.

But the syntax is a bit cleaner with your original approach. I don't think it's likely that we will frequently change the template signature of the API, and we will eventually get rid of the legacy graph class.

I'd be happy to change this back to your original approach; or, if we like the template <typename graph_view_t> approach better, I can add that to the list of things to gradually update in the code.

Reply (Contributor): Yeah... I don't have a strong preference either, but I do have a strong preference for consistency.

I am also using template <typename graph_view_t> for primitives, but I am wondering whether I should use graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu> instead.

I am getting more inclined toward the graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu> approach, as this code does not work for a general graph view type but only with our graph_view_t (e.g., the implementation depends on multiple member functions that exist only in graph_view_t).

And hopefully we can eliminate the legacy code sooner rather than later; at that point I slightly prefer graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>, even though this has very minimal impact on end-user experience.

Reply (Collaborator, author): Sounds good. I will make those changes in the next push. I will leave Louvain as it is for now; I plan to create a PR to add Louvain to the C API, and I will refactor the Louvain API in that PR.
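For illustration, a minimal declaration-only sketch of the two styles under discussion (hypothetical toy signatures, not the actual cugraph declarations; example_algorithm is an invented name):

// Style 1: generic over the whole view type. Survives changes to
// graph_view_t's template parameter list as long as the nested
// typedefs (vertex_type, weight_type, ...) keep existing, and can
// also match a legacy graph view type.
template <typename graph_view_t>
rmm::device_uvector<typename graph_view_t::vertex_type> example_algorithm(
  raft::handle_t const& handle, graph_view_t const& graph_view);

// Style 2: explicit parameter list. Only ever matches cugraph's own
// graph_view_t, which is what the implementation requires anyway.
template <typename vertex_t, typename edge_t, typename weight_t, bool store_transposed, bool multi_gpu>
rmm::device_uvector<vertex_t> example_algorithm(
  raft::handle_t const& handle,
  graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu> const& graph_view);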

std::tuple<rmm::device_uvector<typename graph_view_t::vertex_type>,
rmm::device_uvector<typename graph_view_t::vertex_type>,
rmm::device_uvector<typename graph_view_t::weight_type>>
uniform_nbr_sample(raft::handle_t const& handle,
graph_view_t const& graph_view,
raft::device_span<typename graph_view_t::vertex_type> d_starting_vertices,
raft::host_span<const int> h_fan_out,
Review comment (Contributor): I guess the d_ and h_ prefixes here are a bit redundant (especially with device_span and host_span). Otherwise, we should use this naming convention in all the functions in the public API. My current practice is to use d_ and h_ only when we have both host and device vectors with the same name, but I am open to discussion.

Reply (Contributor): Yeah... and this API is way more intuitive than the previous one!

Reply (Collaborator, author): I love how the span variants clean up the API. I'll drop the extra prefixes in the next push.

bool with_replacement = true,
uint64_t seed = 0);
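A usage sketch of the declaration above (hedged: it assumes vertex_t is int32_t, and that handle, graph_view, and num_seeds are set up elsewhere; the fan-out values are arbitrary examples):

// Seed vertices, already resident on the device.
rmm::device_uvector<int32_t> starting_vertices(num_seeds, handle.get_stream());
// ... fill starting_vertices with the seed vertex IDs ...

std::vector<int> fan_out{10, 25};  // 10 neighbors at hop 1, 25 at hop 2

// Returns the sampled edges as (source, destination, weight) vectors.
auto [srcs, dsts, weights] = cugraph::uniform_nbr_sample(
  handle,
  graph_view,
  raft::device_span<int32_t>(starting_vertices.data(), starting_vertices.size()),
  raft::host_span<const int>(fan_out.data(), fan_out.size()),
  true /* with_replacement */);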

/*
* @brief Compute triangle counts.
*
35 changes: 29 additions & 6 deletions cpp/include/cugraph/detail/decompress_edge_partition.cuh
@@ -188,6 +188,7 @@ __global__ void partially_decompress_to_edgelist_high_degree(
vertex_t input_major_count,
vertex_t* output_majors,
vertex_t* output_minors,
thrust::optional<weight_t*> output_weights,
thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property,
thrust::optional<thrust::tuple<edge_t const*, edge_t*>> global_edge_index)
{
@@ -204,6 +205,8 @@
for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
output_majors[major_offset + i] = major;
output_minors[major_offset + i] = indices[i];

if (output_weights) (*output_weights)[major_offset + i] = (*weights)[i];
}
if (property) {
auto input_property = thrust::get<0>(*property)[idx];
@@ -231,6 +234,7 @@ __global__ void partially_decompress_to_edgelist_mid_degree(
vertex_t input_major_count,
vertex_t* output_majors,
vertex_t* output_minors,
thrust::optional<weight_t*> output_weights,
thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property,
thrust::optional<thrust::tuple<edge_t const*, edge_t*>> global_edge_index)
{
@@ -242,11 +246,18 @@
auto major = input_majors[idx];
auto major_partition_offset = static_cast<size_t>(major - edge_partition.major_range_first());
vertex_t const* indices{nullptr};
thrust::optional<weight_t const*> weights{thrust::nullopt};
edge_t local_degree{};

thrust::tie(indices, weights, local_degree) =
edge_partition.local_edges(major_partition_offset);

auto major_offset = input_major_start_offsets[idx];
for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
output_majors[major_offset + i] = major;
output_minors[major_offset + i] = indices[i];

if (output_weights) (*output_weights)[major_offset + i] = (*weights)[i];
}
if (property) {
auto input_property = thrust::get<0>(*property)[idx];
@@ -275,6 +286,7 @@ void partially_decompress_edge_partition_to_fill_edgelist(
std::vector<vertex_t> const& segment_offsets,
vertex_t* majors,
vertex_t* minors,
thrust::optional<weight_t*> weights,
thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property,
thrust::optional<thrust::tuple<edge_t const*, edge_t*>> global_edge_index)
{
@@ -297,6 +309,7 @@
segment_offsets[1],
majors,
minors,
weights,
property ? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[0], thrust::get<1>(*property)))
: thrust::nullopt,
@@ -317,6 +330,7 @@
segment_offsets[2] - segment_offsets[1],
majors,
minors,
weights,
property ? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[1], thrust::get<1>(*property)))
: thrust::nullopt,
@@ -333,10 +347,11 @@ void partially_decompress_edge_partition_to_fill_edgelist(
input_major_start_offsets + segment_offsets[2] - segment_offsets[0],
majors,
minors,
output_weights = weights,
property = property
? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[2], thrust::get<1>(*property)))
: thrust::nullopt,
global_edge_index] __device__(auto idx) {
auto major = input_majors[idx];
auto major_offset = input_major_start_offsets[idx];
@@ -350,6 +365,10 @@ void partially_decompress_edge_partition_to_fill_edgelist(
thrust::fill(
thrust::seq, majors + major_offset, majors + major_offset + local_degree, major);
thrust::copy(thrust::seq, indices, indices + local_degree, minors + major_offset);
if (output_weights)
thrust::copy(
thrust::seq, *weights, *weights + local_degree, *output_weights + major_offset);

Review comment (Contributor): This can lead to thread divergence if local_degree values vary significantly within the threads of a single warp. Maybe add a FIXME statement; I have the same issue in the Triangle Counting implementation (https://github.com/rapidsai/cugraph/pull/2253/files#diff-ce8c8b8ffdc670a97313ca4ce20de7bf8a18daa81f5a1fde50f3b162bf75b75bR434). You may add a similar FIXME; later, we may address this together by adding something like a (delayed) segmented_copy (or fill).

if (property) {
auto major_input_property = thrust::get<0>(*property)[idx];
auto minor_output_property = thrust::get<1>(*property);
@@ -379,10 +398,11 @@ void partially_decompress_edge_partition_to_fill_edgelist(
input_major_start_offsets + segment_offsets[3] - segment_offsets[0],
majors,
minors,
output_weights = weights,
property = property
? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[3], thrust::get<1>(*property)))
: thrust::nullopt,
global_edge_index] __device__(auto idx) {
auto major = input_majors[idx];
auto major_offset = input_major_start_offsets[idx];
@@ -395,6 +415,9 @@ void partially_decompress_edge_partition_to_fill_edgelist(
thrust::fill(
thrust::seq, majors + major_offset, majors + major_offset + local_degree, major);
thrust::copy(thrust::seq, indices, indices + local_degree, minors + major_offset);
if (output_weights)
thrust::copy(
thrust::seq, *weights, *weights + local_degree, *output_weights + major_offset);
if (property) {
auto major_input_property = thrust::get<0>(*property)[idx];
auto minor_output_property = thrust::get<1>(*property);
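To make the reviewer's divergence concern above concrete, here is a standalone illustrative kernel (not cugraph code; all names invented) with the same per-thread sequential copy pattern:

// Each thread copies degrees[v] elements with a sequential loop, so a
// warp's runtime is dictated by the largest degree among its 32 lanes;
// lanes with small degrees sit idle. A segmented (cooperative) copy
// would avoid this.
__global__ void divergent_copy(int const* offsets,  // per-vertex output offsets
                               int const* degrees,  // per-vertex local degree
                               float const* in,
                               float* out,
                               int num_vertices)
{
  int v = blockIdx.x * blockDim.x + threadIdx.x;
  if (v >= num_vertices) return;
  for (int i = 0; i < degrees[v]; ++i) {
    out[offsets[v] + i] = in[offsets[v] + i];
  }
}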
4 changes: 2 additions & 2 deletions cpp/include/cugraph/detail/graph_functions.cuh
@@ -37,8 +37,8 @@
#include <vector>

namespace cugraph {

namespace detail {
namespace original {

/**
* @brief Compute local out degrees of the majors belonging to the adjacency matrices
@@ -238,6 +238,6 @@ gather_one_hop_edgelist(
const rmm::device_uvector<prop_t>& active_major_property,
const rmm::device_uvector<typename GraphViewType::edge_type>& global_adjacency_list_offsets);

} // namespace original
} // namespace detail

} // namespace cugraph
17 changes: 17 additions & 0 deletions cpp/include/cugraph/detail/graph_utils.cuh
@@ -25,6 +25,7 @@
#include <rmm/exec_policy.hpp>

#include <cuco/detail/hash_functions.cuh>
#include <thrust/binary_search.h>
#include <thrust/sort.h>
#include <thrust/tabulate.h>
#include <thrust/transform.h>
@@ -47,6 +48,22 @@ struct compute_gpu_id_from_vertex_t {
}
};

template <typename vertex_t>
struct compute_gpu_id_from_int_vertex_t {
Review comment (Contributor): Should we also rename the other functors working on external vertex IDs to use ext_vertex_t and ext_edge_t?

Reply (Collaborator, author): Done for vertex in the next push. Do we ever use these functors on an int_edge_t? I'm inclined not to add ext to the name unless we need to distinguish.

Reply (Contributor): Gotcha, agreed.

vertex_t const* vertex_partition_range_lasts;
size_t num_vertex_partitions;
Review comment (Contributor): Yeah... maybe just add a FIXME statement, but we should eventually replace these (pointer, size) pairs with raft::device_span.

Reply (Collaborator, author): Changed to span in the next push.


__device__ int operator()(vertex_t v) const
{
return static_cast<int>(
thrust::distance(vertex_partition_range_lasts,
thrust::upper_bound(thrust::seq,
vertex_partition_range_lasts,
vertex_partition_range_lasts + num_vertex_partitions,
v)));
}
};

template <typename vertex_t>
struct compute_gpu_id_from_edge_t {
int comm_size{0};
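As a sanity check of compute_gpu_id_from_int_vertex_t above, the same upper_bound computation restated on the host, with illustrative partition boundaries:

#include <algorithm>
#include <vector>

// With 4 GPUs owning internal vertex ranges [0,100), [100,250),
// [250,300), and [300,400), vertex_partition_range_lasts is
// {100, 250, 300, 400}, and the owning rank of v is the index of the
// first boundary greater than v.
int owning_gpu(int v, std::vector<int> const& range_lasts)
{
  return static_cast<int>(
    std::distance(range_lasts.begin(),
                  std::upper_bound(range_lasts.begin(), range_lasts.end(), v)));
}

// owning_gpu(99,  {100, 250, 300, 400}) == 0
// owning_gpu(100, {100, 250, 300, 400}) == 1
// owning_gpu(399, {100, 250, 300, 400}) == 3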
17 changes: 17 additions & 0 deletions cpp/include/cugraph/detail/shuffle_wrappers.hpp
@@ -62,6 +62,23 @@ template <typename vertex_t>
rmm::device_uvector<vertex_t> shuffle_vertices_by_gpu_id(
raft::handle_t const& handle, rmm::device_uvector<vertex_t>&& d_vertices);

/**
* @brief Shuffle vertices using internal vertex ids
*
* @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
*
* @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
* handles to various CUDA libraries) to run graph algorithms.
* @param[in] d_vertices Vertex IDs to shuffle
* @param[in] vertex_partition_range_lasts From the graph view, vector of the last vertex ID for each GPU
*
* @return device vector of shuffled vertices
*/
template <typename vertex_t>
rmm::device_uvector<vertex_t> shuffle_int_vertices_by_gpu_id(
raft::handle_t const& handle,
rmm::device_uvector<vertex_t>&& d_vertices,
std::vector<vertex_t> const& vertex_partition_range_lasts);

/**
* @brief Groupby and count edgelist using the key function which returns the target local partition
* ID for an edge.
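A hedged usage sketch of shuffle_int_vertices_by_gpu_id declared above (it assumes a multi-GPU context in which handle, graph_view, and a device vector d_vertices of internal vertex IDs already exist):

// The partition boundaries come from the graph view, as the doc
// comment above notes.
std::vector<int32_t> range_lasts = graph_view.vertex_partition_range_lasts();
auto shuffled = cugraph::detail::shuffle_int_vertices_by_gpu_id(
  handle, std::move(d_vertices), range_lasts);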
17 changes: 17 additions & 0 deletions cpp/include/cugraph/detail/utility_wrappers.hpp
@@ -122,5 +122,22 @@ vertex_t compute_maximum_vertex_id(rmm::cuda_stream_view const& stream_view,
stream_view, d_edgelist_srcs.data(), d_edgelist_dsts.data(), d_edgelist_srcs.size());
}

/**
* @brief Filter zero-degree vertices from this frontier
*
* @tparam vertex_t vertex type
* @tparam edge_t edge type
* @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
* handles to various CUDA libraries) to run graph algorithms.
* @param d_vertices The input list of vertices
* @param d_out_degs The output degree of each vertex
* @return A tuple of device vectors: the updated list of vertices and their output degrees
*/
template <typename vertex_t, typename edge_t>
std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<edge_t>> filter_degree_0_vertices(
raft::handle_t const& handle,
rmm::device_uvector<vertex_t>&& d_vertices,
rmm::device_uvector<edge_t>&& d_out_degs);

} // namespace detail
} // namespace cugraph
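A minimal sketch of how filter_degree_0_vertices could be implemented with thrust::remove_if over a zipped (vertex, degree) range; this is an assumption for illustration, not the actual cugraph source:

#include <thrust/distance.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/remove.h>
#include <thrust/tuple.h>

#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <tuple>

template <typename vertex_t, typename edge_t>
std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<edge_t>>
filter_degree_0_vertices_sketch(raft::handle_t const& handle,
                                rmm::device_uvector<vertex_t>&& d_vertices,
                                rmm::device_uvector<edge_t>&& d_out_degs)
{
  auto pair_first = thrust::make_zip_iterator(
    thrust::make_tuple(d_vertices.begin(), d_out_degs.begin()));
  // Compact in place, keeping only entries whose out-degree is non-zero.
  auto pair_last = thrust::remove_if(
    rmm::exec_policy(handle.get_stream()),
    pair_first,
    pair_first + d_vertices.size(),
    [] __device__(auto pair) { return thrust::get<1>(pair) == edge_t{0}; });
  auto new_size = static_cast<size_t>(thrust::distance(pair_first, pair_last));
  d_vertices.resize(new_size, handle.get_stream());
  d_out_degs.resize(new_size, handle.get_stream());
  return std::make_tuple(std::move(d_vertices), std::move(d_out_degs));
}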
22 changes: 7 additions & 15 deletions cpp/src/c_api/uniform_neighbor_sampling.cpp
@@ -202,11 +202,6 @@ struct experimental_uniform_neighbor_sampling_functor : public cugraph::c_api::a
if constexpr (!cugraph::is_candidate<vertex_t, edge_t, weight_t>::value) {
unsupported();
} else {
#if 1
unsupported();
#else
// IMPLEMENTATION WILL GO HERE

// uniform_nbr_sample expects store_transposed == false
if constexpr (store_transposed) {
error_code_ = cugraph::c_api::
@@ -238,14 +233,12 @@ struct experimental_uniform_neighbor_sampling_functor : public cugraph::c_api::a
graph_view.local_vertex_partition_range_last(),
false);

// C++ API wants an std::vector
std::vector<int> fan_out(fan_out_->size_);
std::copy_n(fan_out_->as_type<int>(), fan_out_->size_, fan_out.data());

auto&& [tmp_tuple, counts] = cugraph::uniform_nbr_sample(
handle_, graph_view, start.data(), start.size(), fan_out, with_replacement_);

auto&& [srcs, dsts, labels, indices] = tmp_tuple;
auto&& [srcs, dsts, weights] = cugraph::uniform_nbr_sample(
handle_,
graph_view,
raft::device_span<vertex_t>(start.data(), start.size()),
raft::host_span<const int>(fan_out_->as_type<const int>(), fan_out_->size_),
with_replacement_);

std::vector<vertex_t> vertex_partition_lasts = graph_view.vertex_partition_range_lasts();

@@ -268,9 +261,8 @@ struct experimental_uniform_neighbor_sampling_functor : public cugraph::c_api::a
new cugraph::c_api::cugraph_type_erased_device_array_t(srcs, graph_->vertex_type_),
new cugraph::c_api::cugraph_type_erased_device_array_t(dsts, graph_->vertex_type_),
nullptr,
new cugraph::c_api::cugraph_type_erased_device_array_t(indices, graph_->edge_type_),
new cugraph::c_api::cugraph_type_erased_device_array_t(weights, graph_->weight_type_),
nullptr};
#endif
}
}
};