MNMG Neighborhood Sampling (#2073)
This PR tracks work on MNMG Neighborhood Sampling for G(C)NN needs.

Dependencies:
1. #1982
2. #2064
3. Integration of rapidsai/cugraph-ops#24 into `cugraph`

Authors:
  - Andrei Schaffer (https://github.com/aschaffer)

Approvers:
  - Seunghwa Kang (https://github.com/seunghwak)
  - Kumar Aatish (https://github.com/kaatish)
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Rick Ratzel (https://github.com/rlratzel)

URL: #2073
aschaffer authored Mar 17, 2022
1 parent 08ab284 commit 9732e5e
Showing 13 changed files with 1,640 additions and 340 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -184,6 +184,7 @@ add_library(cugraph SHARED
src/sampling/neighborhood.cu
src/sampling/random_walks.cu
src/sampling/detail/gather_utils_impl.cu
src/sampling/nbr_sampling_mg.cu
src/cores/legacy/core_number.cu
src/cores/core_number_sg.cu
src/cores/core_number_mg.cu
35 changes: 35 additions & 0 deletions cpp/include/cugraph/algorithms.hpp
@@ -1538,4 +1538,39 @@ void core_number(raft::handle_t const& handle,
size_t k_last = std::numeric_limits<size_t>::max(),
bool do_expensive_check = false);

/**
* @brief Multi-GPU Uniform Neighborhood Sampling.
*
* @tparam graph_view_t Type of graph view.
* @tparam gpu_t Type of rank (GPU) indices.
* @tparam index_t Type used for indexing; typically edge_t.
* @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
* handles to various CUDA libraries) to run graph algorithms.
* @param graph_view Graph View object to generate NBR Sampling on.
* @param ptr_d_starting_vertices Device array of starting vertex IDs for the NBR Sampling.
* @param ptr_d_ranks Device array of rank IDs (GPU IDs) for the NBR Sampling.
* @param num_starting_vertices Size of the starting vertex set.
* @param h_fan_out Vector of branching-out (fan-out) degrees per source vertex for each level;
* used for obtaining local out-degree information.
* @param with_replacement Boolean flag specifying whether random sampling is done with
* replacement (true) or without replacement (false); default = true.
* @return Tuple of a tuple of device vectors and a vector of receive counts:
* ((vertex_t source_vertex, vertex_t destination_vertex, gpu_t rank, index_t index), rx_counts)
*/
template <typename graph_view_t,
typename gpu_t,
typename index_t = typename graph_view_t::edge_type>
std::tuple<std::tuple<rmm::device_uvector<typename graph_view_t::vertex_type>,
rmm::device_uvector<typename graph_view_t::vertex_type>,
rmm::device_uvector<gpu_t>,
rmm::device_uvector<index_t>>,
std::vector<size_t>>
uniform_nbr_sample(raft::handle_t const& handle,
graph_view_t const& graph_view,
typename graph_view_t::vertex_type const* ptr_d_starting_vertices,
gpu_t const* ptr_d_ranks,
size_t num_starting_vertices,
std::vector<int> const& h_fan_out,
bool with_replacement = true);

} // namespace cugraph
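For reference, a minimal caller sketch of the new uniform_nbr_sample API (not part of this diff). It assumes an MNMG-initialized raft::handle_t and an already-constructed multi-GPU graph view on the local rank; the wrapper function name, the graph-view template parameters, and all variable names are illustrative assumptions.

// Hypothetical caller sketch (not from this PR): two-hop uniform neighborhood sampling.
#include <cugraph/algorithms.hpp>
#include <cugraph/graph_view.hpp>

#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>

#include <vector>

using vertex_t = int32_t;
using gpu_id_t = int32_t;

void sample_two_hops(raft::handle_t const& handle,
                     cugraph::graph_view_t<vertex_t, int32_t, float, false, true> const& mg_graph_view,
                     rmm::device_uvector<vertex_t> const& d_start_vertices,
                     rmm::device_uvector<gpu_id_t> const& d_start_ranks)
{
  std::vector<int> fan_out{4, 2};  // sample 4 neighbors at hop 1, 2 at hop 2

  auto [sampled, rx_counts] =
    cugraph::uniform_nbr_sample(handle,
                                mg_graph_view,
                                d_start_vertices.data(),
                                d_start_ranks.data(),
                                d_start_vertices.size(),
                                fan_out,
                                true /* with_replacement */);

  // sampled = (source vertices, destination vertices, owning ranks, edge indices),
  // each as a separate device vector; rx_counts holds the per-rank receive counts.
  auto& [d_src, d_dst, d_rank, d_edge_index] = sampled;
  // ... hand (d_src, d_dst) to downstream G(C)NN mini-batch construction ...
}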
28 changes: 15 additions & 13 deletions cpp/include/cugraph/detail/graph_functions.cuh
@@ -158,32 +158,34 @@ partition_information(raft::handle_t const& handle, GraphViewType const& graph_v
* Collect all the edges that are present in the adjacency lists on the current gpu
*
* @tparam GraphViewType Type of the passed non-owning graph object.
* @tparam EdgeIndexIterator Type of the iterator for edge indices.
* @tparam GPUIdIterator Type of the iterator for gpu id identifiers.
* @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
* handles to various CUDA libraries) to run graph algorithms.
* @param graph_view Non-owning graph object.
* @param active_majors_in_row Device vector containing all the vertex id that are processed by
* @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator,
* and handles to various CUDA libraries) to run graph algorithms.
* @param[in] graph_view Non-owning graph object.
* @param[in] active_majors_in_row Device vector containing all the vertex IDs that are processed by
* gpus in the column communicator
* @param active_major_gpu_ids Device vector containing the gpu id associated by every vertex
* @param[in] active_major_gpu_ids Device vector containing the gpu id associated with every vertex
* present in active_majors_in_row
* @param edge_index_first Iterator pointing to the first destination index
* @param indices_per_source Number of indices supplied for every source in the range
* @param[in] minor_map Device vector of destination indices (modifiable in-place) corresponding to
* vertex IDs being returned
* @param[in] indices_per_source Number of indices supplied for every source in the range
* [vertex_input_first, vertex_input_last)
* @param global_degree_offset Global degree offset to local adjacency list for every source
* @param[in] global_degree_offset Global degree offset to local adjacency list for every source
* represented by current gpu
* @return A tuple of device vector containing the majors, minors and gpu_ids gathered locally
* @return A tuple of device vectors containing the majors, minors, gpu_ids and indices gathered
* locally
*/
template <typename GraphViewType, typename EdgeIndexIterator, typename gpu_t>
template <typename GraphViewType, typename gpu_t>
std::tuple<rmm::device_uvector<typename GraphViewType::vertex_type>,
rmm::device_uvector<typename GraphViewType::vertex_type>,
rmm::device_uvector<gpu_t>>
rmm::device_uvector<gpu_t>,
rmm::device_uvector<typename GraphViewType::edge_type>>
gather_local_edges(
raft::handle_t const& handle,
GraphViewType const& graph_view,
const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors_in_row,
const rmm::device_uvector<gpu_t>& active_major_gpu_ids,
EdgeIndexIterator edge_index_first,
rmm::device_uvector<typename GraphViewType::edge_type>&& minor_map,
typename GraphViewType::edge_type indices_per_major,
const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets);

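A hypothetical fragment (also not from this diff) showing how the new four-element result of gather_local_edges might be consumed inside cugraph::detail; the wrapper name consume_gathered_edges and the set-up of its arguments are assumptions.

// Hypothetical sketch: minor_map is moved in, compacted in place, and handed back
// as the fourth element of the result.
#include <cugraph/detail/graph_functions.cuh>

#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>

#include <utility>

template <typename GraphViewType, typename gpu_t>
void consume_gathered_edges(
  raft::handle_t const& handle,
  GraphViewType const& graph_view,
  rmm::device_uvector<typename GraphViewType::vertex_type> const& active_majors_in_row,
  rmm::device_uvector<gpu_t> const& active_major_gpu_ids,
  rmm::device_uvector<typename GraphViewType::edge_type>&& minor_map,
  typename GraphViewType::edge_type indices_per_major,
  rmm::device_uvector<typename GraphViewType::edge_type> const& global_degree_offsets)
{
  auto [majors, minors, minor_gpu_ids, indices] =
    cugraph::detail::gather_local_edges(handle,
                                        graph_view,
                                        active_majors_in_row,
                                        active_major_gpu_ids,
                                        std::move(minor_map),
                                        indices_per_major,
                                        global_degree_offsets);

  // All four vectors now share the same compacted length: one entry per locally found edge.
}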
130 changes: 71 additions & 59 deletions cpp/src/sampling/detail/gather_utils_impl.cu
@@ -203,65 +203,77 @@ template std::tuple<
partition_information(raft::handle_t const& handle,
graph_view_t<int64_t, int64_t, double, false, true> const& graph_view);

template std::
tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
gather_local_edges(raft::handle_t const& handle,
graph_view_t<int32_t, int32_t, float, false, true> const& graph_view,
const rmm::device_uvector<int32_t>& active_majors_in_row,
const rmm::device_uvector<int32_t>& active_major_gpu_ids,
int32_t const* edge_index_first,
int32_t indices_per_major,
const rmm::device_uvector<int32_t>& global_degree_offsets);

template std::
tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
gather_local_edges(raft::handle_t const& handle,
graph_view_t<int32_t, int32_t, double, false, true> const& graph_view,
const rmm::device_uvector<int32_t>& active_majors_in_row,
const rmm::device_uvector<int32_t>& active_major_gpu_ids,
int32_t const* edge_index_first,
int32_t indices_per_major,
const rmm::device_uvector<int32_t>& global_degree_offsets);

template std::
tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
gather_local_edges(raft::handle_t const& handle,
graph_view_t<int32_t, int64_t, float, false, true> const& graph_view,
const rmm::device_uvector<int32_t>& active_majors_in_row,
const rmm::device_uvector<int32_t>& active_major_gpu_ids,
int64_t const* edge_index_first,
int64_t indices_per_major,
const rmm::device_uvector<int64_t>& global_degree_offsets);

template std::
tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
gather_local_edges(raft::handle_t const& handle,
graph_view_t<int32_t, int64_t, double, false, true> const& graph_view,
const rmm::device_uvector<int32_t>& active_majors_in_row,
const rmm::device_uvector<int32_t>& active_major_gpu_ids,
int64_t const* edge_index_first,
int64_t indices_per_major,
const rmm::device_uvector<int64_t>& global_degree_offsets);

template std::
tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>, rmm::device_uvector<int32_t>>
gather_local_edges(raft::handle_t const& handle,
graph_view_t<int64_t, int64_t, float, false, true> const& graph_view,
const rmm::device_uvector<int64_t>& active_majors_in_row,
const rmm::device_uvector<int32_t>& active_major_gpu_ids,
int64_t const* edge_index_first,
int64_t indices_per_major,
const rmm::device_uvector<int64_t>& global_degree_offsets);

template std::
tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>, rmm::device_uvector<int32_t>>
gather_local_edges(raft::handle_t const& handle,
graph_view_t<int64_t, int64_t, double, false, true> const& graph_view,
const rmm::device_uvector<int64_t>& active_majors_in_row,
const rmm::device_uvector<int32_t>& active_major_gpu_ids,
int64_t const* edge_index_first,
int64_t indices_per_major,
const rmm::device_uvector<int64_t>& global_degree_offsets);
template std::tuple<rmm::device_uvector<int32_t>,
rmm::device_uvector<int32_t>,
rmm::device_uvector<int32_t>,
rmm::device_uvector<int32_t>>
gather_local_edges(raft::handle_t const& handle,
graph_view_t<int32_t, int32_t, float, false, true> const& graph_view,
const rmm::device_uvector<int32_t>& active_majors_in_row,
const rmm::device_uvector<int32_t>& active_major_gpu_ids,
rmm::device_uvector<int32_t>&& minor_map,
int32_t indices_per_major,
const rmm::device_uvector<int32_t>& global_degree_offsets);

template std::tuple<rmm::device_uvector<int32_t>,
rmm::device_uvector<int32_t>,
rmm::device_uvector<int32_t>,
rmm::device_uvector<int32_t>>
gather_local_edges(raft::handle_t const& handle,
graph_view_t<int32_t, int32_t, double, false, true> const& graph_view,
const rmm::device_uvector<int32_t>& active_majors_in_row,
const rmm::device_uvector<int32_t>& active_major_gpu_ids,
rmm::device_uvector<int32_t>&& minor_map,
int32_t indices_per_major,
const rmm::device_uvector<int32_t>& global_degree_offsets);

template std::tuple<rmm::device_uvector<int32_t>,
rmm::device_uvector<int32_t>,
rmm::device_uvector<int32_t>,
rmm::device_uvector<int64_t>>
gather_local_edges(raft::handle_t const& handle,
graph_view_t<int32_t, int64_t, float, false, true> const& graph_view,
const rmm::device_uvector<int32_t>& active_majors_in_row,
const rmm::device_uvector<int32_t>& active_major_gpu_ids,
rmm::device_uvector<int64_t>&& minor_map,
int64_t indices_per_major,
const rmm::device_uvector<int64_t>& global_degree_offsets);

template std::tuple<rmm::device_uvector<int32_t>,
rmm::device_uvector<int32_t>,
rmm::device_uvector<int32_t>,
rmm::device_uvector<int64_t>>
gather_local_edges(raft::handle_t const& handle,
graph_view_t<int32_t, int64_t, double, false, true> const& graph_view,
const rmm::device_uvector<int32_t>& active_majors_in_row,
const rmm::device_uvector<int32_t>& active_major_gpu_ids,
rmm::device_uvector<int64_t>&& minor_map,
int64_t indices_per_major,
const rmm::device_uvector<int64_t>& global_degree_offsets);

template std::tuple<rmm::device_uvector<int64_t>,
rmm::device_uvector<int64_t>,
rmm::device_uvector<int32_t>,
rmm::device_uvector<int64_t>>
gather_local_edges(raft::handle_t const& handle,
graph_view_t<int64_t, int64_t, float, false, true> const& graph_view,
const rmm::device_uvector<int64_t>& active_majors_in_row,
const rmm::device_uvector<int32_t>& active_major_gpu_ids,
rmm::device_uvector<int64_t>&& minor_map,
int64_t indices_per_major,
const rmm::device_uvector<int64_t>& global_degree_offsets);

template std::tuple<rmm::device_uvector<int64_t>,
rmm::device_uvector<int64_t>,
rmm::device_uvector<int32_t>,
rmm::device_uvector<int64_t>>
gather_local_edges(raft::handle_t const& handle,
graph_view_t<int64_t, int64_t, double, false, true> const& graph_view,
const rmm::device_uvector<int64_t>& active_majors_in_row,
const rmm::device_uvector<int32_t>& active_major_gpu_ids,
rmm::device_uvector<int64_t>&& minor_map,
int64_t indices_per_major,
const rmm::device_uvector<int64_t>& global_degree_offsets);

} // namespace detail

18 changes: 12 additions & 6 deletions cpp/src/sampling/detail/gather_utils_impl.cuh
@@ -376,16 +376,17 @@ partition_information(raft::handle_t const& handle, GraphViewType const& graph_v
std::move(vc_offsets));
}

template <typename GraphViewType, typename EdgeIndexIterator, typename gpu_t>
template <typename GraphViewType, typename gpu_t>
std::tuple<rmm::device_uvector<typename GraphViewType::vertex_type>,
rmm::device_uvector<typename GraphViewType::vertex_type>,
rmm::device_uvector<gpu_t>>
rmm::device_uvector<gpu_t>,
rmm::device_uvector<typename GraphViewType::edge_type>>
gather_local_edges(
raft::handle_t const& handle,
GraphViewType const& graph_view,
const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors_in_row,
const rmm::device_uvector<gpu_t>& active_major_gpu_ids,
EdgeIndexIterator edge_index_first,
rmm::device_uvector<typename GraphViewType::edge_type>&& minor_map,
typename GraphViewType::edge_type indices_per_major,
const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets)
{
@@ -405,7 +406,7 @@ gather_local_edges(
handle.get_thrust_policy(),
thrust::make_counting_iterator<size_t>(0),
thrust::make_counting_iterator<size_t>(edge_count),
[edge_index_first,
[edge_index_first = minor_map.cbegin(),
active_majors = active_majors_in_row.data(),
active_major_gpu_ids = active_major_gpu_ids.data(),
id_begin = id_begin.data(),
@@ -460,14 +461,16 @@ auto location = location_in_segment + vertex_count_offsets[partition_id];
auto location = location_in_segment + vertex_count_offsets[partition_id];
auto g_degree_offset = global_degree_offsets[location];
auto g_dst_index = edge_index_first[index];

if ((g_dst_index >= g_degree_offset) && (g_dst_index < g_degree_offset + local_out_degree)) {
minors[index] = adjacency_list[g_dst_index - g_degree_offset];
} else {
minors[index] = invalid_vertex_id;
}
});

auto input_iter = thrust::make_zip_iterator(
thrust::make_tuple(majors.begin(), minors.begin(), minor_gpu_ids.begin()));
thrust::make_tuple(majors.begin(), minors.begin(), minor_gpu_ids.begin(), minor_map.begin()));

auto compacted_length = thrust::distance(
input_iter,
@@ -480,7 +483,10 @@ gather_local_edges(
majors.resize(compacted_length, handle.get_stream());
minors.resize(compacted_length, handle.get_stream());
minor_gpu_ids.resize(compacted_length, handle.get_stream());
return std::make_tuple(std::move(majors), std::move(minors), std::move(minor_gpu_ids));
minor_map.resize(compacted_length, handle.get_stream());

return std::make_tuple(
std::move(majors), std::move(minors), std::move(minor_gpu_ids), std::move(minor_map));
}

} // namespace detail
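To make the compaction step above concrete, here is a standalone Thrust sketch (not from this PR) of the zip-iterator pattern it uses: rows whose destination is an invalid sentinel are removed from all four columns in lockstep, and the vectors are then resized to the compacted length. The sentinel value and the toy data are assumptions.

// Standalone illustration of compacting four parallel columns via a zip iterator.
#include <thrust/device_vector.h>
#include <thrust/distance.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/remove.h>
#include <thrust/tuple.h>

#include <cstdint>
#include <limits>
#include <vector>

struct minor_is_invalid {
  int32_t invalid;
  __host__ __device__ bool operator()(thrust::tuple<int32_t, int32_t, int32_t, int32_t> t) const
  {
    return thrust::get<1>(t) == invalid;
  }
};

int main()
{
  int32_t constexpr invalid_vertex_id = std::numeric_limits<int32_t>::max();

  std::vector<int32_t> h_majors{0, 0, 1, 1};
  std::vector<int32_t> h_minors{5, invalid_vertex_id, 7, invalid_vertex_id};
  std::vector<int32_t> h_gpu_ids{0, 0, 1, 1};
  std::vector<int32_t> h_minor_map{2, 3, 0, 1};

  thrust::device_vector<int32_t> majors(h_majors.begin(), h_majors.end());
  thrust::device_vector<int32_t> minors(h_minors.begin(), h_minors.end());
  thrust::device_vector<int32_t> gpu_ids(h_gpu_ids.begin(), h_gpu_ids.end());
  thrust::device_vector<int32_t> minor_map(h_minor_map.begin(), h_minor_map.end());

  auto input_iter = thrust::make_zip_iterator(
    thrust::make_tuple(majors.begin(), minors.begin(), gpu_ids.begin(), minor_map.begin()));

  // Remove every tuple whose minor is invalid; the sibling columns shrink together.
  auto compacted_length = thrust::distance(
    input_iter,
    thrust::remove_if(
      input_iter, input_iter + majors.size(), minor_is_invalid{invalid_vertex_id}));

  majors.resize(compacted_length);
  minors.resize(compacted_length);
  gpu_ids.resize(compacted_length);
  minor_map.resize(compacted_length);  // compacted_length == 2 for this toy input

  return 0;
}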
