Fix Random Walks output format and handle missing weights. (#1567)

This PR tracks work on missing weights (#1566) by providing default `weight_t{1}` weights. It also addresses output format changes, as follows: 1. offsets of the vertex paths' starting indices are returned, instead of sizes; 2. a set of (offset, size) pairs is returned for the edge (weight) paths; Example: for edge paths with offsets 0,3,3,5,... (meaning the 1st path has 3 edges, the 2nd path has 0 edges (!), the 3rd has 2 edges, etc.), the return is: (0,3), (3,0), (3,2), (5,...), ...; 3. the remaining output format stays the same (i.e., coalesced vertex sets and coalesced weight sets). Authors: - Andrei Schaffer (https://github.com/aschaffer) Approvers: - Seunghwa Kang (https://github.com/seunghwak) URL: #1567
rapidsai · May 5, 2021 · 50b43f7 · 50b43f7
1 parent 79f0ed9
commit 50b43f7
Show file tree

Hide file tree

Showing 6 changed files with 524 additions and 92 deletions.
diff --git a/cpp/include/algorithms.hpp b/cpp/include/algorithms.hpp
@@ -1265,11 +1265,19 @@ extract_ego(raft::handle_t const &handle,
  * @param ptr_d_start Device pointer to set of starting vertex indices for the RW.
  * @param num_paths = number(paths).
  * @param max_depth maximum length of RWs.
- * @return std::tuple<device_vec_t<vertex_t>, device_vec_t<weight_t>,
- * device_vec_t<index_t>> Triplet of coalesced RW paths, with corresponding edge weights for
- * each, and corresponding path sizes. This is meant to minimize the number of DF's to be passed to
- * the Python layer. The meaning of "coalesced" here is that a 2D array of paths of different sizes
- * is represented as a 1D array.
+ * @param use_padding (optional) specifies whether the return uses the padded format (true) or
+ * the coalesced (compressed) format (false, the default); when padding is used the output is a
+ * matrix of vertex paths and a matrix of edge paths (weights); in this case the matrices are
+ * stored in row-major order; the vertex path matrix is padded with `num_vertices` values and the
+ * weight matrix is padded with `0` values.
+ * @return std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<weight_t>,
+ * rmm::device_uvector<index_t>> Triplet of either padded or coalesced RW paths; in the coalesced
+ * case (default), the return consists of the vertex paths, the corresponding edge weights for
+ * each path, and the corresponding path sizes. This is meant to minimize the number of DF's to be
+ * passed to the Python layer. The meaning of "coalesced" here is that a 2D array of paths of
+ * different sizes is represented as a 1D contiguous array. In the padded case the return is a
+ * matrix of num_paths x max_depth vertex paths and a matrix of num_paths x (max_depth-1) edge
+ * (weight) paths, with an empty array of sizes. Note: if the graph is un-weighted the edge
+ * (weight) paths consist of `weight_t{1}` entries.
  */
 template <typename graph_t, typename index_t>
 std::tuple<rmm::device_uvector<typename graph_t::vertex_type>,
@@ -1279,7 +1287,8 @@ random_walks(raft::handle_t const &handle,
              graph_t const &graph,
              typename graph_t::vertex_type const *ptr_d_start,
              index_t num_paths,
-             index_t max_depth);
+             index_t max_depth,
+             bool use_padding = false);
 
 }  // namespace experimental
 }  // namespace cugraph
diff --git a/cpp/include/utilities/path_retrieval.hpp b/cpp/include/utilities/path_retrieval.hpp
@@ -67,5 +67,21 @@ std::
                        index_t num_paths,
                        rmm::device_buffer &&d_coalesced_v,
                        rmm::device_buffer &&d_sizes);
+
+/**
+ * @brief Returns additional RW information for the coalesced case: the vertex path offsets, and
+ * the weight path sizes and offsets (the padded case does not need or provide this information).
+ *
+ * @tparam index_t Type used to store indexing and sizes.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param num_paths number of paths.
+ * @param ptr_d_sizes sizes of vertex paths.
+ * @return tuple of (vertex_path_offsets, weight_path_sizes, weight_path_offsets), where each
+ * offsets array is the exclusive scan of the corresponding sizes array.
+ */
+template <typename index_t>
+std::tuple<rmm::device_uvector<index_t>, rmm::device_uvector<index_t>, rmm::device_uvector<index_t>>
+query_rw_sizes_offsets(raft::handle_t const &handle, index_t num_paths, index_t const *ptr_d_sizes);
 }  // namespace experimental
 }  // namespace cugraph
diff --git a/cpp/src/sampling/random_walks.cu b/cpp/src/sampling/random_walks.cu
@@ -30,23 +30,26 @@ template std::
                graph_view_t<int32_t, int32_t, float, false, false> const& gview,
                int32_t const* ptr_d_start,
                int32_t num_paths,
-               int32_t max_depth);
+               int32_t max_depth,
+               bool use_padding);
 
 template std::
   tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<float>, rmm::device_uvector<int64_t>>
   random_walks(raft::handle_t const& handle,
                graph_view_t<int32_t, int64_t, float, false, false> const& gview,
                int32_t const* ptr_d_start,
                int64_t num_paths,
-               int64_t max_depth);
+               int64_t max_depth,
+               bool use_padding);
 
 template std::
   tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<float>, rmm::device_uvector<int64_t>>
   random_walks(raft::handle_t const& handle,
                graph_view_t<int64_t, int64_t, float, false, false> const& gview,
                int64_t const* ptr_d_start,
                int64_t num_paths,
-               int64_t max_depth);
+               int64_t max_depth,
+               bool use_padding);
 //}
 //
 // SG FP64{
@@ -56,23 +59,27 @@ template std::
                graph_view_t<int32_t, int32_t, double, false, false> const& gview,
                int32_t const* ptr_d_start,
                int32_t num_paths,
-               int32_t max_depth);
+               int32_t max_depth,
+               bool use_padding);
 
 template std::
   tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<double>, rmm::device_uvector<int64_t>>
   random_walks(raft::handle_t const& handle,
                graph_view_t<int32_t, int64_t, double, false, false> const& gview,
                int32_t const* ptr_d_start,
                int64_t num_paths,
-               int64_t max_depth);
+               int64_t max_depth,
+               bool use_padding);
 
 template std::
   tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<double>, rmm::device_uvector<int64_t>>
   random_walks(raft::handle_t const& handle,
                graph_view_t<int64_t, int64_t, double, false, false> const& gview,
                int64_t const* ptr_d_start,
                int64_t num_paths,
-               int64_t max_depth);
+               int64_t max_depth,
+               bool use_padding);
+//}
 
 template std::
   tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
@@ -97,6 +104,16 @@ template std::
                        int64_t num_paths,
                        rmm::device_buffer&& d_coalesced_v,
                        rmm::device_buffer&& d_sizes);
-//}
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>>
+query_rw_sizes_offsets(raft::handle_t const& handle, int32_t num_paths, int32_t const* ptr_d_sizes);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>>
+query_rw_sizes_offsets(raft::handle_t const& handle, int64_t num_paths, int64_t const* ptr_d_sizes);
+
 }  // namespace experimental
 }  // namespace cugraph