fix merge conflict
Joseph Nke committed May 16, 2022
2 parents 8c89335 + 4a6263a commit 3f1d7ef
Showing 139 changed files with 4,236 additions and 880 deletions.
4 changes: 2 additions & 2 deletions benchmarks/python_pytest_based/bench_algos.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -31,7 +31,7 @@ def setFixtureParamNames(*args, **kwargs):

import cugraph
from cugraph.structure.number_map import NumberMap
from cugraph.tests import utils
from cugraph.testing import utils
from cugraph.utilities.utils import is_device_version_less_than
import rmm

7 changes: 3 additions & 4 deletions benchmarks/python_pytest_based/params.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -13,8 +13,7 @@

import pytest

from cugraph.tests.utils import genFixtureParamsProduct
from cugraph.tests import utils
from cugraph.testing import utils
from pathlib import PurePath

# FIXME: write and use mechanism described here for specifying datasets:
@@ -52,7 +51,7 @@
marks=[pytest.mark.poolallocator_off]),
]

FIXTURE_PARAMS = genFixtureParamsProduct(
FIXTURE_PARAMS = utils.genFixtureParamsProduct(
(DIRECTED_DATASETS + UNDIRECTED_DATASETS, "ds"),
(MANAGED_MEMORY, "mm"),
(POOL_ALLOCATOR, "pa"))
15 changes: 8 additions & 7 deletions ci/gpu/build.sh
@@ -38,6 +38,7 @@ export HOME=$WORKSPACE
cd $WORKSPACE
export GIT_DESCRIBE_TAG=`git describe --tags`
export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
unset GIT_DESCRIBE_TAG

# ucx-py version
export UCX_PY_VERSION='0.26.*'
@@ -102,15 +103,15 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then
gpuci_logger "Build from source"
$WORKSPACE/build.sh -v clean libcugraph pylibcugraph cugraph
else
echo "Installing libcugraph-tests"
gpuci_logger "Installing libcugraph-tests"
gpuci_mamba_retry install -c ${CONDA_ARTIFACT_PATH} libcugraph libcugraph_etl libcugraph-tests

gpuci_logger "Install the master version of dask and distributed"
pip install "git+https://github.com/dask/distributed.git" --upgrade --no-deps
pip install "git+https://github.com/dask/dask.git" --upgrade --no-deps

echo "Build pylibcugraph and cugraph..."
$WORKSPACE/build.sh pylibcugraph cugraph
gpuci_logger "Building and installing pylibcugraph and cugraph..."
export CONDA_BLD_DIR="${WORKSPACE}/.conda-bld"
export VERSION_SUFFIX=""
gpuci_conda_retry build conda/recipes/pylibcugraph --no-build-id --croot ${CONDA_BLD_DIR} -c ${CONDA_ARTIFACT_PATH} --python=${PYTHON}
gpuci_conda_retry build conda/recipes/cugraph --no-build-id --croot ${CONDA_BLD_DIR} -c ${CONDA_ARTIFACT_PATH} --python=${PYTHON}
gpuci_mamba_retry install cugraph pylibcugraph -c ${CONDA_BLD_DIR} -c ${CONDA_ARTIFACT_PATH}
fi

################################################################################
7 changes: 5 additions & 2 deletions cpp/CMakeLists.txt
@@ -188,7 +188,11 @@ add_library(cugraph
src/sampling/neighborhood.cu
src/sampling/random_walks.cu
src/sampling/detail/gather_utils_impl.cu
src/sampling/detail/sampling_utils_mg.cu
src/sampling/detail/sampling_utils_sg.cu
src/sampling/nbr_sampling_mg.cu
src/sampling/uniform_neighbor_sampling_mg.cpp
src/sampling/uniform_neighbor_sampling_sg.cpp
src/cores/legacy/core_number.cu
src/cores/core_number_sg.cu
src/cores/core_number_mg.cu
@@ -213,8 +217,7 @@ add_library(cugraph
src/structure/relabel_sg.cu
src/structure/relabel_mg.cu
src/structure/induced_subgraph_sg.cu
## FIXME: Not currently supported
##src/structure/induced_subgraph_mg.cu
src/structure/induced_subgraph_mg.cu
src/traversal/extract_bfs_paths_sg.cu
src/traversal/extract_bfs_paths_mg.cu
src/traversal/bfs_sg.cu
43 changes: 39 additions & 4 deletions cpp/include/cugraph/algorithms.hpp
@@ -27,6 +27,7 @@
#include <cugraph-ops/graph/sampling.hpp>

#include <raft/handle.hpp>
#include <raft/random/rng_state.hpp>
#include <raft/span.hpp>

namespace cugraph {
@@ -1389,7 +1390,7 @@ random_walks(raft::handle_t const& handle,
* single-gpu).
* @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
* handles to various CUDA libraries) to run graph algorithms.
* @param rng The Rng (stateful) instance holding pseudo-random number generator state.
* @param rng_state The RngState instance holding pseudo-random number generator state.
* @param graph Graph (view )object to sub-sample.
* @param ptr_d_start Device pointer to set of starting vertex indices for the sub-sampling.
* @param num_start_vertices = number(vertices) to use for sub-sampling.
@@ -1404,7 +1405,7 @@ template <typename graph_t>
std::tuple<rmm::device_uvector<typename graph_t::edge_type>,
rmm::device_uvector<typename graph_t::vertex_type>>
sample_neighbors_adjacency_list(raft::handle_t const& handle,
ops::gnn::graph::Rng& rng,
raft::random::RngState& rng_state,
graph_t const& graph,
typename graph_t::vertex_type const* ptr_d_start,
size_t num_start_vertices,
@@ -1420,7 +1421,7 @@ sample_neighbors_adjacency_list(raft::handle_t const& handle,
* single-gpu).
* @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
* handles to various CUDA libraries) to run graph algorithms.
* @param rng The Rng (stateful) instance holding pseudo-random number generator state.
* @param rng_state The RngState instance holding pseudo-random number generator state.
* @param graph Graph (view )object to sub-sample.
* @param ptr_d_start Device pointer to set of starting vertex indices for the sub-sampling.
* @param num_start_vertices = number(vertices) to use for sub-sampling.
@@ -1435,7 +1436,7 @@ template <typename graph_t>
std::tuple<rmm::device_uvector<typename graph_t::vertex_type>,
rmm::device_uvector<typename graph_t::vertex_type>>
sample_neighbors_edgelist(raft::handle_t const& handle,
ops::gnn::graph::Rng& rng,
raft::random::RngState& rng_state,
graph_t const& graph,
typename graph_t::vertex_type const* ptr_d_start,
size_t num_start_vertices,
@@ -1502,6 +1503,7 @@ void core_number(raft::handle_t const& handle,

/**
* @brief Multi-GPU Uniform Neighborhood Sampling.
* @deprecated will be removed later in this release (22.06)
*
* @tparam graph_view_t Type of graph view.
* @tparam gpu_t Type of rank (GPU) indices;
@@ -1535,6 +1537,39 @@ uniform_nbr_sample(raft::handle_t const& handle,
std::vector<int> const& h_fan_out,
bool with_replacement = true);

/**
* @brief Uniform Neighborhood Sampling.
*
* @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
* @tparam edge_t Type of edge identifiers. Needs to be an integral type.
* @tparam weight_t Type of edge weights. Needs to be a floating point type.
* @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
* @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
* handles to various CUDA libraries) to run graph algorithms.
* @param graph_view Graph View object to generate NBR Sampling on.
* @param starting_vertices Device span of starting vertex IDs for the NBR Sampling.
* @param fan_out Host span defining branching out (fan-out) degree per source vertex for each
* level
* @param with_replacement boolean flag specifying if random sampling is done with replacement
* (true); or, without replacement (false); default = true;
* @param seed A seed to initialize the random number generator
* @return tuple device vectors (vertex_t source_vertex, vertex_t destination_vertex, weight_t wgt)
*/
template <typename vertex_t,
typename edge_t,
typename weight_t,
bool store_transposed,
bool multi_gpu>
std::
tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>, rmm::device_uvector<weight_t>>
uniform_nbr_sample(
raft::handle_t const& handle,
graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu> const& graph_view,
raft::device_span<vertex_t> starting_vertices,
raft::host_span<const int> fan_out,
bool with_replacement = true,
uint64_t seed = 0);
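
Editorial note: the following is a minimal, hedged sketch of how a caller might invoke the new uniform_nbr_sample overload declared above. The names handle, graph_view, and d_starts are hypothetical (not part of this commit), and the exact span construction may differ from the released API.

// Sketch only: invoking the new single/multi-GPU uniform_nbr_sample overload.
// `handle`, `graph_view`, and `d_starts` are assumed to already exist.
#include <cugraph/algorithms.hpp>
#include <raft/handle.hpp>
#include <raft/span.hpp>
#include <rmm/device_uvector.hpp>
#include <vector>

template <typename vertex_t, typename edge_t, typename weight_t>
void sample_two_hops(raft::handle_t const& handle,
                     cugraph::graph_view_t<vertex_t, edge_t, weight_t, false, false> const& graph_view,
                     rmm::device_uvector<vertex_t>& d_starts)
{
  std::vector<int> fan_out{10, 5};  // 10 neighbors in hop 1, then 5 per sampled vertex in hop 2

  auto [srcs, dsts, wgts] = cugraph::uniform_nbr_sample(
    handle,
    graph_view,
    raft::device_span<vertex_t>{d_starts.data(), d_starts.size()},
    raft::host_span<const int>{fan_out.data(), fan_out.size()},
    true /* with_replacement */,
    42 /* seed */);

  // srcs/dsts/wgts are device vectors holding the sampled edge list.
}
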

/*
* @brief Compute triangle counts.
*
38 changes: 32 additions & 6 deletions cpp/include/cugraph/detail/decompress_edge_partition.cuh
@@ -188,6 +188,7 @@ __global__ void partially_decompress_to_edgelist_high_degree(
vertex_t input_major_count,
vertex_t* output_majors,
vertex_t* output_minors,
thrust::optional<weight_t*> output_weights,
thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property,
thrust::optional<thrust::tuple<edge_t const*, edge_t*>> global_edge_index)
{
@@ -204,6 +205,8 @@
for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
output_majors[major_offset + i] = major;
output_minors[major_offset + i] = indices[i];

if (output_weights) (*output_weights)[major_offset + i] = (*weights)[i];
}
if (property) {
auto input_property = thrust::get<0>(*property)[idx];
@@ -231,6 +234,7 @@ __global__ void partially_decompress_to_edgelist_mid_degree(
vertex_t input_major_count,
vertex_t* output_majors,
vertex_t* output_minors,
thrust::optional<weight_t*> output_weights,
thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property,
thrust::optional<thrust::tuple<edge_t const*, edge_t*>> global_edge_index)
{
@@ -242,11 +246,18 @@
auto major = input_majors[idx];
auto major_partition_offset = static_cast<size_t>(major - edge_partition.major_range_first());
vertex_t const* indices{nullptr};
thrust::optional<weight_t const*> weights{thrust::nullopt};
edge_t local_degree{};

thrust::tie(indices, weights, local_degree) =
edge_partition.local_edges(major_partition_offset);

auto major_offset = input_major_start_offsets[idx];
for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
output_majors[major_offset + i] = major;
output_minors[major_offset + i] = indices[i];

if (output_weights) (*output_weights)[major_offset + i] = (*weights)[i];
}
if (property) {
auto input_property = thrust::get<0>(*property)[idx];
@@ -275,6 +286,7 @@ void partially_decompress_edge_partition_to_fill_edgelist(
std::vector<vertex_t> const& segment_offsets,
vertex_t* majors,
vertex_t* minors,
thrust::optional<weight_t*> weights,
thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property,
thrust::optional<thrust::tuple<edge_t const*, edge_t*>> global_edge_index)
{
@@ -297,6 +309,7 @@
segment_offsets[1],
majors,
minors,
weights,
property ? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[0], thrust::get<1>(*property)))
: thrust::nullopt,
@@ -317,6 +330,7 @@
segment_offsets[2] - segment_offsets[1],
majors,
minors,
weights,
property ? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[1], thrust::get<1>(*property)))
: thrust::nullopt,
@@ -333,10 +347,11 @@
input_major_start_offsets + segment_offsets[2] - segment_offsets[0],
majors,
minors,
property = property
? thrust::make_optional(thrust::make_tuple(
output_weights = weights,
property = property
? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[2], thrust::get<1>(*property)))
: thrust::nullopt,
: thrust::nullopt,
global_edge_index] __device__(auto idx) {
auto major = input_majors[idx];
auto major_offset = input_major_start_offsets[idx];
Expand All @@ -347,9 +362,16 @@ void partially_decompress_edge_partition_to_fill_edgelist(
edge_t local_degree{};
thrust::tie(indices, weights, local_degree) =
edge_partition.local_edges(major_partition_offset);

// FIXME: This can lead to thread divergence if local_degree varies significantly
// within threads in this warp
thrust::fill(
thrust::seq, majors + major_offset, majors + major_offset + local_degree, major);
thrust::copy(thrust::seq, indices, indices + local_degree, minors + major_offset);
if (weights)
thrust::copy(
thrust::seq, *weights, *weights + local_degree, *output_weights + major_offset);

if (property) {
auto major_input_property = thrust::get<0>(*property)[idx];
auto minor_output_property = thrust::get<1>(*property);
@@ -379,10 +401,11 @@
input_major_start_offsets + segment_offsets[3] - segment_offsets[0],
majors,
minors,
property = property
? thrust::make_optional(thrust::make_tuple(
output_weights = weights,
property = property
? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[3], thrust::get<1>(*property)))
: thrust::nullopt,
: thrust::nullopt,
global_edge_index] __device__(auto idx) {
auto major = input_majors[idx];
auto major_offset = input_major_start_offsets[idx];
Expand All @@ -395,6 +418,9 @@ void partially_decompress_edge_partition_to_fill_edgelist(
thrust::fill(
thrust::seq, majors + major_offset, majors + major_offset + local_degree, major);
thrust::copy(thrust::seq, indices, indices + local_degree, minors + major_offset);
if (output_weights)
thrust::copy(
thrust::seq, *weights, *weights + local_degree, *output_weights + major_offset);
if (property) {
auto major_input_property = thrust::get<0>(*property)[idx];
auto minor_output_property = thrust::get<1>(*property);
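
Editorial note: the recurring change in this file is an optional output-weights pointer threaded alongside the majors/minors outputs; weights are written only when a weight buffer was supplied. A condensed, hedged sketch of that pattern follows (simplified names, host-side for illustration; the real code runs inside the kernels and device lambdas above).

// Condensed illustration (not the actual kernel code) of the optional-weight
// copy pattern: emit weights only when both the partition's weights and an
// output buffer are present.
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/optional.h>

template <typename weight_t, typename edge_t>
void copy_local_edge_weights(thrust::optional<weight_t const*> weights,   // from local_edges()
                             thrust::optional<weight_t*> output_weights,  // caller-provided, may be nullopt
                             edge_t local_degree,
                             edge_t major_offset)
{
  if (output_weights && weights) {
    thrust::copy(thrust::seq,
                 *weights,
                 *weights + local_degree,
                 *output_weights + major_offset);
  }
}
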
4 changes: 2 additions & 2 deletions cpp/include/cugraph/detail/graph_functions.cuh
@@ -37,8 +37,8 @@
#include <vector>

namespace cugraph {

namespace detail {
namespace original {

/**
* @brief Compute local out degrees of the majors belonging to the adjacency matrices
@@ -238,6 +238,6 @@ gather_one_hop_edgelist(
const rmm::device_uvector<prop_t>& active_major_property,
const rmm::device_uvector<typename GraphViewType::edge_type>& global_adjacency_list_offsets);

} // namespace original
} // namespace detail

} // namespace cugraph
20 changes: 19 additions & 1 deletion cpp/include/cugraph/detail/graph_utils.cuh
@@ -21,10 +21,13 @@
#include <cugraph/utilities/device_comm.cuh>

#include <raft/handle.hpp>
#include <raft/span.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <cuco/detail/hash_functions.cuh>
#include <thrust/binary_search.h>
#include <thrust/sort.h>
#include <thrust/tabulate.h>
#include <thrust/transform.h>
@@ -37,7 +40,7 @@ namespace cugraph {
namespace detail {

template <typename vertex_t>
struct compute_gpu_id_from_vertex_t {
struct compute_gpu_id_from_ext_vertex_t {
int comm_size{0};

__device__ int operator()(vertex_t v) const
@@ -47,6 +50,21 @@ struct compute_gpu_id_from_vertex_t {
}
};

template <typename vertex_t>
struct compute_gpu_id_from_int_vertex_t {
raft::device_span<vertex_t> vertex_partition_range_lasts_span;

__device__ int operator()(vertex_t v) const
{
return static_cast<int>(
thrust::distance(vertex_partition_range_lasts_span.begin(),
thrust::upper_bound(thrust::seq,
vertex_partition_range_lasts_span.begin(),
vertex_partition_range_lasts_span.end(),
v)));
}
};
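
Editorial note: for intuition, the new compute_gpu_id_from_int_vertex_t functor maps an internal (renumbered) vertex to its owning rank by binary-searching the exclusive partition boundaries. Below is a minimal host-side analogue with made-up boundary values; the device version above does the same with thrust::upper_bound over a raft::device_span.

// Host-side analogue of compute_gpu_id_from_int_vertex_t, for illustration only.
// range_lasts[i] is one past the last vertex owned by rank i, so the owning rank
// is the index of the first boundary strictly greater than v.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int gpu_id_from_int_vertex(std::vector<int64_t> const& range_lasts, int64_t v)
{
  return static_cast<int>(std::distance(
    range_lasts.begin(),
    std::upper_bound(range_lasts.begin(), range_lasts.end(), v)));
}

int main()
{
  std::vector<int64_t> range_lasts{100, 250, 400};  // hypothetical 3-rank partitioning
  assert(gpu_id_from_int_vertex(range_lasts, 0) == 0);    // [0, 100)   -> rank 0
  assert(gpu_id_from_int_vertex(range_lasts, 100) == 1);  // [100, 250) -> rank 1
  assert(gpu_id_from_int_vertex(range_lasts, 399) == 2);  // [250, 400) -> rank 2
  return 0;
}
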

template <typename vertex_t>
struct compute_gpu_id_from_edge_t {
int comm_size{0};

0 comments on commit 3f1d7ef
