Merge pull request rapidsai#297 from jwyles/snmg_degree

Snmg degree
oorliu · May 22, 2019 · d488887 · d488887
2 parents f2f6f91 + ce987f3
commit d488887
Show file tree

Hide file tree

Showing 12 changed files with 914 additions and 17 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ## New Features
 - PR #287 SNMG power iteration step1 
+- PR #297 SNMG degree calculation 
 
 ## Improvements
 - PR #291 nvGraph is updated to use RMM instead of directly invoking cnmem functions.

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -242,6 +242,8 @@ add_library(cugraph SHARED
     src/traversal/nvgraph_sssp.cu
     src/traversal/two_hop_neighbors.cu
     src/snmg/blas/spmv.cu
+    src/snmg/degree/degree.cu
+    src/snmg/utils.cu
     ${CMAKE_CURRENT_BINARY_DIR}/gunrock/gunrock/util/test_utils.cu
     ${CMAKE_CURRENT_BINARY_DIR}/gunrock/gunrock/util/error_utils.cu
     ${CMAKE_CURRENT_BINARY_DIR}/gunrock/gunrock/util/misc_utils.cu

diff --git a/cpp/include/algorithms.h b/cpp/include/algorithms.h
@@ -170,3 +170,19 @@ gdf_error gdf_louvain(gdf_graph *graph,
                       void *final_modularity,
                       void *num_level,
                       gdf_column *louvain_parts);
+
+/**
+ * Computes the in-degree, out-degree, or the sum of both (determined by x) for the given graph. This is
+ * a multi-gpu operation operating on a partitioned graph.
+ * @param x 0 for in+out, 1 for in, 2 for out
+ * @param part_offsets Contains the start/end of each partitions vertex id range
+ * @param off The local partition offsets
+ * @param ind The local partition indices
+ * @param x_cols The results (located on each GPU)
+ * @return Error code
+ */
+gdf_error gdf_snmg_degree(int x,
+                          size_t* part_offsets,
+                          gdf_column* off,
+                          gdf_column* ind,
+                          gdf_column** x_cols);
diff --git a/cpp/src/snmg/degree/degree.cu b/cpp/src/snmg/degree/degree.cu
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "degree.cuh"
+
+template<typename idx_t>
+gdf_error gdf_snmg_degree_impl(int x,
+                               size_t* part_offsets,
+                               gdf_column* off,
+                               gdf_column* ind,
+                               gdf_column** x_cols) {
+  GDF_REQUIRE(off->size > 0, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(ind->size > 0, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(off->dtype == ind->dtype, GDF_UNSUPPORTED_DTYPE);
+  GDF_REQUIRE(off->null_count + ind->null_count == 0, GDF_VALIDITY_UNSUPPORTED);
+
+  gdf_error status;
+  auto p = omp_get_num_threads();
+
+  idx_t* degree[p];
+  for (auto i = 0; i < p; ++i) {
+    GDF_REQUIRE(x_cols[i] != nullptr, GDF_INVALID_API_CALL);
+    GDF_REQUIRE(x_cols[i]->size > 0, GDF_INVALID_API_CALL);
+    degree[i] = static_cast<idx_t*>(x_cols[i]->data);
+  }
+
+  status = cugraph::snmg_degree(x,
+                                part_offsets,
+                                static_cast<idx_t*>(off->data),
+                                static_cast<idx_t*>(ind->data),
+                                degree);
+  return status;
+}
+
+gdf_error gdf_snmg_degree(int x,
+                          size_t* part_offsets,
+                          gdf_column* off,
+                          gdf_column* ind,
+                          gdf_column** x_cols) {
+  GDF_REQUIRE(part_offsets != nullptr, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(off != nullptr, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(ind != nullptr, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(x_cols != nullptr, GDF_INVALID_API_CALL);
+  switch (off->dtype) {
+    case GDF_INT32:
+      return gdf_snmg_degree_impl<int32_t>(x, part_offsets, off, ind, x_cols);
+    case GDF_INT64:
+      return gdf_snmg_degree_impl<int64_t>(x, part_offsets, off, ind, x_cols);
+    default:
+      return GDF_INVALID_API_CALL;
+  }
+}
diff --git a/cpp/src/snmg/degree/degree.cuh b/cpp/src/snmg/degree/degree.cuh
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <omp.h>
+#include "utilities/graph_utils.cuh"
+#include "snmg/utils.cuh"
+#include "rmm_utils.h"
+
+namespace cugraph {
+  /**
+   * Single node multi-GPU method for degree calculation on a partitioned graph.
+   * @param x Indicates whether to compute in degree, out degree, or the sum of both.
+   *    0 = in + out degree
+   *    1 = in-degree
+   *    2 = out-degree
+   * @param part_off The vertex partitioning of the global graph
+   * @param off The offsets array of the local partition
+   * @param ind The indices array of the local partition
+   * @param degree Pointer to pointers to memory on each GPU for the result
+   * @return Error code
+   */
+  template<typename idx_t>
+  gdf_error snmg_degree(int x, size_t* part_off, idx_t* off, idx_t* ind, idx_t** degree) {
+    sync_all();
+    SNMGinfo env;
+    auto i = env.get_thread_num();
+    auto p = env.get_num_threads();
+
+    // Getting the global and local vertices and edges
+    size_t glob_v = part_off[p];
+    size_t loc_v = part_off[i + 1] - part_off[i];
+    idx_t tmp;
+    CUDA_TRY(cudaMemcpy(&tmp, &off[loc_v], sizeof(idx_t), cudaMemcpyDeviceToHost));
+    size_t loc_e = tmp;
+
+    // Allocating the local result array, and setting all entries to zero.
+    idx_t* local_result;
+    ALLOC_TRY((void** )&local_result, glob_v * sizeof(idx_t), nullptr);
+    thrust::fill(rmm::exec_policy(nullptr)->on(nullptr), local_result, local_result + glob_v, 0);
+
+    // In-degree
+    if (x == 1 || x == 0) {
+      dim3 nthreads, nblocks;
+      nthreads.x = min(static_cast<idx_t>(loc_e), static_cast<idx_t>(CUDA_MAX_KERNEL_THREADS));
+      nthreads.y = 1;
+      nthreads.z = 1;
+      nblocks.x = min(static_cast<idx_t>((loc_e + nthreads.x - 1) / nthreads.x),
+                      static_cast<idx_t>(env.get_num_sm() * 32));
+      nblocks.y = 1;
+      nblocks.z = 1;
+      degree_coo<idx_t, idx_t> <<<nblocks, nthreads>>>(static_cast<idx_t>(loc_e),
+                                                       static_cast<idx_t>(loc_e),
+                                                       ind,
+                                                       local_result);
+      cudaCheckError();
+    }
+
+    // Out-degree
+    if (x == 2 || x == 0) {
+      dim3 nthreads, nblocks;
+      nthreads.x = min(static_cast<idx_t>(loc_v), static_cast<idx_t>(CUDA_MAX_KERNEL_THREADS));
+      nthreads.y = 1;
+      nthreads.z = 1;
+      nblocks.x = min(static_cast<idx_t>((loc_v + nthreads.x - 1) / nthreads.x),
+                      static_cast<idx_t>(env.get_num_sm() * 32));
+      nblocks.y = 1;
+      nblocks.z = 1;
+      degree_offsets<idx_t, idx_t> <<<nblocks, nthreads>>>(static_cast<idx_t>(loc_v),
+                                                           static_cast<idx_t>(loc_e),
+                                                           off,
+                                                           local_result + part_off[i]);
+      cudaCheckError();
+    }
+
+    // Combining the local results into global results
+    sync_all();
+    treeReduce<idx_t, thrust::plus<idx_t> >(env, glob_v, local_result, degree);
+
+    // Broadcasting the global result to all GPUs
+    treeBroadcast(env, glob_v, local_result, degree);
+
+    return GDF_SUCCESS;
+  }
+
+  template<>
+  gdf_error snmg_degree<int64_t>(int x,
+                                 size_t* part_off,
+                                 int64_t* off,
+                                 int64_t* ind,
+                                 int64_t** degree) {
+    sync_all();
+    SNMGinfo env;
+    auto i = env.get_thread_num();
+    auto p = env.get_num_threads();
+
+    // Getting the global and local vertices and edges
+    size_t glob_v = part_off[p];
+    size_t loc_v = part_off[i + 1] - part_off[i];
+    int64_t tmp;
+    CUDA_TRY(cudaMemcpy(&tmp, &off[loc_v], sizeof(int64_t), cudaMemcpyDeviceToHost));
+    size_t loc_e = tmp;
+
+    // Allocating the local result array, and setting all entries to zero.
+    int64_t* local_result;
+    ALLOC_TRY((void** )&local_result, glob_v * sizeof(int64_t), nullptr);
+    thrust::fill(rmm::exec_policy(nullptr)->on(nullptr), local_result, local_result + glob_v, 0);
+
+    // In-degree
+    if (x == 1 || x == 0) {
+      dim3 nthreads, nblocks;
+      nthreads.x = min(static_cast<int64_t>(loc_e), static_cast<int64_t>(CUDA_MAX_KERNEL_THREADS));
+      nthreads.y = 1;
+      nthreads.z = 1;
+      nblocks.x = min(static_cast<int64_t>((loc_e + nthreads.x - 1) / nthreads.x),
+                      static_cast<int64_t>(env.get_num_sm() * 32));
+      nblocks.y = 1;
+      nblocks.z = 1;
+      degree_coo<int64_t, double> <<<nblocks, nthreads>>>(static_cast<int64_t>(loc_e),
+                                                          static_cast<int64_t>(loc_e),
+                                                          ind,
+                                                          reinterpret_cast<double*>(local_result));
+      cudaCheckError();
+    }
+
+    // Out-degree
+    if (x == 2 || x == 0) {
+      dim3 nthreads, nblocks;
+      nthreads.x = min(static_cast<int64_t>(loc_v), static_cast<int64_t>(CUDA_MAX_KERNEL_THREADS));
+      nthreads.y = 1;
+      nthreads.z = 1;
+      nblocks.x = min(static_cast<int64_t>((loc_v + nthreads.x - 1) / nthreads.x),
+                      static_cast<int64_t>(env.get_num_sm() * 32));
+      nblocks.y = 1;
+      nblocks.z = 1;
+      degree_offsets<int64_t, double> <<<nblocks, nthreads>>>(static_cast<int64_t>(loc_v),
+                                                              static_cast<int64_t>(loc_e),
+                                                              off,
+                                                              reinterpret_cast<double*>(local_result
+                                                                  + part_off[i]));
+      cudaCheckError();
+    }
+
+    // Convert the values written as doubles back to int64:
+    dim3 nthreads, nblocks;
+    nthreads.x = min(static_cast<int64_t>(glob_v), static_cast<int64_t>(CUDA_MAX_KERNEL_THREADS));
+    nthreads.y = 1;
+    nthreads.z = 1;
+    nblocks.x = min(static_cast<int64_t>((glob_v + nthreads.x - 1) / nthreads.x),
+                    static_cast<int64_t>(env.get_num_sm() * 32));
+    nblocks.y = 1;
+    nblocks.z = 1;
+    type_convert<double, int64_t> <<<nblocks, nthreads>>>(reinterpret_cast<double*>(local_result), glob_v);
+    cudaCheckError();
+
+    // Combining the local results into global results
+    treeReduce<int64_t, thrust::plus<int64_t> >(env, glob_v, local_result, degree);
+
+    // Broadcasting the global result to all GPUs
+    treeBroadcast(env, glob_v, local_result, degree);
+
+    return GDF_SUCCESS;
+  }
+}
diff --git a/cpp/src/snmg/link_analysis/pagerank.cuh b/cpp/src/snmg/link_analysis/pagerank.cuh
@@ -118,7 +118,7 @@ class SNMGpagerank
         // TODO snmg degree
         int nthreads = min(static_cast<IndexType>(e_loc), 256);
         int nblocks = min(static_cast<IndexType>(32*env.get_num_sm()), CUDA_MAX_BLOCKS);
-        degree_coo<IndexType, ValueType><<<nblocks, nthreads>>>(v_glob, e_loc, ind, degree);
+        degree_coo<IndexType, IndexType><<<nblocks, nthreads>>>(v_glob, e_loc, ind, degree);
 
         // Update dangling node vector
         fill(v_glob, bookmark, zero);

diff --git a/cpp/src/snmg/utils.cu b/cpp/src/snmg/utils.cu
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <omp.h>
+#include <iostream>
+
+namespace cugraph {
+  void sync_all() {
+    cudaDeviceSynchronize();
+    #pragma omp barrier
+  }
+
+  void print_mem_usage() {
+    size_t free,total;
+    cudaMemGetInfo(&free, &total);
+    std::cout<< std::endl<< "Mem used: "<<total-free<<std::endl;
+  }
+}