From 012465e4568ff762c05cd2e4082240f3d80d917a Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Tue, 14 Apr 2020 17:19:11 -0500
Subject: [PATCH 01/19] Added NCCL_TRY macro for throwing throwing erros

---
 cpp/src/utilities/error_utils.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
diff --git a/cpp/src/utilities/error_utils.h b/cpp/src/utilities/error_utils.h
index 644c29b295a..f8342c680d7 100644
--- a/cpp/src/utilities/error_utils.h
+++ b/cpp/src/utilities/error_utils.h
@@ -50,6 +50,14 @@ struct logic_error : public std::logic_error {
 struct cuda_error : public std::runtime_error {
   cuda_error(std::string const& message) : std::runtime_error(message) {}
 };
+/**---------------------------------------------------------------------------*
+ * @brief Exception thrown when a NCCL error is encountered.
+ *
+ *---------------------------------------------------------------------------**/
+struct nccl_error : public std::runtime_error {
+  nccl_error(std::string const& message) : std::runtime_error(message) {}
+};
+
 }  // namespace cugraph
 
 #define STRINGIFY_DETAIL(x) #x
@@ -126,6 +134,13 @@ inline void throw_cuda_error(cudaError_t error, const char* file,
                   cudaGetErrorName(error) + " " + cudaGetErrorString(error)});
 }
 
+inline void throw_nccl_error(ncclResult_t error, const char* file,
+                             unsigned int line) {
+  throw cugraph::nccl_error(
+      std::string{"NCCL error encountered at: " + std::string{file} + ":" +
+                  std::to_string(line) + ": " + ncclGetErrorString(error)});
+}
+
 inline void check_stream(cudaStream_t stream, const char* file,
                          unsigned int line) {
   cudaError_t error{cudaSuccess};
@@ -208,3 +223,13 @@ inline void check_stream(cudaStream_t stream, const char* file,
 #define CHECK_GRAPH(graph) \
   CUGRAPH_EXPECTS(graph != nullptr, "Invalid API parameter: graph is NULL"); \
   CUGRAPH_EXPECTS(graph->adjList != nullptr || graph->edgeList != nullptr, "Invalid API parameter: graph is empty");
+
+#define NCCL_TRY(cmd) {                                      \
+    ncclResult_t nccl_status = cmd;                           \
+    if (nccl_status!= ncclSuccess) {                          \
+      printf("NCCL failure %s:%d '%s'\n",                     \
+          __FILE__,__LINE__,ncclGetErrorString(nccl_status)); \
+      FAIL();                                                 \
+    }                                                         \
+  } 
+}
\ No newline at end of file

From 8aa34bcbffeb24b4f99e2b9bddc8d326c6b7559d Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Wed, 15 Apr 2020 17:11:38 -0500
Subject: [PATCH 02/19] wip comm

---
 cpp/src/comms/mpi/comms_mpi.hpp | 236 ++++++++++++++++++++++++++++++++
 cpp/src/structure/graph.cu      |  15 +-
 2 files changed, 246 insertions(+), 5 deletions(-)
 create mode 100644 cpp/src/comms/mpi/comms_mpi.hpp

diff --git a/cpp/src/comms/mpi/comms_mpi.hpp b/cpp/src/comms/mpi/comms_mpi.hpp
new file mode 100644
index 00000000000..983fd480ad7
--- /dev/null
+++ b/cpp/src/comms/mpi/comms_mpi.hpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// snmg utils
+// Author: Alex Fender afender@nvidia.com
+ 
+#pragma once
+#include <mpi.h>
+#include <nccl.h>
+#include <omp.h>
+#include <unistd.h>
+#include <vector>
+#include "mem_utils.h"
+#include "basic_kernels.cuh"
+
+#define USE_NCCL 1
+
+namespace cugraph { 
+namespace opg {
+
+template <typename value_t>
+constexpr MPI_Datatype get_mpi_type() {
+  if (std::is_integral<value_t>::value) {
+    if (std::is_signed<value_t>::value) {
+      if (sizeof(value_t) == 1) {
+        return MPI_INT8_T;
+      }
+      else if (sizeof(value_t) == 2) {
+        return MPI_INT16_T;
+      }
+      else if (sizeof(value_t) == 4) {
+        return MPI_INT32_T;
+      }
+      else if (sizeof(value_t) == 8) {
+        return MPI_INT64_T;
+      }
+      else {
+        CUGRAPH_FAIL("unsupported type");
+      }
+    }
+    else {
+      if (sizeof(value_t) == 1) {
+        return MPI_UINT8_T;
+      }
+      else if (sizeof(value_t) == 2) {
+        return MPI_UINT16_T;
+      }
+      else if (sizeof(value_t) == 4) {
+        return MPI_UINT32_T;
+      }
+      else if (sizeof(value_t) == 8) {
+        return MPI_UINT64_T;
+      }
+      else {
+        CUGRAPH_FAIL("unsupported type");
+      }
+    }
+  }
+  else if(std::is_same<value_t, float>::value) {
+    return MPI_FLOAT;
+  }
+  else if(std::is_same<value_t, double>::value) {
+    return MPI_DOUBLE;
+  }
+  else {
+    CUGRAPH_FAIL("unsupported type");
+  }
+}
+#if USE_NCCL
+template <typename value_t>
+constexpr ncclDataType_t get_nccl_type() {
+  if (std::is_integral<value_t>::value) {
+    if (std::is_signed<value_t>::value) {
+      if (sizeof(value_t) == 1) {
+        return ncclInt8;
+      }
+      else if (sizeof(value_t) == 4) {
+        return ncclInt32;
+      }
+      else if (sizeof(value_t) == 8) {
+        return ncclInt64;
+      }
+      else {
+        CUGRAPH_FAIL("unsupported type");
+      }
+    }
+    else {
+      if (sizeof(value_t) == 1) {
+        return ncclUint8;
+      }
+      else if (sizeof(value_t) == 4) {
+        return ncclUint32;
+      }
+      else if (sizeof(value_t) == 8) {
+        return ncclUint64;
+      }
+      else {
+        CUGRAPH_FAIL("unsupported type");
+      }
+    }
+  }
+  else if(std::is_same<value_t, float>::value) {
+    return ncclFloat32;
+  }
+  else if(std::is_same<value_t, double>::value) {
+    return ncclFloat64;
+  }
+  else {
+    CUGRAPH_FAIL("unsupported type");
+  }
+}
+#endif
+enum class ReduceOp { SUM, MAX, MIN };
+
+constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) {
+  if (reduce_op == ReduceOp::SUM) {
+    return MPI_SUM;
+  }
+  else if (reduce_op == ReduceOp::MAX) {
+    return MPI_MAX;
+  }
+  else if (reduce_op == ReduceOp::MIN) {
+    return MPI_MIN;
+  }
+  else {
+    CUGRAPH_FAIL("unsupported type");
+  }
+}
+
+#if USE_NCCL
+constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) {
+  if (reduce_op == ReduceOp::SUM) {
+    return ncclSum;
+  }
+  else if (reduce_op == ReduceOp::MAX) {
+    return ncclMax;
+  }
+  else if (reduce_op == ReduceOp::MIN) {
+    return ncclMin;
+  }
+  else {
+    CUGRAPH_FAIL("unsupported type");
+  }
+}
+#endif
+
+// basic info about the snmg env setup
+class Comm 
+{ 
+  private:
+  int _p_x{0};
+  int _p_y{0};
+
+  int _mpi_world_rank{0};
+  int _mpi_world_size{0};
+  bool _finalize_mpi{false};
+
+  int _device_id{0};
+  int _device_count{0};
+
+  std::vector<void*> _p_ipc_mems{};
+  std::vector<size_t> _local_ipc_mem_offsets{};
+
+  int _sm_count_per_device{0};
+  int _max_grid_dim_1D{0};
+  int _max_block_dim_1D{0};
+  int _l2_cache_size{0};
+  int _shared_memory_size_per_sm{0};
+  int _cuda_stream_least_priority{0};
+  int _cuda_stream_greatest_priority{0};
+
+  MPI_Comm _mpi_comm_p_x{};
+  MPI_Comm _mpi_comm_p_y{};
+  MPI_Comm _mpi_comm_p{};
+
+  cudaStream_t _default_stream{};
+  std::vector<cudaStream_t> _extra_streams{};
+
+  ncclComm_t _nccl_comm{};
+  
+  public: 
+    Comm();
+    ~Comm();
+    int get_rank() const { return _mpi_world_rank; }
+    int get_p() const { return _mpi_world_size; }
+    int get_dev() const { return _device_id; }
+    int get_dev_count() const { return _device_count; }
+    int get_sm_count() const { return _sm_count_per_device; }
+    bool is_master() const return { return (_mpi_world_rank == 0)? true : false; }
+    void init();
+
+    template <typename val_t>
+    void allgather (size_t size, val_t* sendbuff, val_t* recvbuff);
+
+    template <typename val_t>
+    void allreduce (size_t size, val_t* sendbuff, val_t* recvbuff, ReduceOp reduce_op);
+
+};
+
+// Wait for all host threads 
+void sync_all() {
+  cudaDeviceSynchronize();
+  MPI_Barrier(MPI_COMM_WORLD);
+}
+
+template <typename val_t>
+void Comm::allgather (size_t size, val_t* sendbuff, val_t* recvbuff) {
+#if USE_NCCL
+  if(typeid(val_t) == typeid(float))
+    NCCL_TRY(ncclAllGather((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type<value_t>(), _nccl_comm, cudaStreamDefault));
+  else 
+    CUGRAPH_FAIL("allgather needs floats");
+#endif
+}
+
+template <typename val_t>
+void Comm::allreduce (size_t size, val_t* sendbuff, val_t* recvbuff, ReduceOp reduce_op) {
+#if USE_NCCL
+    NCCL_TRY(ncclAllReduce(const void*)sendbuff, (void*)recvbuff, size, get_nccl_type<value_t>(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault)););
+#endif
+}
+
+} } //namespace
diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu
index 883b35041c4..a8d7082f0ca 100644
--- a/cpp/src/structure/graph.cu
+++ b/cpp/src/structure/graph.cu
@@ -32,10 +32,12 @@ void degree_from_offsets(vertex_t number_of_vertices,
 }
 
 template <typename vertex_t, typename edge_t>
-void degree_from_vertex_ids(edge_t number_of_edges,
+void degree_from_vertex_ids(vertex_t number_of_vertices,
+                            edge_t number_of_edges,
                             vertex_t const *indices,
                             edge_t *degree,
-                            cudaStream_t stream) {
+                            cudaStream_t stream,
+                            cugraph::Comm env = 0) {
 
   thrust::for_each(rmm::exec_policy(stream)->on(stream),
                    thrust::make_counting_iterator<edge_t>(0),
@@ -43,6 +45,9 @@ void degree_from_vertex_ids(edge_t number_of_edges,
                    [indices, degree] __device__ (edge_t e) {
                      cugraph::atomicAdd(degree + indices[e], 1);
                    });
+  comm.allreduce(cugraph::Communicator::P_X, cugraph::Target::DEVICE,
+                degree, degree, d_out_degrees.size(),
+                cugraph::ReduceOp::SUM, env.get_default_cuda_stream());
 }
 
 } //namespace anonymous
@@ -72,11 +77,11 @@ void GraphCOO<VT,ET,WT>::degree(ET *degree, DegreeDirection direction) const {
   cudaStream_t stream{nullptr};
 
   if (direction != DegreeDirection::IN) {
-    degree_from_vertex_ids(GraphBase<VT,ET,WT>::number_of_edges, src_indices, degree, stream);
+    degree_from_vertex_ids(GraphBase<VT,ET,WT>::number_of_vertices, GraphBase<VT,ET,WT>::number_of_edges, src_indices, degree, stream);
   }
 
   if (direction != DegreeDirection::OUT) {
-    degree_from_vertex_ids(GraphBase<VT,ET,WT>::number_of_edges, dst_indices, degree, stream);
+    degree_from_vertex_ids(GraphBase<VT,ET,WT>::number_of_vertices, GraphBase<VT,ET,WT>::number_of_edges, dst_indices, degree, stream);
   }
 }
 
@@ -95,7 +100,7 @@ void GraphCompressedSparseBase<VT,ET,WT>::degree(ET *degree, DegreeDirection dir
   }
 
   if (direction != DegreeDirection::OUT) {
-    degree_from_vertex_ids(GraphBase<VT,ET,WT>::number_of_edges, indices, degree, stream);
+    degree_from_vertex_ids(GraphBase<VT,ET,WT>::number_of_vertices, GraphBase<VT,ET,WT>::number_of_edges, indices, degree, stream);
   }
 }
 

From b23b5e5f2377933fa60e216ccbca1c0f0bee64bc Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Thu, 23 Apr 2020 18:24:10 -0500
Subject: [PATCH 03/19] checkpoint

---
 cpp/CMakeLists.txt              |   1 +
 cpp/include/graph.hpp           |   7 +-
 cpp/src/comms/mpi/comms_mpi.cpp |  98 ++++++++++++++++++++++++
 cpp/src/comms/mpi/comms_mpi.hpp | 128 +++++++++++++++++++-------------
 cpp/src/structure/graph.cu      |  19 ++---
 cpp/src/utilities/error_utils.h |  24 ------
 6 files changed, 192 insertions(+), 85 deletions(-)
 create mode 100644 cpp/src/comms/mpi/comms_mpi.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 6c81f03d387..db1dda9cfcf 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -330,6 +330,7 @@ link_directories(
     "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
 
 add_library(cugraph SHARED
+    src/comms/mpi/comms_mpi.cpp
     src/ktruss/ktruss.cu
     src/db/db_object.cu
     src/db/db_parser_integration_test.cu
diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp
index 8b7a163239e..3838fe3dc92 100644
--- a/cpp/include/graph.hpp
+++ b/cpp/include/graph.hpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #pragma once
-
+#include "comms/mpi/comms_mpi.hpp"
 namespace cugraph {
 namespace experimental {
 
@@ -47,8 +47,8 @@ enum class DegreeDirection {
 template <typename VT, typename ET, typename WT>
 class GraphBase {
 public:
+  Comm comm;
   WT *edge_data;     ///< edge weight
-
   GraphProperties          prop;
 
   VT                       number_of_vertices;
@@ -61,8 +61,11 @@ class GraphBase {
    */
   void get_vertex_identifiers(VT *identifiers) const;
 
+  void setCommunicator(Comm& comm_) {comm = comm_;}
+  
   GraphBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_):
     edge_data(edge_data_),
+    comm(0),
     prop(),
     number_of_vertices(number_of_vertices_),
     number_of_edges(number_of_edges_)
diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp
new file mode 100644
index 00000000000..1f561cb0ea7
--- /dev/null
+++ b/cpp/src/comms/mpi/comms_mpi.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "comms/mpi/comms_mpi.hpp"
+
+#include <iostream>
+#include <nccl.h>
+
+namespace cugraph { 
+namespace experimental {
+
+Comm::Comm(int p) : _p{p} {
+#if USE_NCCL
+  // MPI
+  int flag{};
+
+  MPI_TRY(MPI_Initialized(&flag));
+
+  if (flag == false) {
+    int provided{};
+    MPI_TRY(MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &provided));
+    if (provided != MPI_THREAD_MULTIPLE) {
+      MPI_TRY(MPI_ERR_OTHER);
+    }
+    _finalize_mpi = true;
+  }
+
+  MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &_mpi_world_rank));
+  MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &_mpi_world_size));
+  CUGRAPH_EXPECTS(
+    _p == _mpi_world_size,
+    "Invalid input arguments: p should match the number of MPI processes.");
+
+  _mpi_comm = MPI_COMM_WORLD;
+
+  // CUDA
+
+  CUDA_TRY(cudaGetDeviceCount(&_device_count));
+  _device_id = _mpi_world_rank % _device_count;
+  CUDA_TRY(cudaSetDevice(_device_id));
+
+  CUDA_TRY(
+    cudaDeviceGetAttribute(&_sm_count_per_device, cudaDevAttrMultiProcessorCount, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, _device_id));
+  CUDA_TRY(
+    cudaDeviceGetAttribute(
+      &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id));
+  int supported{0};
+  CUDA_TRY(cudaDeviceGetAttribute(&supported, cudaDevAttrStreamPrioritiesSupported, _device_id));
+  CUDA_TRY(cudaDeviceGetStreamPriorityRange(&_cuda_stream_least_priority, &_cuda_stream_greatest_priority));
+
+  CUDA_TRY(cudaStreamCreate(&_default_stream));
+
+  // NCCL
+
+  ncclUniqueId nccl_unique_id_p{};
+  if (get_rank() == 0) {
+    NCCL_TRY(ncclGetUniqueId(&nccl_unique_id_p));
+  }
+  MPI_TRY(MPI_Bcast(&nccl_unique_id_p, sizeof(ncclUniqueId), MPI_BYTE, 0, _mpi_comm));
+
+  NCCL_TRY(ncclCommInitRank(&_nccl_comm, get_p(), nccl_unique_id_p, get_rank()));
+#endif
+
+}
+
+Comm::~Comm() {
+#if USE_NCCL
+  // NCCL
+  ncclCommDestroy(_nccl_comm);
+
+  // CUDA
+  for (auto& stream : _extra_streams) {
+    cudaStreamDestroy(stream);
+  }
+  cudaStreamDestroy(_default_stream);
+
+  if (_finalize_mpi) {
+    MPI_Finalize();
+  }
+#endif
+}
+} }//namespace
diff --git a/cpp/src/comms/mpi/comms_mpi.hpp b/cpp/src/comms/mpi/comms_mpi.hpp
index 983fd480ad7..1e80c2285f8 100644
--- a/cpp/src/comms/mpi/comms_mpi.hpp
+++ b/cpp/src/comms/mpi/comms_mpi.hpp
@@ -14,22 +14,55 @@
  * limitations under the License.
  */
 
-// snmg utils
-// Author: Alex Fender afender@nvidia.com
- 
+
 #pragma once
+
+#define USE_NCCL 1
+
+#if USE_NCCL
 #include <mpi.h>
 #include <nccl.h>
+#endif
+
 #include <omp.h>
 #include <unistd.h>
 #include <vector>
-#include "mem_utils.h"
-#include "basic_kernels.cuh"
-
-#define USE_NCCL 1
+#include "utilities/error_utils.h"
 
 namespace cugraph { 
-namespace opg {
+namespace experimental {
+
+/**---------------------------------------------------------------------------*
+ * @brief Exception thrown when a NCCL error is encountered.
+ *
+ *---------------------------------------------------------------------------**/
+struct nccl_error : public std::runtime_error {
+  nccl_error(std::string const& message) : std::runtime_error(message) {}
+};
+
+inline void throw_nccl_error(ncclResult_t error, const char* file,
+                             unsigned int line) {
+  throw nccl_error(
+      std::string{"NCCL error encountered at: " + std::string{file} + ":" +
+                  std::to_string(line) + ": " + ncclGetErrorString(error)});
+}
+
+#if USE_NCCL
+#define NCCL_TRY(call) {                                           \
+  ncclResult_t nccl_status = (call);                               \
+  if (nccl_status!= ncclSuccess) {                                 \
+    throw_nccl_error(nccl_status, __FILE__, __LINE__); \
+  }                                                                \
+} 
+
+// MPI errors are expected to be fatal before reaching this.
+// Fix me : improve when adding raft comms
+#define MPI_TRY(cmd) {                           \
+  int e = cmd;                                   \
+  if ( e != MPI_SUCCESS ) {                      \
+    CUGRAPH_FAIL("Failed: MPI error");           \
+  }                                              \
+}                                               
 
 template <typename value_t>
 constexpr MPI_Datatype get_mpi_type() {
@@ -79,7 +112,7 @@ constexpr MPI_Datatype get_mpi_type() {
     CUGRAPH_FAIL("unsupported type");
   }
 }
-#if USE_NCCL
+
 template <typename value_t>
 constexpr ncclDataType_t get_nccl_type() {
   if (std::is_integral<value_t>::value) {
@@ -122,7 +155,7 @@ constexpr ncclDataType_t get_nccl_type() {
     CUGRAPH_FAIL("unsupported type");
   }
 }
-#endif
+
 enum class ReduceOp { SUM, MAX, MIN };
 
 constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) {
@@ -140,7 +173,6 @@ constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) {
   }
 }
 
-#if USE_NCCL
 constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) {
   if (reduce_op == ReduceOp::SUM) {
     return ncclSum;
@@ -161,75 +193,71 @@ constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) {
 class Comm 
 { 
   private:
-  int _p_x{0};
-  int _p_y{0};
-
-  int _mpi_world_rank{0};
-  int _mpi_world_size{0};
-  bool _finalize_mpi{false};
+    int _p{0};
 
-  int _device_id{0};
-  int _device_count{0};
+    int _mpi_world_rank{0};
+    int _mpi_world_size{0};
+    bool _finalize_mpi{false};
 
-  std::vector<void*> _p_ipc_mems{};
-  std::vector<size_t> _local_ipc_mem_offsets{};
+    int _device_id{0};
+    int _device_count{0};
 
-  int _sm_count_per_device{0};
-  int _max_grid_dim_1D{0};
-  int _max_block_dim_1D{0};
-  int _l2_cache_size{0};
-  int _shared_memory_size_per_sm{0};
-  int _cuda_stream_least_priority{0};
-  int _cuda_stream_greatest_priority{0};
+    std::vector<void*> _p_ipc_mems{};
+    std::vector<size_t> _local_ipc_mem_offsets{};
 
-  MPI_Comm _mpi_comm_p_x{};
-  MPI_Comm _mpi_comm_p_y{};
-  MPI_Comm _mpi_comm_p{};
+    int _sm_count_per_device{0};
+    int _max_grid_dim_1D{0};
+    int _max_block_dim_1D{0};
+    int _l2_cache_size{0};
+    int _shared_memory_size_per_sm{0};
+    int _cuda_stream_least_priority{0};
+    int _cuda_stream_greatest_priority{0};
 
-  cudaStream_t _default_stream{};
-  std::vector<cudaStream_t> _extra_streams{};
+    cudaStream_t _default_stream{};
+    std::vector<cudaStream_t> _extra_streams{};
 
-  ncclComm_t _nccl_comm{};
-  
+#if USE_NCCL
+    MPI_Comm _mpi_comm{};
+    ncclComm_t _nccl_comm{};
+ #endif
+   
   public: 
-    Comm();
+    Comm(int p);
     ~Comm();
     int get_rank() const { return _mpi_world_rank; }
     int get_p() const { return _mpi_world_size; }
     int get_dev() const { return _device_id; }
     int get_dev_count() const { return _device_count; }
     int get_sm_count() const { return _sm_count_per_device; }
-    bool is_master() const return { return (_mpi_world_rank == 0)? true : false; }
-    void init();
+    bool is_master() const { return (_mpi_world_rank == 0)? true : false; }
 
-    template <typename val_t>
-    void allgather (size_t size, val_t* sendbuff, val_t* recvbuff);
+    template <typename value_t>
+    void allgather (size_t size, value_t* sendbuff, value_t* recvbuff);
 
-    template <typename val_t>
-    void allreduce (size_t size, val_t* sendbuff, val_t* recvbuff, ReduceOp reduce_op);
+    template <typename value_t>
+    void allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op);
 
 };
 
 // Wait for all host threads 
 void sync_all() {
   cudaDeviceSynchronize();
+#if USE_NCCL
   MPI_Barrier(MPI_COMM_WORLD);
+#endif
 }
 
-template <typename val_t>
-void Comm::allgather (size_t size, val_t* sendbuff, val_t* recvbuff) {
+template <typename value_t>
+void Comm::allgather (size_t size, value_t* sendbuff, value_t* recvbuff) {
 #if USE_NCCL
-  if(typeid(val_t) == typeid(float))
     NCCL_TRY(ncclAllGather((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type<value_t>(), _nccl_comm, cudaStreamDefault));
-  else 
-    CUGRAPH_FAIL("allgather needs floats");
 #endif
 }
 
-template <typename val_t>
-void Comm::allreduce (size_t size, val_t* sendbuff, val_t* recvbuff, ReduceOp reduce_op) {
+template <typename value_t>
+void Comm::allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) {
 #if USE_NCCL
-    NCCL_TRY(ncclAllReduce(const void*)sendbuff, (void*)recvbuff, size, get_nccl_type<value_t>(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault)););
+    NCCL_TRY(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type<value_t>(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault));
 #endif
 }
 
diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu
index a8d7082f0ca..2a27faa6236 100644
--- a/cpp/src/structure/graph.cu
+++ b/cpp/src/structure/graph.cu
@@ -14,6 +14,8 @@
 #include "utilities/error_utils.h"
 #include "utilities/cuda_utils.cuh"
 
+
+
 namespace {
 
 template <typename vertex_t, typename edge_t>
@@ -32,12 +34,12 @@ void degree_from_offsets(vertex_t number_of_vertices,
 }
 
 template <typename vertex_t, typename edge_t>
-void degree_from_vertex_ids(vertex_t number_of_vertices,
+void degree_from_vertex_ids(cugraph::experimental::Comm& comm,
+                            vertex_t number_of_vertices,
                             edge_t number_of_edges,
                             vertex_t const *indices,
                             edge_t *degree,
-                            cudaStream_t stream,
-                            cugraph::Comm env = 0) {
+                            cudaStream_t stream) {
 
   thrust::for_each(rmm::exec_policy(stream)->on(stream),
                    thrust::make_counting_iterator<edge_t>(0),
@@ -45,9 +47,7 @@ void degree_from_vertex_ids(vertex_t number_of_vertices,
                    [indices, degree] __device__ (edge_t e) {
                      cugraph::atomicAdd(degree + indices[e], 1);
                    });
-  comm.allreduce(cugraph::Communicator::P_X, cugraph::Target::DEVICE,
-                degree, degree, d_out_degrees.size(),
-                cugraph::ReduceOp::SUM, env.get_default_cuda_stream());
+  comm.allreduce(degree, degree, number_of_vertices, cugraph::ReduceOp::SUM);
 }
 
 } //namespace anonymous
@@ -55,6 +55,7 @@ void degree_from_vertex_ids(vertex_t number_of_vertices,
 namespace cugraph {
 namespace experimental {
 
+
 template <typename VT, typename ET, typename WT>
 void GraphBase<VT,ET,WT>::get_vertex_identifiers(VT *identifiers) const {
   cugraph::detail::sequence<VT>(number_of_vertices, identifiers);
@@ -77,11 +78,11 @@ void GraphCOO<VT,ET,WT>::degree(ET *degree, DegreeDirection direction) const {
   cudaStream_t stream{nullptr};
 
   if (direction != DegreeDirection::IN) {
-    degree_from_vertex_ids(GraphBase<VT,ET,WT>::number_of_vertices, GraphBase<VT,ET,WT>::number_of_edges, src_indices, degree, stream);
+    degree_from_vertex_ids(GraphBase<VT,ET,WT>::comm, GraphBase<VT,ET,WT>::number_of_vertices, GraphBase<VT,ET,WT>::number_of_edges, src_indices, degree, stream);
   }
 
   if (direction != DegreeDirection::OUT) {
-    degree_from_vertex_ids(GraphBase<VT,ET,WT>::number_of_vertices, GraphBase<VT,ET,WT>::number_of_edges, dst_indices, degree, stream);
+    degree_from_vertex_ids(GraphBase<VT,ET,WT>::comm, GraphBase<VT,ET,WT>::number_of_vertices, GraphBase<VT,ET,WT>::number_of_edges, dst_indices, degree, stream);
   }
 }
 
@@ -100,7 +101,7 @@ void GraphCompressedSparseBase<VT,ET,WT>::degree(ET *degree, DegreeDirection dir
   }
 
   if (direction != DegreeDirection::OUT) {
-    degree_from_vertex_ids(GraphBase<VT,ET,WT>::number_of_vertices, GraphBase<VT,ET,WT>::number_of_edges, indices, degree, stream);
+    degree_from_vertex_ids(GraphBase<VT,ET,WT>::comm, GraphBase<VT,ET,WT>::number_of_vertices, GraphBase<VT,ET,WT>::number_of_edges, indices, degree, stream);
   }
 }
 
diff --git a/cpp/src/utilities/error_utils.h b/cpp/src/utilities/error_utils.h
index f8342c680d7..f18716a3a34 100644
--- a/cpp/src/utilities/error_utils.h
+++ b/cpp/src/utilities/error_utils.h
@@ -50,14 +50,6 @@ struct logic_error : public std::logic_error {
 struct cuda_error : public std::runtime_error {
   cuda_error(std::string const& message) : std::runtime_error(message) {}
 };
-/**---------------------------------------------------------------------------*
- * @brief Exception thrown when a NCCL error is encountered.
- *
- *---------------------------------------------------------------------------**/
-struct nccl_error : public std::runtime_error {
-  nccl_error(std::string const& message) : std::runtime_error(message) {}
-};
-
 }  // namespace cugraph
 
 #define STRINGIFY_DETAIL(x) #x
@@ -134,13 +126,6 @@ inline void throw_cuda_error(cudaError_t error, const char* file,
                   cudaGetErrorName(error) + " " + cudaGetErrorString(error)});
 }
 
-inline void throw_nccl_error(ncclResult_t error, const char* file,
-                             unsigned int line) {
-  throw cugraph::nccl_error(
-      std::string{"NCCL error encountered at: " + std::string{file} + ":" +
-                  std::to_string(line) + ": " + ncclGetErrorString(error)});
-}
-
 inline void check_stream(cudaStream_t stream, const char* file,
                          unsigned int line) {
   cudaError_t error{cudaSuccess};
@@ -224,12 +209,3 @@ inline void check_stream(cudaStream_t stream, const char* file,
   CUGRAPH_EXPECTS(graph != nullptr, "Invalid API parameter: graph is NULL"); \
   CUGRAPH_EXPECTS(graph->adjList != nullptr || graph->edgeList != nullptr, "Invalid API parameter: graph is empty");
 
-#define NCCL_TRY(cmd) {                                      \
-    ncclResult_t nccl_status = cmd;                           \
-    if (nccl_status!= ncclSuccess) {                          \
-      printf("NCCL failure %s:%d '%s'\n",                     \
-          __FILE__,__LINE__,ncclGetErrorString(nccl_status)); \
-      FAIL();                                                 \
-    }                                                         \
-  } 
-}
\ No newline at end of file

From a89328ec582225d830a6e739d016f2556e4357ad Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Fri, 24 Apr 2020 17:15:19 -0500
Subject: [PATCH 04/19] builds

---
 cpp/src/comms/mpi/comms_mpi.cpp |  7 +++++++
 cpp/src/comms/mpi/comms_mpi.hpp | 18 ++++++------------
 cpp/src/structure/graph.cu      |  4 ++--
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp
index 1f561cb0ea7..67ed76d36bc 100644
--- a/cpp/src/comms/mpi/comms_mpi.cpp
+++ b/cpp/src/comms/mpi/comms_mpi.cpp
@@ -95,4 +95,11 @@ Comm::~Comm() {
   }
 #endif
 }
+
+void Comm::barrier() {
+  cudaDeviceSynchronize();
+#if USE_NCCL
+  MPI_Barrier(MPI_COMM_WORLD);
+#endif
+}
 } }//namespace
diff --git a/cpp/src/comms/mpi/comms_mpi.hpp b/cpp/src/comms/mpi/comms_mpi.hpp
index 1e80c2285f8..22afc234b8d 100644
--- a/cpp/src/comms/mpi/comms_mpi.hpp
+++ b/cpp/src/comms/mpi/comms_mpi.hpp
@@ -231,31 +231,25 @@ class Comm
     int get_sm_count() const { return _sm_count_per_device; }
     bool is_master() const { return (_mpi_world_rank == 0)? true : false; }
 
+    void barrier();
+
     template <typename value_t>
-    void allgather (size_t size, value_t* sendbuff, value_t* recvbuff);
+    void allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const;
 
     template <typename value_t>
-    void allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op);
+    void allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const;
 
 };
 
-// Wait for all host threads 
-void sync_all() {
-  cudaDeviceSynchronize();
-#if USE_NCCL
-  MPI_Barrier(MPI_COMM_WORLD);
-#endif
-}
-
 template <typename value_t>
-void Comm::allgather (size_t size, value_t* sendbuff, value_t* recvbuff) {
+void Comm::allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const {
 #if USE_NCCL
     NCCL_TRY(ncclAllGather((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type<value_t>(), _nccl_comm, cudaStreamDefault));
 #endif
 }
 
 template <typename value_t>
-void Comm::allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) {
+void Comm::allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const {
 #if USE_NCCL
     NCCL_TRY(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type<value_t>(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault));
 #endif
diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu
index 2a27faa6236..391c6538be2 100644
--- a/cpp/src/structure/graph.cu
+++ b/cpp/src/structure/graph.cu
@@ -34,7 +34,7 @@ void degree_from_offsets(vertex_t number_of_vertices,
 }
 
 template <typename vertex_t, typename edge_t>
-void degree_from_vertex_ids(cugraph::experimental::Comm& comm,
+void degree_from_vertex_ids(const cugraph::experimental::Comm& comm,
                             vertex_t number_of_vertices,
                             edge_t number_of_edges,
                             vertex_t const *indices,
@@ -47,7 +47,7 @@ void degree_from_vertex_ids(cugraph::experimental::Comm& comm,
                    [indices, degree] __device__ (edge_t e) {
                      cugraph::atomicAdd(degree + indices[e], 1);
                    });
-  comm.allreduce(degree, degree, number_of_vertices, cugraph::ReduceOp::SUM);
+  comm.allreduce(number_of_vertices, degree, degree, cugraph::experimental::ReduceOp::SUM);
 }
 
 } //namespace anonymous

From f5bc95969fb95830e1be5c8e91710e1a53d4c332 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Fri, 24 Apr 2020 18:15:59 -0500
Subject: [PATCH 05/19] test checkpoint

---
 cpp/include/graph.hpp         |  2 +-
 cpp/tests/CMakeLists.txt      |  5 +++
 cpp/tests/nccl/degree_test.cu | 74 +++++++++++++++++++++++++++++++++++
 3 files changed, 80 insertions(+), 1 deletion(-)
 create mode 100644 cpp/tests/nccl/degree_test.cu

diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp
index 3838fe3dc92..0cb70093b01 100644
--- a/cpp/include/graph.hpp
+++ b/cpp/include/graph.hpp
@@ -61,7 +61,7 @@ class GraphBase {
    */
   void get_vertex_identifiers(VT *identifiers) const;
 
-  void setCommunicator(Comm& comm_) {comm = comm_;}
+  void set_communicator(Comm& comm_) {comm = comm_;}
   
   GraphBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_):
     edge_data(edge_data_),
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 8c850924730..5b37fea735f 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -285,6 +285,11 @@ if (BUILD_MPI)
         "${CMAKE_CURRENT_SOURCE_DIR}/nccl/nccl_test.cu")
 
     ConfigureTest(NCCL_TEST "${NCCL_TEST_SRC}" "")
+
+    set(NCCL_DEGREE_TEST_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/nccl/degree_test.cu")
+
+    ConfigureTest(NCCL_DEGREE_TEST "${NCCL_DEGREE_TEST_SRC}" "")
 endif(BUILD_MPI)
 
 ###################################################################################################
diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu
new file mode 100644
index 00000000000..2397a487d18
--- /dev/null
+++ b/cpp/tests/nccl/degree_test.cu
@@ -0,0 +1,74 @@
+#include "gtest/gtest.h"
+#include <cugraph.h>
+#include "test_utils.h"
+#include <string.h>
+#include <mpi.h>
+#include <nccl.h>
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <graph.hpp>
+#include "comms/mpi/comms_mpi.hpp"
+
+// ref Degree on the host
+template<typename idx_t>
+void ref_degree_h(std::vector<idx_t> & ind_h,
+                  std::vector<idx_t> & degree) {
+  for (size_t i = 0; i < degree.size(); i++)
+    degree[i] = 0;
+  for (size_t i = 0; i < ind_h.size(); i++)
+    degree[ind_h[i]] += 1;
+}
+
+TEST(degree, success)
+{
+  int v = 6;
+
+  //host
+  std::vector<int> src_h= {0, 0, 2, 2, 2, 3, 3, 4, 4, 5}, 
+                   dest_h={1, 2, 0, 1, 4, 4, 5, 3, 5, 3};
+  std::vector<int> degree_h(v, 0.0), degree_ref(v, 0.0);
+
+  //device
+  thrust::device_vector<int> src_d(src_h.begin(), src_h.begin()+src_h.size());
+  thrust::device_vector<int> dest_d(dest_h.begin(), dest_h.begin()+dest_h.size());
+  thrust::device_vector<int> degree_d(v);
+
+  //MG
+  int p;
+  MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &p));
+  cugraph::experimental::Comm comm(p);
+
+  // print mg info
+  printf("#   Rank %2d - Pid %6d - device %2d\n", comm.get_rank(), getpid(), comm.get_dev());
+
+  // load cugraph (fix me : split per process)
+  cugraph::experimental::GraphCOO<int,int,float> G(thrust::raw_pointer_cast(src_d.data()), 
+                                                   thrust::raw_pointer_cast(dest_d.data()), 
+                                                   nullptr, degree_h.size(), dest_h.size());
+  G.set_communicator(comm);
+
+  // IN degree
+  G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::IN);
+  thrust::copy(degree_d.begin(), degree_d.end(), degree_h.begin());
+  ref_degree_h(dest_h, degree_ref); 
+  for (size_t j = 0; j < degree_h.size(); ++j)
+    EXPECT_EQ(degree_ref[j], degree_h[j]);
+
+  // OUT degree
+  G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::OUT);
+  thrust::copy(degree_d.begin(), degree_d.end(), degree_h.begin());
+  ref_degree_h(src_h, degree_ref);
+  for (size_t j = 0; j < degree_h.size(); ++j)
+    EXPECT_EQ(degree_ref[j], degree_h[j]);
+}
+
+int main( int argc, char** argv )
+{
+    testing::InitGoogleTest(&argc,argv);
+    MPI_Init(&argc, &argv);
+    rmmInitialize(nullptr);
+    int rc = RUN_ALL_TESTS();
+    rmmFinalize();
+    MPI_Finalize();
+    return rc;
+}
\ No newline at end of file

From dd29ec99fd4aee7f2da5e8176c8e228ff7974f06 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Mon, 27 Apr 2020 19:15:19 -0500
Subject: [PATCH 06/19] checkpoint np 1 passes

---
 cpp/include/graph.hpp           |  2 +-
 cpp/src/comms/mpi/comms_mpi.cpp | 21 +++++----------------
 cpp/src/comms/mpi/comms_mpi.hpp |  8 +++-----
 cpp/tests/nccl/degree_test.cu   |  4 ++++
 4 files changed, 13 insertions(+), 22 deletions(-)

diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp
index 0cb70093b01..ee8d6e95fc0 100644
--- a/cpp/include/graph.hpp
+++ b/cpp/include/graph.hpp
@@ -65,7 +65,7 @@ class GraphBase {
   
   GraphBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_):
     edge_data(edge_data_),
-    comm(0),
+    comm(),
     prop(),
     number_of_vertices(number_of_vertices_),
     number_of_edges(number_of_edges_)
diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp
index 67ed76d36bc..167594a783c 100644
--- a/cpp/src/comms/mpi/comms_mpi.cpp
+++ b/cpp/src/comms/mpi/comms_mpi.cpp
@@ -40,9 +40,8 @@ Comm::Comm(int p) : _p{p} {
 
   MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &_mpi_world_rank));
   MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &_mpi_world_size));
-  CUGRAPH_EXPECTS(
-    _p == _mpi_world_size,
-    "Invalid input arguments: p should match the number of MPI processes.");
+  CUGRAPH_EXPECTS( (_p == _mpi_world_size), 
+                   "Invalid input arguments: p should match the number of MPI processes.");
 
   _mpi_comm = MPI_COMM_WORLD;
 
@@ -60,11 +59,6 @@ Comm::Comm(int p) : _p{p} {
   CUDA_TRY(
     cudaDeviceGetAttribute(
       &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id));
-  int supported{0};
-  CUDA_TRY(cudaDeviceGetAttribute(&supported, cudaDevAttrStreamPrioritiesSupported, _device_id));
-  CUDA_TRY(cudaDeviceGetStreamPriorityRange(&_cuda_stream_least_priority, &_cuda_stream_greatest_priority));
-
-  CUDA_TRY(cudaStreamCreate(&_default_stream));
 
   // NCCL
 
@@ -73,8 +67,8 @@ Comm::Comm(int p) : _p{p} {
     NCCL_TRY(ncclGetUniqueId(&nccl_unique_id_p));
   }
   MPI_TRY(MPI_Bcast(&nccl_unique_id_p, sizeof(ncclUniqueId), MPI_BYTE, 0, _mpi_comm));
-
   NCCL_TRY(ncclCommInitRank(&_nccl_comm, get_p(), nccl_unique_id_p, get_rank()));
+  _finalize_nccl = true;
 #endif
 
 }
@@ -82,13 +76,8 @@ Comm::Comm(int p) : _p{p} {
 Comm::~Comm() {
 #if USE_NCCL
   // NCCL
-  ncclCommDestroy(_nccl_comm);
-
-  // CUDA
-  for (auto& stream : _extra_streams) {
-    cudaStreamDestroy(stream);
-  }
-  cudaStreamDestroy(_default_stream);
+  if (_finalize_nccl)
+    ncclCommDestroy(_nccl_comm);
 
   if (_finalize_mpi) {
     MPI_Finalize();
diff --git a/cpp/src/comms/mpi/comms_mpi.hpp b/cpp/src/comms/mpi/comms_mpi.hpp
index 22afc234b8d..3521c9abae7 100644
--- a/cpp/src/comms/mpi/comms_mpi.hpp
+++ b/cpp/src/comms/mpi/comms_mpi.hpp
@@ -198,6 +198,8 @@ class Comm
     int _mpi_world_rank{0};
     int _mpi_world_size{0};
     bool _finalize_mpi{false};
+    bool _finalize_nccl{false};
+
 
     int _device_id{0};
     int _device_count{0};
@@ -210,11 +212,6 @@ class Comm
     int _max_block_dim_1D{0};
     int _l2_cache_size{0};
     int _shared_memory_size_per_sm{0};
-    int _cuda_stream_least_priority{0};
-    int _cuda_stream_greatest_priority{0};
-
-    cudaStream_t _default_stream{};
-    std::vector<cudaStream_t> _extra_streams{};
 
 #if USE_NCCL
     MPI_Comm _mpi_comm{};
@@ -222,6 +219,7 @@ class Comm
  #endif
    
   public: 
+    Comm(){};
     Comm(int p);
     ~Comm();
     int get_rank() const { return _mpi_world_rank; }
diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu
index 2397a487d18..bbb5a006eb6 100644
--- a/cpp/tests/nccl/degree_test.cu
+++ b/cpp/tests/nccl/degree_test.cu
@@ -49,10 +49,14 @@ TEST(degree, success)
 
   // IN degree
   G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::IN);
+
+  std::cout<< "passed"<<std::endl;
   thrust::copy(degree_d.begin(), degree_d.end(), degree_h.begin());
   ref_degree_h(dest_h, degree_ref); 
   for (size_t j = 0; j < degree_h.size(); ++j)
     EXPECT_EQ(degree_ref[j], degree_h[j]);
+  
+   thrust::fill (degree_d.begin(), degree_d.end(), 0);
 
   // OUT degree
   G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::OUT);

From 60cc7eeff86745d695929bdf7ca4b393c9813cf6 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Tue, 28 Apr 2020 18:16:25 -0500
Subject: [PATCH 07/19] added edge list partitioning of test input and fixes

---
 cpp/src/structure/graph.cu    |  1 +
 cpp/tests/nccl/degree_test.cu | 75 ++++++++++++++++++++++++++---------
 2 files changed, 58 insertions(+), 18 deletions(-)

diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu
index 391c6538be2..1cfe6d56e2a 100644
--- a/cpp/src/structure/graph.cu
+++ b/cpp/src/structure/graph.cu
@@ -47,6 +47,7 @@ void degree_from_vertex_ids(const cugraph::experimental::Comm& comm,
                    [indices, degree] __device__ (edge_t e) {
                      cugraph::atomicAdd(degree + indices[e], 1);
                    });
+  std::cout<< number_of_vertices<<" "<< number_of_edges<<std::endl;
   comm.allreduce(number_of_vertices, degree, degree, cugraph::experimental::ReduceOp::SUM);
 }
 
diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu
index bbb5a006eb6..15b0a751520 100644
--- a/cpp/tests/nccl/degree_test.cu
+++ b/cpp/tests/nccl/degree_test.cu
@@ -19,51 +19,90 @@ void ref_degree_h(std::vector<idx_t> & ind_h,
     degree[ind_h[i]] += 1;
 }
 
+// global to local offsets by shifting all offsets by the first offset value
+template <typename T>
+void shift_by_front(std::vector<T> & v) {
+  auto start = v.front();
+  for (auto i = size_t{0}; i < v.size(); ++i)
+    v[i] -= start;
+}
+
+// 1D partitioning such as each GPU has about the same number of edges
+template <typename T>
+void opg_edge_partioning(int r, int p, std::vector<T> & ind_h, std::vector<size_t> & part_offset,  size_t & e_loc) {
+
+  //set first and last partition offsets
+  part_offset[0] = 0;
+  part_offset[p] = ind_h.size();
+  //part_offset[p] = *(std::max_element(ind_h.begin(), ind_h.end()));
+  auto loc_nnz = ind_h.size()/p;
+  for (int i=1; i<p; i++) {
+    //get the first vertex ID of each partition
+    auto start_nnz = i*loc_nnz;
+    auto start_v = 0;
+    for (auto j = size_t{0}; j < ind_h.size(); ++j) {
+      if (j >= start_nnz) {
+        start_v = j;
+        break;
+      }
+    }
+    part_offset[i] = start_v;
+  }
+  e_loc = part_offset[r+1] - part_offset[r];
+}
 TEST(degree, success)
 {
   int v = 6;
 
   //host
-  std::vector<int> src_h= {0, 0, 2, 2, 2, 3, 3, 4, 4, 5}, 
-                   dest_h={1, 2, 0, 1, 4, 4, 5, 3, 5, 3};
+  std::vector<int> src_h= {0, 0, 2, 2, 2, 3, 3, 4, 4, 5, 5}, 
+                   dest_h={1, 2, 0, 1, 4, 4, 5, 3, 5, 3, 1};
   std::vector<int> degree_h(v, 0.0), degree_ref(v, 0.0);
 
-  //device
-  thrust::device_vector<int> src_d(src_h.begin(), src_h.begin()+src_h.size());
-  thrust::device_vector<int> dest_d(dest_h.begin(), dest_h.begin()+dest_h.size());
-  thrust::device_vector<int> degree_d(v);
+
 
   //MG
   int p;
   MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &p));
   cugraph::experimental::Comm comm(p);
+  std::vector<size_t> part_offset(p + 1);
+  auto i = comm.get_rank();
+  size_t e_loc;
+
+  opg_edge_partioning(i, p, src_h, part_offset, e_loc);
+  sleep(i);
+  for (auto j = part_offset.begin(); j != part_offset.end(); ++j)
+      std::cout << *j << ' ';
+  std::cout << std::endl;
+  std::cout<< "eloc: "<< e_loc <<std::endl;
+  
+  std::vector<int> src_loc_h(src_h.begin()+part_offset[i], src_h.begin()+part_offset[i]+e_loc),
+                   dest_loc_h(dest_h.begin()+part_offset[i], dest_h.begin()+part_offset[i]+e_loc);
+  shift_by_front(src_loc_h);
+
 
   // print mg info
   printf("#   Rank %2d - Pid %6d - device %2d\n", comm.get_rank(), getpid(), comm.get_dev());
 
+  //local device
+  thrust::device_vector<int> src_d(src_loc_h.begin(), src_loc_h.end());
+  thrust::device_vector<int> dest_d(dest_loc_h.begin(), dest_loc_h.end());
+  thrust::device_vector<int> degree_d(v);
+
   // load cugraph (fix me : split per process)
   cugraph::experimental::GraphCOO<int,int,float> G(thrust::raw_pointer_cast(src_d.data()), 
                                                    thrust::raw_pointer_cast(dest_d.data()), 
-                                                   nullptr, degree_h.size(), dest_h.size());
+                                                   nullptr, degree_h.size(), e_loc);
   G.set_communicator(comm);
 
-  // IN degree
-  G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::IN);
-
-  std::cout<< "passed"<<std::endl;
-  thrust::copy(degree_d.begin(), degree_d.end(), degree_h.begin());
-  ref_degree_h(dest_h, degree_ref); 
-  for (size_t j = 0; j < degree_h.size(); ++j)
-    EXPECT_EQ(degree_ref[j], degree_h[j]);
-  
-   thrust::fill (degree_d.begin(), degree_d.end(), 0);
-
   // OUT degree
   G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::OUT);
   thrust::copy(degree_d.begin(), degree_d.end(), degree_h.begin());
   ref_degree_h(src_h, degree_ref);
+  sleep(i);
   for (size_t j = 0; j < degree_h.size(); ++j)
     EXPECT_EQ(degree_ref[j], degree_h[j]);
+    std::cout<< "Rank "<< i << " done checking." <<std::endl;
 }
 
 int main( int argc, char** argv )

From a70332557764486bed23d9ad71544e9b50dcf776 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Tue, 28 Apr 2020 18:32:38 -0500
Subject: [PATCH 08/19] more fixes and cleanup

---
 cpp/src/structure/graph.cu    | 5 ++++-
 cpp/tests/nccl/degree_test.cu | 9 +++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu
index 1cfe6d56e2a..d0ade029462 100644
--- a/cpp/src/structure/graph.cu
+++ b/cpp/src/structure/graph.cu
@@ -47,7 +47,6 @@ void degree_from_vertex_ids(const cugraph::experimental::Comm& comm,
                    [indices, degree] __device__ (edge_t e) {
                      cugraph::atomicAdd(degree + indices[e], 1);
                    });
-  std::cout<< number_of_vertices<<" "<< number_of_edges<<std::endl;
   comm.allreduce(number_of_vertices, degree, degree, cugraph::experimental::ReduceOp::SUM);
 }
 
@@ -79,6 +78,8 @@ void GraphCOO<VT,ET,WT>::degree(ET *degree, DegreeDirection direction) const {
   cudaStream_t stream{nullptr};
 
   if (direction != DegreeDirection::IN) {
+    if (GraphBase<VT,ET,WT>::comm.get_p()); // FixMe retrieve global source indexing for the allreduce work
+      CUGRAPH_FAIL("OPG degree not implemented for OUT degree");
     degree_from_vertex_ids(GraphBase<VT,ET,WT>::comm, GraphBase<VT,ET,WT>::number_of_vertices, GraphBase<VT,ET,WT>::number_of_edges, src_indices, degree, stream);
   }
 
@@ -98,6 +99,8 @@ void GraphCompressedSparseBase<VT,ET,WT>::degree(ET *degree, DegreeDirection dir
   cudaStream_t stream{nullptr};
 
   if (direction != DegreeDirection::IN) {
+    if (GraphBase<VT,ET,WT>::comm.get_p());
+      CUGRAPH_FAIL("OPG degree not implemented for OUT degree"); // FixMe retrieve global source indexing for the allreduce to work
     degree_from_offsets(GraphBase<VT,ET,WT>::number_of_vertices, offsets, degree, stream);
   }
 
diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu
index 15b0a751520..3b44ed1ce86 100644
--- a/cpp/tests/nccl/degree_test.cu
+++ b/cpp/tests/nccl/degree_test.cu
@@ -70,12 +70,13 @@ TEST(degree, success)
   size_t e_loc;
 
   opg_edge_partioning(i, p, src_h, part_offset, e_loc);
+  #ifdef OPG_VERBOSE
   sleep(i);
   for (auto j = part_offset.begin(); j != part_offset.end(); ++j)
       std::cout << *j << ' ';
   std::cout << std::endl;
   std::cout<< "eloc: "<< e_loc <<std::endl;
-  
+  #endif
   std::vector<int> src_loc_h(src_h.begin()+part_offset[i], src_h.begin()+part_offset[i]+e_loc),
                    dest_loc_h(dest_h.begin()+part_offset[i], dest_h.begin()+part_offset[i]+e_loc);
   shift_by_front(src_loc_h);
@@ -96,10 +97,10 @@ TEST(degree, success)
   G.set_communicator(comm);
 
   // OUT degree
-  G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::OUT);
+  G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::IN);
   thrust::copy(degree_d.begin(), degree_d.end(), degree_h.begin());
-  ref_degree_h(src_h, degree_ref);
-  sleep(i);
+  ref_degree_h(dest_h, degree_ref);
+  //sleep(i);
   for (size_t j = 0; j < degree_h.size(); ++j)
     EXPECT_EQ(degree_ref[j], degree_h[j]);
     std::cout<< "Rank "<< i << " done checking." <<std::endl;

From 60e9ee83417cc91199a7f1eefff3a71a30f533a1 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Tue, 28 Apr 2020 18:39:51 -0500
Subject: [PATCH 09/19] changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index afe42a6a3b7..3c99b94d2bf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
 # cuGraph 0.14.0 (Date TBD)
 
 ## New Features
+- PR #840 OPG degree
 
 ## Improvements
 - PR #764 Updated sssp and bfs with GraphCSR, removed gdf_column, added nullptr weights test for sssp

From 18094e1f5ef5d6f2ef65347f19b57e7423225b04 Mon Sep 17 00:00:00 2001
From: Alex Fender <afender@nvidia.com>
Date: Tue, 28 Apr 2020 18:42:17 -0500
Subject: [PATCH 10/19] fixed comment

---
 cpp/tests/nccl/degree_test.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu
index 3b44ed1ce86..83910e73c24 100644
--- a/cpp/tests/nccl/degree_test.cu
+++ b/cpp/tests/nccl/degree_test.cu
@@ -90,7 +90,7 @@ TEST(degree, success)
   thrust::device_vector<int> dest_d(dest_loc_h.begin(), dest_loc_h.end());
   thrust::device_vector<int> degree_d(v);
 
-  // load cugraph (fix me : split per process)
+  // load local chunck to cugraph
   cugraph::experimental::GraphCOO<int,int,float> G(thrust::raw_pointer_cast(src_d.data()), 
                                                    thrust::raw_pointer_cast(dest_d.data()), 
                                                    nullptr, degree_h.size(), e_loc);
@@ -115,4 +115,4 @@ int main( int argc, char** argv )
     rmmFinalize();
     MPI_Finalize();
     return rc;
-}
\ No newline at end of file
+}

From 008327f13d4fb77d39d56a8fe4454507037fc8a0 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Wed, 29 Apr 2020 12:51:56 -0500
Subject: [PATCH 11/19] fixes

---
 cpp/CMakeLists.txt              | 4 ++++
 cpp/src/comms/mpi/comms_mpi.cpp | 1 -
 cpp/src/comms/mpi/comms_mpi.hpp | 8 +++-----
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index db1dda9cfcf..6031dc8ccef 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -398,6 +398,10 @@ add_library(cugraph SHARED
 #
 add_dependencies(cugraph cugunrock)
 
+if (BUILD_MPI)
+    add_compile_definitions(USE_NCCL=1)
+endif (BUILD_MPI)
+
 ###################################################################################################
 # - include paths ---------------------------------------------------------------------------------
 target_include_directories(cugraph
diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp
index 167594a783c..24112048509 100644
--- a/cpp/src/comms/mpi/comms_mpi.cpp
+++ b/cpp/src/comms/mpi/comms_mpi.cpp
@@ -86,7 +86,6 @@ Comm::~Comm() {
 }
 
 void Comm::barrier() {
-  cudaDeviceSynchronize();
 #if USE_NCCL
   MPI_Barrier(MPI_COMM_WORLD);
 #endif
diff --git a/cpp/src/comms/mpi/comms_mpi.hpp b/cpp/src/comms/mpi/comms_mpi.hpp
index 3521c9abae7..9b1ca8c3126 100644
--- a/cpp/src/comms/mpi/comms_mpi.hpp
+++ b/cpp/src/comms/mpi/comms_mpi.hpp
@@ -17,8 +17,6 @@
 
 #pragma once
 
-#define USE_NCCL 1
-
 #if USE_NCCL
 #include <mpi.h>
 #include <nccl.h>
@@ -32,6 +30,9 @@
 namespace cugraph { 
 namespace experimental {
 
+enum class ReduceOp { SUM, MAX, MIN };
+
+#if USE_NCCL
 /**---------------------------------------------------------------------------*
  * @brief Exception thrown when a NCCL error is encountered.
  *
@@ -47,7 +48,6 @@ inline void throw_nccl_error(ncclResult_t error, const char* file,
                   std::to_string(line) + ": " + ncclGetErrorString(error)});
 }
 
-#if USE_NCCL
 #define NCCL_TRY(call) {                                           \
   ncclResult_t nccl_status = (call);                               \
   if (nccl_status!= ncclSuccess) {                                 \
@@ -156,8 +156,6 @@ constexpr ncclDataType_t get_nccl_type() {
   }
 }
 
-enum class ReduceOp { SUM, MAX, MIN };
-
 constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) {
   if (reduce_op == ReduceOp::SUM) {
     return MPI_SUM;

From 40b448f3cbfce1becd271106b15389de3715acce Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Wed, 29 Apr 2020 15:52:00 -0500
Subject: [PATCH 12/19] non-mpi path

---
 cpp/tests/nccl/degree_test.cu | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu
index 83910e73c24..1c7221076d4 100644
--- a/cpp/tests/nccl/degree_test.cu
+++ b/cpp/tests/nccl/degree_test.cu
@@ -2,8 +2,6 @@
 #include <cugraph.h>
 #include "test_utils.h"
 #include <string.h>
-#include <mpi.h>
-#include <nccl.h>
 #include <thrust/device_vector.h>
 #include <thrust/functional.h>
 #include <graph.hpp>

From 1c6b26720bbf39ad03ad0f7a67783d0ab0d5aa5c Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Thu, 30 Apr 2020 17:01:18 -0500
Subject: [PATCH 13/19] headers reorg for comms deployment

---
 cpp/include/comms_mpi.hpp       |  77 ++++++++++
 cpp/include/graph.hpp           |   2 +-
 cpp/src/comms/mpi/comms_mpi.cpp | 182 ++++++++++++++++++++++-
 cpp/src/comms/mpi/comms_mpi.hpp | 254 --------------------------------
 cpp/tests/nccl/degree_test.cu   |   2 +-
 5 files changed, 259 insertions(+), 258 deletions(-)
 create mode 100644 cpp/include/comms_mpi.hpp
 delete mode 100644 cpp/src/comms/mpi/comms_mpi.hpp

diff --git a/cpp/include/comms_mpi.hpp b/cpp/include/comms_mpi.hpp
new file mode 100644
index 00000000000..c6cb5339f54
--- /dev/null
+++ b/cpp/include/comms_mpi.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#pragma once
+
+#if USE_NCCL
+#include <mpi.h>
+#include <nccl.h>
+#endif
+
+namespace cugraph { 
+namespace experimental {
+
+enum class ReduceOp { SUM, MAX, MIN };
+
+// basic info about the snmg env setup
+class Comm 
+{ 
+  private:
+    int _p{0};
+
+    int _mpi_world_rank{0};
+    int _mpi_world_size{0};
+    bool _finalize_mpi{false};
+    bool _finalize_nccl{false};
+
+
+    int _device_id{0};
+    int _device_count{0};
+
+    int _sm_count_per_device{0};
+    int _max_grid_dim_1D{0};
+    int _max_block_dim_1D{0};
+    int _l2_cache_size{0};
+    int _shared_memory_size_per_sm{0};
+
+#if USE_NCCL
+    MPI_Comm _mpi_comm{};
+    ncclComm_t _nccl_comm{};
+ #endif
+   
+  public: 
+    Comm(){};
+    Comm(int p);
+    ~Comm();
+    int get_rank() const { return _mpi_world_rank; }
+    int get_p() const { return _mpi_world_size; }
+    int get_dev() const { return _device_id; }
+    int get_dev_count() const { return _device_count; }
+    int get_sm_count() const { return _sm_count_per_device; }
+    bool is_master() const { return (_mpi_world_rank == 0)? true : false; }
+
+    void barrier();
+
+    template <typename value_t>
+    void allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const;
+
+    template <typename value_t>
+    void allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const;
+
+};
+
+} } //namespace
diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp
index ee8d6e95fc0..aac5e9116a1 100644
--- a/cpp/include/graph.hpp
+++ b/cpp/include/graph.hpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #pragma once
-#include "comms/mpi/comms_mpi.hpp"
+#include <comms_mpi.hpp>
 namespace cugraph {
 namespace experimental {
 
diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp
index 24112048509..6a1846ee35b 100644
--- a/cpp/src/comms/mpi/comms_mpi.cpp
+++ b/cpp/src/comms/mpi/comms_mpi.cpp
@@ -14,13 +14,168 @@
  * limitations under the License.
  */
 
-#include "comms/mpi/comms_mpi.hpp"
 
 #include <iostream>
-#include <nccl.h>
+#include <comms_mpi.hpp>
+#include <vector>
+#include "utilities/error_utils.h"
 
 namespace cugraph { 
 namespace experimental {
+#if USE_NCCL
+
+/**---------------------------------------------------------------------------*
+ * @brief Exception thrown when a NCCL error is encountered.
+ *
+ *---------------------------------------------------------------------------**/
+struct nccl_error : public std::runtime_error {
+  nccl_error(std::string const& message) : std::runtime_error(message) {}
+};
+
+inline void throw_nccl_error(ncclResult_t error, const char* file,
+                             unsigned int line) {
+  throw nccl_error(
+      std::string{"NCCL error encountered at: " + std::string{file} + ":" +
+                  std::to_string(line) + ": " + ncclGetErrorString(error)});
+}
+
+#define NCCL_TRY(call) {                                           \
+  ncclResult_t nccl_status = (call);                               \
+  if (nccl_status!= ncclSuccess) {                                 \
+    throw_nccl_error(nccl_status, __FILE__, __LINE__); \
+  }                                                                \
+} 
+// MPI errors are expected to be fatal before reaching this.
+// Fix me : improve when adding raft comms
+#define MPI_TRY(cmd) {                           \
+  int e = cmd;                                   \
+  if ( e != MPI_SUCCESS ) {                      \
+    CUGRAPH_FAIL("Failed: MPI error");           \
+  }                                              \
+}                                               
+
+template <typename value_t>
+constexpr MPI_Datatype get_mpi_type() {
+  if (std::is_integral<value_t>::value) {
+    if (std::is_signed<value_t>::value) {
+      if (sizeof(value_t) == 1) {
+        return MPI_INT8_T;
+      }
+      else if (sizeof(value_t) == 2) {
+        return MPI_INT16_T;
+      }
+      else if (sizeof(value_t) == 4) {
+        return MPI_INT32_T;
+      }
+      else if (sizeof(value_t) == 8) {
+        return MPI_INT64_T;
+      }
+      else {
+        CUGRAPH_FAIL("unsupported type");
+      }
+    }
+    else {
+      if (sizeof(value_t) == 1) {
+        return MPI_UINT8_T;
+      }
+      else if (sizeof(value_t) == 2) {
+        return MPI_UINT16_T;
+      }
+      else if (sizeof(value_t) == 4) {
+        return MPI_UINT32_T;
+      }
+      else if (sizeof(value_t) == 8) {
+        return MPI_UINT64_T;
+      }
+      else {
+        CUGRAPH_FAIL("unsupported type");
+      }
+    }
+  }
+  else if(std::is_same<value_t, float>::value) {
+    return MPI_FLOAT;
+  }
+  else if(std::is_same<value_t, double>::value) {
+    return MPI_DOUBLE;
+  }
+  else {
+    CUGRAPH_FAIL("unsupported type");
+  }
+}
+
+template <typename value_t>
+constexpr ncclDataType_t get_nccl_type() {
+  if (std::is_integral<value_t>::value) {
+    if (std::is_signed<value_t>::value) {
+      if (sizeof(value_t) == 1) {
+        return ncclInt8;
+      }
+      else if (sizeof(value_t) == 4) {
+        return ncclInt32;
+      }
+      else if (sizeof(value_t) == 8) {
+        return ncclInt64;
+      }
+      else {
+        CUGRAPH_FAIL("unsupported type");
+      }
+    }
+    else {
+      if (sizeof(value_t) == 1) {
+        return ncclUint8;
+      }
+      else if (sizeof(value_t) == 4) {
+        return ncclUint32;
+      }
+      else if (sizeof(value_t) == 8) {
+        return ncclUint64;
+      }
+      else {
+        CUGRAPH_FAIL("unsupported type");
+      }
+    }
+  }
+  else if(std::is_same<value_t, float>::value) {
+    return ncclFloat32;
+  }
+  else if(std::is_same<value_t, double>::value) {
+    return ncclFloat64;
+  }
+  else {
+    CUGRAPH_FAIL("unsupported type");
+  }
+}
+
+constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) {
+  if (reduce_op == ReduceOp::SUM) {
+    return MPI_SUM;
+  }
+  else if (reduce_op == ReduceOp::MAX) {
+    return MPI_MAX;
+  }
+  else if (reduce_op == ReduceOp::MIN) {
+    return MPI_MIN;
+  }
+  else {
+    CUGRAPH_FAIL("unsupported type");
+  }
+}
+
+constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) {
+  if (reduce_op == ReduceOp::SUM) {
+    return ncclSum;
+  }
+  else if (reduce_op == ReduceOp::MAX) {
+    return ncclMax;
+  }
+  else if (reduce_op == ReduceOp::MIN) {
+    return ncclMin;
+  }
+  else {
+    CUGRAPH_FAIL("unsupported type");
+  }
+}
+#endif
 
 Comm::Comm(int p) : _p{p} {
 #if USE_NCCL
@@ -90,4 +245,27 @@ void Comm::barrier() {
   MPI_Barrier(MPI_COMM_WORLD);
 #endif
 }
+
+template <typename value_t>
+void Comm::allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const {
+#if USE_NCCL
+    NCCL_TRY(ncclAllGather((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type<value_t>(), _nccl_comm, cudaStreamDefault));
+#endif
+}
+
+template <typename value_t>
+void Comm::allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const {
+#if USE_NCCL
+    NCCL_TRY(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type<value_t>(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault));
+#endif
+}
+
+//explicit
+template void Comm::allgather<int>(size_t size, int* sendbuff, int* recvbuff) const;
+template void Comm::allgather<float>(size_t size, float* sendbuff, float* recvbuff) const;
+template void Comm::allgather<double>(size_t size, double* sendbuff, double* recvbuff) const;
+template void Comm::allreduce<int>(size_t size, int* sendbuff, int* recvbuff, ReduceOp reduce_op) const;
+template void Comm::allreduce<float>(size_t size, float* sendbuff, float* recvbuff, ReduceOp reduce_op) const;
+template void Comm::allreduce<double>(size_t size, double* sendbuff, double* recvbuff, ReduceOp reduce_op) const;
+  
 } }//namespace
diff --git a/cpp/src/comms/mpi/comms_mpi.hpp b/cpp/src/comms/mpi/comms_mpi.hpp
deleted file mode 100644
index 9b1ca8c3126..00000000000
--- a/cpp/src/comms/mpi/comms_mpi.hpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#pragma once
-
-#if USE_NCCL
-#include <mpi.h>
-#include <nccl.h>
-#endif
-
-#include <omp.h>
-#include <unistd.h>
-#include <vector>
-#include "utilities/error_utils.h"
-
-namespace cugraph { 
-namespace experimental {
-
-enum class ReduceOp { SUM, MAX, MIN };
-
-#if USE_NCCL
-/**---------------------------------------------------------------------------*
- * @brief Exception thrown when a NCCL error is encountered.
- *
- *---------------------------------------------------------------------------**/
-struct nccl_error : public std::runtime_error {
-  nccl_error(std::string const& message) : std::runtime_error(message) {}
-};
-
-inline void throw_nccl_error(ncclResult_t error, const char* file,
-                             unsigned int line) {
-  throw nccl_error(
-      std::string{"NCCL error encountered at: " + std::string{file} + ":" +
-                  std::to_string(line) + ": " + ncclGetErrorString(error)});
-}
-
-#define NCCL_TRY(call) {                                           \
-  ncclResult_t nccl_status = (call);                               \
-  if (nccl_status!= ncclSuccess) {                                 \
-    throw_nccl_error(nccl_status, __FILE__, __LINE__); \
-  }                                                                \
-} 
-
-// MPI errors are expected to be fatal before reaching this.
-// Fix me : improve when adding raft comms
-#define MPI_TRY(cmd) {                           \
-  int e = cmd;                                   \
-  if ( e != MPI_SUCCESS ) {                      \
-    CUGRAPH_FAIL("Failed: MPI error");           \
-  }                                              \
-}                                               
-
-template <typename value_t>
-constexpr MPI_Datatype get_mpi_type() {
-  if (std::is_integral<value_t>::value) {
-    if (std::is_signed<value_t>::value) {
-      if (sizeof(value_t) == 1) {
-        return MPI_INT8_T;
-      }
-      else if (sizeof(value_t) == 2) {
-        return MPI_INT16_T;
-      }
-      else if (sizeof(value_t) == 4) {
-        return MPI_INT32_T;
-      }
-      else if (sizeof(value_t) == 8) {
-        return MPI_INT64_T;
-      }
-      else {
-        CUGRAPH_FAIL("unsupported type");
-      }
-    }
-    else {
-      if (sizeof(value_t) == 1) {
-        return MPI_UINT8_T;
-      }
-      else if (sizeof(value_t) == 2) {
-        return MPI_UINT16_T;
-      }
-      else if (sizeof(value_t) == 4) {
-        return MPI_UINT32_T;
-      }
-      else if (sizeof(value_t) == 8) {
-        return MPI_UINT64_T;
-      }
-      else {
-        CUGRAPH_FAIL("unsupported type");
-      }
-    }
-  }
-  else if(std::is_same<value_t, float>::value) {
-    return MPI_FLOAT;
-  }
-  else if(std::is_same<value_t, double>::value) {
-    return MPI_DOUBLE;
-  }
-  else {
-    CUGRAPH_FAIL("unsupported type");
-  }
-}
-
-template <typename value_t>
-constexpr ncclDataType_t get_nccl_type() {
-  if (std::is_integral<value_t>::value) {
-    if (std::is_signed<value_t>::value) {
-      if (sizeof(value_t) == 1) {
-        return ncclInt8;
-      }
-      else if (sizeof(value_t) == 4) {
-        return ncclInt32;
-      }
-      else if (sizeof(value_t) == 8) {
-        return ncclInt64;
-      }
-      else {
-        CUGRAPH_FAIL("unsupported type");
-      }
-    }
-    else {
-      if (sizeof(value_t) == 1) {
-        return ncclUint8;
-      }
-      else if (sizeof(value_t) == 4) {
-        return ncclUint32;
-      }
-      else if (sizeof(value_t) == 8) {
-        return ncclUint64;
-      }
-      else {
-        CUGRAPH_FAIL("unsupported type");
-      }
-    }
-  }
-  else if(std::is_same<value_t, float>::value) {
-    return ncclFloat32;
-  }
-  else if(std::is_same<value_t, double>::value) {
-    return ncclFloat64;
-  }
-  else {
-    CUGRAPH_FAIL("unsupported type");
-  }
-}
-
-constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) {
-  if (reduce_op == ReduceOp::SUM) {
-    return MPI_SUM;
-  }
-  else if (reduce_op == ReduceOp::MAX) {
-    return MPI_MAX;
-  }
-  else if (reduce_op == ReduceOp::MIN) {
-    return MPI_MIN;
-  }
-  else {
-    CUGRAPH_FAIL("unsupported type");
-  }
-}
-
-constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) {
-  if (reduce_op == ReduceOp::SUM) {
-    return ncclSum;
-  }
-  else if (reduce_op == ReduceOp::MAX) {
-    return ncclMax;
-  }
-  else if (reduce_op == ReduceOp::MIN) {
-    return ncclMin;
-  }
-  else {
-    CUGRAPH_FAIL("unsupported type");
-  }
-}
-#endif
-
-// basic info about the snmg env setup
-class Comm 
-{ 
-  private:
-    int _p{0};
-
-    int _mpi_world_rank{0};
-    int _mpi_world_size{0};
-    bool _finalize_mpi{false};
-    bool _finalize_nccl{false};
-
-
-    int _device_id{0};
-    int _device_count{0};
-
-    std::vector<void*> _p_ipc_mems{};
-    std::vector<size_t> _local_ipc_mem_offsets{};
-
-    int _sm_count_per_device{0};
-    int _max_grid_dim_1D{0};
-    int _max_block_dim_1D{0};
-    int _l2_cache_size{0};
-    int _shared_memory_size_per_sm{0};
-
-#if USE_NCCL
-    MPI_Comm _mpi_comm{};
-    ncclComm_t _nccl_comm{};
- #endif
-   
-  public: 
-    Comm(){};
-    Comm(int p);
-    ~Comm();
-    int get_rank() const { return _mpi_world_rank; }
-    int get_p() const { return _mpi_world_size; }
-    int get_dev() const { return _device_id; }
-    int get_dev_count() const { return _device_count; }
-    int get_sm_count() const { return _sm_count_per_device; }
-    bool is_master() const { return (_mpi_world_rank == 0)? true : false; }
-
-    void barrier();
-
-    template <typename value_t>
-    void allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const;
-
-    template <typename value_t>
-    void allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const;
-
-};
-
-template <typename value_t>
-void Comm::allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const {
-#if USE_NCCL
-    NCCL_TRY(ncclAllGather((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type<value_t>(), _nccl_comm, cudaStreamDefault));
-#endif
-}
-
-template <typename value_t>
-void Comm::allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const {
-#if USE_NCCL
-    NCCL_TRY(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type<value_t>(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault));
-#endif
-}
-
-} } //namespace
diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu
index 1c7221076d4..828ccbcb94b 100644
--- a/cpp/tests/nccl/degree_test.cu
+++ b/cpp/tests/nccl/degree_test.cu
@@ -5,7 +5,7 @@
 #include <thrust/device_vector.h>
 #include <thrust/functional.h>
 #include <graph.hpp>
-#include "comms/mpi/comms_mpi.hpp"
+#include <comms_mpi.hpp>
 
 // ref Degree on the host
 template<typename idx_t>

From 1643eeebb34bdeec77ee13fbda27130d9ac2c37a Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Fri, 1 May 2020 16:41:46 -0500
Subject: [PATCH 14/19] constructor for python and fixes

---
 cpp/include/comms_mpi.hpp       | 14 +++++++-------
 cpp/src/comms/mpi/comms_mpi.cpp | 30 +++++++++++++++++++++++++-----
 cpp/src/structure/graph.cu      |  4 ++--
 3 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/cpp/include/comms_mpi.hpp b/cpp/include/comms_mpi.hpp
index c6cb5339f54..8019ebc809c 100644
--- a/cpp/include/comms_mpi.hpp
+++ b/cpp/include/comms_mpi.hpp
@@ -32,13 +32,10 @@ class Comm
 { 
   private:
     int _p{0};
-
-    int _mpi_world_rank{0};
-    int _mpi_world_size{0};
+    int _rank{0};
     bool _finalize_mpi{false};
     bool _finalize_nccl{false};
 
-
     int _device_id{0};
     int _device_count{0};
 
@@ -56,13 +53,16 @@ class Comm
   public: 
     Comm(){};
     Comm(int p);
+  #if USE_NCCL
+    Comm(ncclComm_t comm, int size, int rank);
+  #endif
     ~Comm();
-    int get_rank() const { return _mpi_world_rank; }
-    int get_p() const { return _mpi_world_size; }
+    int get_rank() const { return _rank; }
+    int get_p() const { return _p; }
     int get_dev() const { return _device_id; }
     int get_dev_count() const { return _device_count; }
     int get_sm_count() const { return _sm_count_per_device; }
-    bool is_master() const { return (_mpi_world_rank == 0)? true : false; }
+    bool is_master() const { return (_rank == 0)? true : false; }
 
     void barrier();
 
diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp
index 6a1846ee35b..5b7390abfc4 100644
--- a/cpp/src/comms/mpi/comms_mpi.cpp
+++ b/cpp/src/comms/mpi/comms_mpi.cpp
@@ -180,7 +180,7 @@ constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) {
 Comm::Comm(int p) : _p{p} {
 #if USE_NCCL
   // MPI
-  int flag{};
+  int flag{}, mpi_world_size;
 
   MPI_TRY(MPI_Initialized(&flag));
 
@@ -193,9 +193,9 @@ Comm::Comm(int p) : _p{p} {
     _finalize_mpi = true;
   }
 
-  MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &_mpi_world_rank));
-  MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &_mpi_world_size));
-  CUGRAPH_EXPECTS( (_p == _mpi_world_size), 
+  MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &_rank));
+  MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size));
+  CUGRAPH_EXPECTS( (_p == mpi_world_size), 
                    "Invalid input arguments: p should match the number of MPI processes.");
 
   _mpi_comm = MPI_COMM_WORLD;
@@ -203,7 +203,7 @@ Comm::Comm(int p) : _p{p} {
   // CUDA
 
   CUDA_TRY(cudaGetDeviceCount(&_device_count));
-  _device_id = _mpi_world_rank % _device_count;
+  _device_id = _rank % _device_count; // FixMe : assumes each node has the same number of GPUs
   CUDA_TRY(cudaSetDevice(_device_id));
 
   CUDA_TRY(
@@ -228,6 +228,26 @@ Comm::Comm(int p) : _p{p} {
 
 }
 
+#if USE_NCCL
+Comm::Comm(ncclComm_t comm, int size, int rank)
+  : _nccl_comm(comm), _p(size), _rank(rank) {
+
+  // CUDA
+  CUDA_TRY(cudaGetDeviceCount(&_device_count));
+  _device_id = _rank % _device_count; // FixMe : assumes each node has the same number of GPUs
+  CUDA_TRY(cudaSetDevice(_device_id)); // FixMe : check if this is needed or if python takes care of this
+
+  CUDA_TRY(
+    cudaDeviceGetAttribute(&_sm_count_per_device, cudaDevAttrMultiProcessorCount, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, _device_id));
+  CUDA_TRY(
+    cudaDeviceGetAttribute(
+      &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id));
+}
+#endif
+
 Comm::~Comm() {
 #if USE_NCCL
   // NCCL
diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu
index d0ade029462..26a67275d19 100644
--- a/cpp/src/structure/graph.cu
+++ b/cpp/src/structure/graph.cu
@@ -78,7 +78,7 @@ void GraphCOO<VT,ET,WT>::degree(ET *degree, DegreeDirection direction) const {
   cudaStream_t stream{nullptr};
 
   if (direction != DegreeDirection::IN) {
-    if (GraphBase<VT,ET,WT>::comm.get_p()); // FixMe retrieve global source indexing for the allreduce work
+    if (GraphBase<VT,ET,WT>::comm.get_p()) // FixMe retrieve global source indexing for the allreduce work
       CUGRAPH_FAIL("OPG degree not implemented for OUT degree");
     degree_from_vertex_ids(GraphBase<VT,ET,WT>::comm, GraphBase<VT,ET,WT>::number_of_vertices, GraphBase<VT,ET,WT>::number_of_edges, src_indices, degree, stream);
   }
@@ -99,7 +99,7 @@ void GraphCompressedSparseBase<VT,ET,WT>::degree(ET *degree, DegreeDirection dir
   cudaStream_t stream{nullptr};
 
   if (direction != DegreeDirection::IN) {
-    if (GraphBase<VT,ET,WT>::comm.get_p());
+    if (GraphBase<VT,ET,WT>::comm.get_p())
       CUGRAPH_FAIL("OPG degree not implemented for OUT degree"); // FixMe retrieve global source indexing for the allreduce to work
     degree_from_offsets(GraphBase<VT,ET,WT>::number_of_vertices, offsets, degree, stream);
   }

From 7ac19b1a1f17f71e9ff884ea2882ca3956045fb8 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Fri, 1 May 2020 16:52:42 -0500
Subject: [PATCH 15/19] naming

---
 cpp/CMakeLists.txt              |  2 +-
 cpp/include/comms_mpi.hpp       |  6 +++---
 cpp/src/comms/mpi/comms_mpi.cpp | 14 +++++++-------
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 6031dc8ccef..d1d56e94318 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -399,7 +399,7 @@ add_library(cugraph SHARED
 add_dependencies(cugraph cugunrock)
 
 if (BUILD_MPI)
-    add_compile_definitions(USE_NCCL=1)
+    add_compile_definitions(ENABLE_OPG=1)
 endif (BUILD_MPI)
 
 ###################################################################################################
diff --git a/cpp/include/comms_mpi.hpp b/cpp/include/comms_mpi.hpp
index 8019ebc809c..27944aea103 100644
--- a/cpp/include/comms_mpi.hpp
+++ b/cpp/include/comms_mpi.hpp
@@ -17,7 +17,7 @@
 
 #pragma once
 
-#if USE_NCCL
+#if ENABLE_OPG
 #include <mpi.h>
 #include <nccl.h>
 #endif
@@ -45,7 +45,7 @@ class Comm
     int _l2_cache_size{0};
     int _shared_memory_size_per_sm{0};
 
-#if USE_NCCL
+#if ENABLE_OPG
     MPI_Comm _mpi_comm{};
     ncclComm_t _nccl_comm{};
  #endif
@@ -53,7 +53,7 @@ class Comm
   public: 
     Comm(){};
     Comm(int p);
-  #if USE_NCCL
+  #if ENABLE_OPG
     Comm(ncclComm_t comm, int size, int rank);
   #endif
     ~Comm();
diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp
index 5b7390abfc4..b2fdda1a00c 100644
--- a/cpp/src/comms/mpi/comms_mpi.cpp
+++ b/cpp/src/comms/mpi/comms_mpi.cpp
@@ -22,7 +22,7 @@
 
 namespace cugraph { 
 namespace experimental {
-#if USE_NCCL
+#if ENABLE_OPG
 
 /**---------------------------------------------------------------------------*
  * @brief Exception thrown when a NCCL error is encountered.
@@ -178,7 +178,7 @@ constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) {
 #endif
 
 Comm::Comm(int p) : _p{p} {
-#if USE_NCCL
+#if ENABLE_OPG
   // MPI
   int flag{}, mpi_world_size;
 
@@ -228,7 +228,7 @@ Comm::Comm(int p) : _p{p} {
 
 }
 
-#if USE_NCCL
+#if ENABLE_OPG
 Comm::Comm(ncclComm_t comm, int size, int rank)
   : _nccl_comm(comm), _p(size), _rank(rank) {
 
@@ -249,7 +249,7 @@ Comm::Comm(ncclComm_t comm, int size, int rank)
 #endif
 
 Comm::~Comm() {
-#if USE_NCCL
+#if ENABLE_OPG
   // NCCL
   if (_finalize_nccl)
     ncclCommDestroy(_nccl_comm);
@@ -261,21 +261,21 @@ Comm::~Comm() {
 }
 
 void Comm::barrier() {
-#if USE_NCCL
+#if ENABLE_OPG
   MPI_Barrier(MPI_COMM_WORLD);
 #endif
 }
 
 template <typename value_t>
 void Comm::allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const {
-#if USE_NCCL
+#if ENABLE_OPG
     NCCL_TRY(ncclAllGather((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type<value_t>(), _nccl_comm, cudaStreamDefault));
 #endif
 }
 
 template <typename value_t>
 void Comm::allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const {
-#if USE_NCCL
+#if ENABLE_OPG
     NCCL_TRY(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type<value_t>(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault));
 #endif
 }

From 5b45a5c582acff7465e9c8d03e7df293d3856264 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Tue, 5 May 2020 15:20:07 -0500
Subject: [PATCH 16/19] Clang formating

---
 cpp/include/comms_mpi.hpp        |  78 +++++------
 cpp/include/graph.hpp            | 214 +++++++++++++++--------------
 cpp/src/comms/mpi/comms_mpi.cpp  | 229 +++++++++++++++----------------
 cpp/src/structure/graph.cu       |  87 ++++++------
 cpp/src/utilities/cuda_utils.cuh |  58 ++++----
 cpp/tests/nccl/degree_test.cu    |  94 +++++++------
 6 files changed, 390 insertions(+), 370 deletions(-)

diff --git a/cpp/include/comms_mpi.hpp b/cpp/include/comms_mpi.hpp
index 27944aea103..68fbf4f27cc 100644
--- a/cpp/include/comms_mpi.hpp
+++ b/cpp/include/comms_mpi.hpp
@@ -14,64 +14,62 @@
  * limitations under the License.
  */
 
-
 #pragma once
-
 #if ENABLE_OPG
 #include <mpi.h>
 #include <nccl.h>
 #endif
 
-namespace cugraph { 
+namespace cugraph {
 namespace experimental {
 
 enum class ReduceOp { SUM, MAX, MIN };
 
 // basic info about the snmg env setup
-class Comm 
-{ 
-  private:
-    int _p{0};
-    int _rank{0};
-    bool _finalize_mpi{false};
-    bool _finalize_nccl{false};
+class Comm {
+private:
+  int _p{0};
+  int _rank{0};
+  bool _finalize_mpi{false};
+  bool _finalize_nccl{false};
 
-    int _device_id{0};
-    int _device_count{0};
+  int _device_id{0};
+  int _device_count{0};
 
-    int _sm_count_per_device{0};
-    int _max_grid_dim_1D{0};
-    int _max_block_dim_1D{0};
-    int _l2_cache_size{0};
-    int _shared_memory_size_per_sm{0};
+  int _sm_count_per_device{0};
+  int _max_grid_dim_1D{0};
+  int _max_block_dim_1D{0};
+  int _l2_cache_size{0};
+  int _shared_memory_size_per_sm{0};
 
 #if ENABLE_OPG
-    MPI_Comm _mpi_comm{};
-    ncclComm_t _nccl_comm{};
- #endif
-   
-  public: 
-    Comm(){};
-    Comm(int p);
-  #if ENABLE_OPG
-    Comm(ncclComm_t comm, int size, int rank);
-  #endif
-    ~Comm();
-    int get_rank() const { return _rank; }
-    int get_p() const { return _p; }
-    int get_dev() const { return _device_id; }
-    int get_dev_count() const { return _device_count; }
-    int get_sm_count() const { return _sm_count_per_device; }
-    bool is_master() const { return (_rank == 0)? true : false; }
+  MPI_Comm _mpi_comm{};
+  ncclComm_t _nccl_comm{};
+#endif
 
-    void barrier();
+public:
+  Comm(){};
+  Comm(int p);
+#if ENABLE_OPG
+  Comm(ncclComm_t comm, int size, int rank);
+#endif
+  ~Comm();
+  int get_rank() const { return _rank; }
+  int get_p() const { return _p; }
+  int get_dev() const { return _device_id; }
+  int get_dev_count() const { return _device_count; }
+  int get_sm_count() const { return _sm_count_per_device; }
+  bool is_master() const { return (_rank == 0) ? true : false; }
 
-    template <typename value_t>
-    void allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const;
+  void barrier();
 
-    template <typename value_t>
-    void allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const;
+  template <typename value_t>
+  void allgather(size_t size, value_t *sendbuff, value_t *recvbuff) const;
 
+  template <typename value_t>
+  void allreduce(size_t size, value_t *sendbuff, value_t *recvbuff,
+                 ReduceOp reduce_op) const;
 };
 
-} } //namespace
+} // namespace experimental
+} // namespace cugraph
diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp
index aac5e9116a1..1c11f17311a 100644
--- a/cpp/include/graph.hpp
+++ b/cpp/include/graph.hpp
@@ -18,7 +18,7 @@
 namespace cugraph {
 namespace experimental {
 
-enum class PropType{PROP_UNDEF, PROP_FALSE, PROP_TRUE};
+enum class PropType { PROP_UNDEF, PROP_FALSE, PROP_TRUE };
 
 struct GraphProperties {
   bool directed{false};
@@ -31,10 +31,10 @@ struct GraphProperties {
 };
 
 enum class DegreeDirection {
-  IN_PLUS_OUT = 0,       ///> Compute sum of in and out degree
-  IN,                    ///> Compute in degree
-  OUT,                   ///> Compute out degree
-  DEGREE_DIRECTION_COUNT 
+  IN_PLUS_OUT = 0, ///> Compute sum of in and out degree
+  IN,              ///> Compute in degree
+  OUT,             ///> Compute out degree
+  DEGREE_DIRECTION_COUNT
 };
 
 /**
@@ -44,32 +44,29 @@ enum class DegreeDirection {
  * @tparam ET   Type of edge id
  * @tparam WT   Type of weight
  */
-template <typename VT, typename ET, typename WT>
-class GraphBase {
+template <typename VT, typename ET, typename WT> class GraphBase {
 public:
   Comm comm;
-  WT *edge_data;     ///< edge weight
-  GraphProperties          prop;
+  WT *edge_data; ///< edge weight
+  GraphProperties prop;
 
-  VT                       number_of_vertices;
-  ET                       number_of_edges;
+  VT number_of_vertices;
+  ET number_of_edges;
 
   /**
    * @brief      Fill the identifiers array with the vertex identifiers.
    *
-   * @param[out]    identifier      Pointer to device memory to store the vertex identifiers
+   * @param[out]    identifier      Pointer to device memory to store the vertex
+   * identifiers
    */
   void get_vertex_identifiers(VT *identifiers) const;
 
-  void set_communicator(Comm& comm_) {comm = comm_;}
-  
-  GraphBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_):
-    edge_data(edge_data_),
-    comm(),
-    prop(),
-    number_of_vertices(number_of_vertices_),
-    number_of_edges(number_of_edges_)
-  {}
+  void set_communicator(Comm &comm_) { comm = comm_; }
+
+  GraphBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_)
+      : edge_data(edge_data_), comm(), prop(),
+        number_of_vertices(number_of_vertices_),
+        number_of_edges(number_of_edges_) {}
 };
 
 /**
@@ -80,66 +77,73 @@ class GraphBase {
  * @tparam WT   Type of weight
  */
 template <typename VT, typename ET, typename WT>
-class GraphCOO: public GraphBase<VT, ET, WT> {
+class GraphCOO : public GraphBase<VT, ET, WT> {
 public:
-  VT *src_indices{nullptr};   ///< rowInd
-  VT *dst_indices{nullptr};   ///< colInd
+  VT *src_indices{nullptr}; ///< rowInd
+  VT *dst_indices{nullptr}; ///< colInd
 
   /**
    * @brief     Computes degree(in, out, in+out) of all the nodes of a Graph
    *
    * @throws     cugraph::logic_error when an error occurs.
    *
-   * @param[out] degree                Device array of size V (V is number of vertices) initialized to zeros.
-   *                                   Will contain the computed degree of every vertex.
+   * @param[out] degree                Device array of size V (V is number of
+   * vertices) initialized to zeros. Will contain the computed degree of every
+   * vertex.
    * @param[in]  direction             IN_PLUS_OUT, IN or OUT
    */
   void degree(ET *degree, DegreeDirection direction) const;
-  
+
   /**
    * @brief      Default constructor
    */
-  GraphCOO(): GraphBase<VT,ET,WT>(nullptr, 0, 0) {}
-  
+  GraphCOO() : GraphBase<VT, ET, WT>(nullptr, 0, 0) {}
+
   /**
    * @brief      Wrap existing arrays representing an edge list in a Graph.
    *
-   *             GraphCOO does not own the memory used to represent this graph. This
-   *             function does not allocate memory.
+   *             GraphCOO does not own the memory used to represent this graph.
+   * This function does not allocate memory.
    *
-   * @param  source_indices        This array of size E (number of edges) contains the index of the source for each edge.
-   *                               Indices must be in the range [0, V-1].
-   * @param  destination_indices   This array of size E (number of edges) contains the index of the destination for each edge.
-   *                               Indices must be in the range [0, V-1].
-   * @param  edge_data             This array size E (number of edges) contains the weight for each edge.  This array can be null
-   *                               in which case the graph is considered unweighted.
+   * @param  source_indices        This array of size E (number of edges)
+   * contains the index of the source for each edge. Indices must be in the
+   * range [0, V-1].
+   * @param  destination_indices   This array of size E (number of edges)
+   * contains the index of the destination for each edge. Indices must be in the
+   * range [0, V-1].
+   * @param  edge_data             This array size E (number of edges) contains
+   * the weight for each edge.  This array can be null in which case the graph
+   * is considered unweighted.
    * @param  number_of_vertices    The number of vertices in the graph
    * @param  number_of_edges       The number of edges in the graph
    */
   GraphCOO(VT *src_indices_, VT *dst_indices_, WT *edge_data_,
-           VT number_of_vertices_, ET number_of_edges_):
-    GraphBase<VT,ET,WT>(edge_data_, number_of_vertices_, number_of_edges_),
-    src_indices(src_indices_), dst_indices(dst_indices_)
-  {}
+           VT number_of_vertices_, ET number_of_edges_)
+      : GraphBase<VT, ET, WT>(edge_data_, number_of_vertices_,
+                              number_of_edges_),
+        src_indices(src_indices_), dst_indices(dst_indices_) {}
 };
 
 /**
- * @brief       Base class for graph stored in CSR (Compressed Sparse Row) format or CSC (Compressed Sparse Column) format
+ * @brief       Base class for graph stored in CSR (Compressed Sparse Row)
+ * format or CSC (Compressed Sparse Column) format
  *
  * @tparam VT   Type of vertex id
  * @tparam ET   Type of edge id
  * @tparam WT   Type of weight
  */
 template <typename VT, typename ET, typename WT>
-class GraphCompressedSparseBase: public GraphBase<VT,ET,WT> {
+class GraphCompressedSparseBase : public GraphBase<VT, ET, WT> {
 public:
-  ET *offsets{nullptr};       ///< CSR offsets
-  VT *indices{nullptr};       ///< CSR indices
+  ET *offsets{nullptr}; ///< CSR offsets
+  VT *indices{nullptr}; ///< CSR indices
 
   /**
-   * @brief      Fill the identifiers in the array with the source vertex identifiers
+   * @brief      Fill the identifiers in the array with the source vertex
+   * identifiers
    *
-   * @param[out]    src_indices      Pointer to device memory to store the source vertex identifiers
+   * @param[out]    src_indices      Pointer to device memory to store the
+   * source vertex identifiers
    */
   void get_source_indices(VT *src_indices) const;
 
@@ -148,35 +152,35 @@ class GraphCompressedSparseBase: public GraphBase<VT,ET,WT> {
    *
    * @throws     cugraph::logic_error when an error occurs.
    *
-   * @param[out] degree                Device array of size V (V is number of vertices) initialized to zeros.
-   *                                   Will contain the computed degree of every vertex.
-   * @param[in]  x                     Integer value indicating type of degree calculation
-   *                                      0 : in+out degree
-   *                                      1 : in-degree
-   *                                      2 : out-degree
+   * @param[out] degree                Device array of size V (V is number of
+   * vertices) initialized to zeros. Will contain the computed degree of every
+   * vertex.
+   * @param[in]  x                     Integer value indicating type of degree
+   * calculation 0 : in+out degree 1 : in-degree 2 : out-degree
    */
   void degree(ET *degree, DegreeDirection direction) const;
-  
+
   /**
    * @brief      Wrap existing arrays representing adjacency lists in a Graph.
-   *             GraphCSR does not own the memory used to represent this graph. This
-   *             function does not allocate memory.
+   *             GraphCSR does not own the memory used to represent this graph.
+   * This function does not allocate memory.
    *
-   * @param  offsets               This array of size V+1 (V is number of vertices) contains the offset of adjacency lists of every vertex.
-   *                               Offsets must be in the range [0, E] (number of edges).
-   * @param  indices               This array of size E contains the index of the destination for each edge.
-   *                               Indices must be in the range [0, V-1].
-   * @param  edge_data             This array of size E (number of edges) contains the weight for each edge.  This
-   *                               array can be null in which case the graph is considered unweighted.
+   * @param  offsets               This array of size V+1 (V is number of
+   * vertices) contains the offset of adjacency lists of every vertex. Offsets
+   * must be in the range [0, E] (number of edges).
+   * @param  indices               This array of size E contains the index of
+   * the destination for each edge. Indices must be in the range [0, V-1].
+   * @param  edge_data             This array of size E (number of edges)
+   * contains the weight for each edge.  This array can be null in which case
+   * the graph is considered unweighted.
    * @param  number_of_vertices    The number of vertices in the graph
    * @param  number_of_edges       The number of edges in the graph
    */
   GraphCompressedSparseBase(ET *offsets_, VT *indices_, WT *edge_data_,
-                            VT number_of_vertices_, ET number_of_edges_):
-    GraphBase<VT,ET,WT>(edge_data_, number_of_vertices_, number_of_edges_),
-    offsets{offsets_},
-    indices{indices_}
-  {}
+                            VT number_of_vertices_, ET number_of_edges_)
+      : GraphBase<VT, ET, WT>(edge_data_, number_of_vertices_,
+                              number_of_edges_),
+        offsets{offsets_}, indices{indices_} {}
 };
 
 /**
@@ -187,31 +191,36 @@ class GraphCompressedSparseBase: public GraphBase<VT,ET,WT> {
  * @tparam WT   Type of weight
  */
 template <typename VT, typename ET, typename WT>
-class GraphCSR: public GraphCompressedSparseBase<VT,ET,WT> {
+class GraphCSR : public GraphCompressedSparseBase<VT, ET, WT> {
 public:
   /**
    * @brief      Default constructor
    */
-  GraphCSR(): GraphCompressedSparseBase<VT,ET,WT>(nullptr, nullptr, nullptr, 0, 0) {}
-  
+  GraphCSR()
+      : GraphCompressedSparseBase<VT, ET, WT>(nullptr, nullptr, nullptr, 0, 0) {
+  }
+
   /**
    * @brief      Wrap existing arrays representing adjacency lists in a Graph.
-   *             GraphCSR does not own the memory used to represent this graph. This
-   *             function does not allocate memory.
+   *             GraphCSR does not own the memory used to represent this graph.
+   * This function does not allocate memory.
    *
-   * @param  offsets               This array of size V+1 (V is number of vertices) contains the offset of adjacency lists of every vertex.
-   *                               Offsets must be in the range [0, E] (number of edges).
-   * @param  indices               This array of size E contains the index of the destination for each edge.
-   *                               Indices must be in the range [0, V-1].
-   * @param  edge_data             This array of size E (number of edges) contains the weight for each edge.  This
-   *                               array can be null in which case the graph is considered unweighted.
+   * @param  offsets               This array of size V+1 (V is number of
+   * vertices) contains the offset of adjacency lists of every vertex. Offsets
+   * must be in the range [0, E] (number of edges).
+   * @param  indices               This array of size E contains the index of
+   * the destination for each edge. Indices must be in the range [0, V-1].
+   * @param  edge_data             This array of size E (number of edges)
+   * contains the weight for each edge.  This array can be null in which case
+   * the graph is considered unweighted.
    * @param  number_of_vertices    The number of vertices in the graph
    * @param  number_of_edges       The number of edges in the graph
    */
-  GraphCSR(ET *offsets_, VT *indices_, WT *edge_data_,
-           VT number_of_vertices_, ET number_of_edges_):
-    GraphCompressedSparseBase<VT,ET,WT>(offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_)
-  {}
+  GraphCSR(ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_,
+           ET number_of_edges_)
+      : GraphCompressedSparseBase<VT, ET, WT>(offsets_, indices_, edge_data_,
+                                              number_of_vertices_,
+                                              number_of_edges_) {}
 };
 
 /**
@@ -222,32 +231,37 @@ class GraphCSR: public GraphCompressedSparseBase<VT,ET,WT> {
  * @tparam WT   Type of weight
  */
 template <typename VT, typename ET, typename WT>
-class GraphCSC: public GraphCompressedSparseBase<VT,ET,WT> {
+class GraphCSC : public GraphCompressedSparseBase<VT, ET, WT> {
 public:
   /**
    * @brief      Default constructor
    */
-  GraphCSC(): GraphCompressedSparseBase<VT,ET,WT>(nullptr, nullptr, nullptr, 0, 0) {}
-  
+  GraphCSC()
+      : GraphCompressedSparseBase<VT, ET, WT>(nullptr, nullptr, nullptr, 0, 0) {
+  }
+
   /**
-   * @brief      Wrap existing arrays representing transposed adjacency lists in a Graph.
-   *             GraphCSC does not own the memory used to represent this graph. This
-   *             function does not allocate memory.
+   * @brief      Wrap existing arrays representing transposed adjacency lists in
+   * a Graph. GraphCSC does not own the memory used to represent this graph.
+   * This function does not allocate memory.
    *
-   * @param  offsets               This array of size V+1 (V is number of vertices) contains the offset of adjacency lists of every vertex.
-   *                               Offsets must be in the range [0, E] (number of edges).
-   * @param  indices               This array of size E contains the index of the destination for each edge.
-   *                               Indices must be in the range [0, V-1].
-   * @param  edge_data             This array of size E (number of edges) contains the weight for each edge.  This array
-   *                               can be null in which case the graph is considered unweighted.
+   * @param  offsets               This array of size V+1 (V is number of
+   * vertices) contains the offset of adjacency lists of every vertex. Offsets
+   * must be in the range [0, E] (number of edges).
+   * @param  indices               This array of size E contains the index of
+   * the destination for each edge. Indices must be in the range [0, V-1].
+   * @param  edge_data             This array of size E (number of edges)
+   * contains the weight for each edge.  This array can be null in which case
+   * the graph is considered unweighted.
    * @param  number_of_vertices    The number of vertices in the graph
    * @param  number_of_edges       The number of edges in the graph
    */
-  GraphCSC(ET *offsets_, VT *indices_, WT *edge_data_,
-           VT number_of_vertices_, ET number_of_edges_):
-    GraphCompressedSparseBase<VT,ET,WT>(offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_)
-  {}
+  GraphCSC(ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_,
+           ET number_of_edges_)
+      : GraphCompressedSparseBase<VT, ET, WT>(offsets_, indices_, edge_data_,
+                                              number_of_vertices_,
+                                              number_of_edges_) {}
 };
 
-} //namespace experimental
-} //namespace cugraph
+} // namespace experimental
+} // namespace cugraph
diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp
index b2fdda1a00c..2d761d2e2dc 100644
--- a/cpp/src/comms/mpi/comms_mpi.cpp
+++ b/cpp/src/comms/mpi/comms_mpi.cpp
@@ -14,13 +14,12 @@
  * limitations under the License.
  */
 
-
-#include <iostream>
+#include "utilities/error_utils.h"
 #include <comms_mpi.hpp>
+#include <iostream>
 #include <vector>
-#include "utilities/error_utils.h"
 
-namespace cugraph { 
+namespace cugraph {
 namespace experimental {
 #if ENABLE_OPG
 
@@ -29,119 +28,97 @@ namespace experimental {
  *
  *---------------------------------------------------------------------------**/
 struct nccl_error : public std::runtime_error {
-  nccl_error(std::string const& message) : std::runtime_error(message) {}
+  nccl_error(std::string const &message) : std::runtime_error(message) {}
 };
 
-inline void throw_nccl_error(ncclResult_t error, const char* file,
+inline void throw_nccl_error(ncclResult_t error, const char *file,
                              unsigned int line) {
   throw nccl_error(
       std::string{"NCCL error encountered at: " + std::string{file} + ":" +
                   std::to_string(line) + ": " + ncclGetErrorString(error)});
 }
 
-#define NCCL_TRY(call) {                                           \
-  ncclResult_t nccl_status = (call);                               \
-  if (nccl_status!= ncclSuccess) {                                 \
-    throw_nccl_error(nccl_status, __FILE__, __LINE__); \
-  }                                                                \
-} 
+#define NCCL_TRY(call)                                                         \
+  {                                                                            \
+    ncclResult_t nccl_status = (call);                                         \
+    if (nccl_status != ncclSuccess) {                                          \
+      throw_nccl_error(nccl_status, __FILE__, __LINE__);                       \
+    }                                                                          \
+  }
 // MPI errors are expected to be fatal before reaching this.
 // Fix me : improve when adding raft comms
-#define MPI_TRY(cmd) {                           \
-  int e = cmd;                                   \
-  if ( e != MPI_SUCCESS ) {                      \
-    CUGRAPH_FAIL("Failed: MPI error");           \
-  }                                              \
-}                                               
+#define MPI_TRY(cmd)                                                           \
+  {                                                                            \
+    int e = cmd;                                                               \
+    if (e != MPI_SUCCESS) {                                                    \
+      CUGRAPH_FAIL("Failed: MPI error");                                       \
+    }                                                                          \
+  }
 
-template <typename value_t>
-constexpr MPI_Datatype get_mpi_type() {
+template <typename value_t> constexpr MPI_Datatype get_mpi_type() {
   if (std::is_integral<value_t>::value) {
     if (std::is_signed<value_t>::value) {
       if (sizeof(value_t) == 1) {
         return MPI_INT8_T;
-      }
-      else if (sizeof(value_t) == 2) {
+      } else if (sizeof(value_t) == 2) {
         return MPI_INT16_T;
-      }
-      else if (sizeof(value_t) == 4) {
+      } else if (sizeof(value_t) == 4) {
         return MPI_INT32_T;
-      }
-      else if (sizeof(value_t) == 8) {
+      } else if (sizeof(value_t) == 8) {
         return MPI_INT64_T;
-      }
-      else {
+      } else {
         CUGRAPH_FAIL("unsupported type");
       }
-    }
-    else {
+    } else {
       if (sizeof(value_t) == 1) {
         return MPI_UINT8_T;
-      }
-      else if (sizeof(value_t) == 2) {
+      } else if (sizeof(value_t) == 2) {
         return MPI_UINT16_T;
-      }
-      else if (sizeof(value_t) == 4) {
+      } else if (sizeof(value_t) == 4) {
         return MPI_UINT32_T;
-      }
-      else if (sizeof(value_t) == 8) {
+      } else if (sizeof(value_t) == 8) {
         return MPI_UINT64_T;
-      }
-      else {
+      } else {
         CUGRAPH_FAIL("unsupported type");
       }
     }
-  }
-  else if(std::is_same<value_t, float>::value) {
+  } else if (std::is_same<value_t, float>::value) {
     return MPI_FLOAT;
-  }
-  else if(std::is_same<value_t, double>::value) {
+  } else if (std::is_same<value_t, double>::value) {
     return MPI_DOUBLE;
-  }
-  else {
+  } else {
     CUGRAPH_FAIL("unsupported type");
   }
 }
 
-template <typename value_t>
-constexpr ncclDataType_t get_nccl_type() {
+template <typename value_t> constexpr ncclDataType_t get_nccl_type() {
   if (std::is_integral<value_t>::value) {
     if (std::is_signed<value_t>::value) {
       if (sizeof(value_t) == 1) {
         return ncclInt8;
-      }
-      else if (sizeof(value_t) == 4) {
+      } else if (sizeof(value_t) == 4) {
         return ncclInt32;
-      }
-      else if (sizeof(value_t) == 8) {
+      } else if (sizeof(value_t) == 8) {
         return ncclInt64;
-      }
-      else {
+      } else {
         CUGRAPH_FAIL("unsupported type");
       }
-    }
-    else {
+    } else {
       if (sizeof(value_t) == 1) {
         return ncclUint8;
-      }
-      else if (sizeof(value_t) == 4) {
+      } else if (sizeof(value_t) == 4) {
         return ncclUint32;
-      }
-      else if (sizeof(value_t) == 8) {
+      } else if (sizeof(value_t) == 8) {
         return ncclUint64;
-      }
-      else {
+      } else {
         CUGRAPH_FAIL("unsupported type");
       }
     }
-  }
-  else if(std::is_same<value_t, float>::value) {
+  } else if (std::is_same<value_t, float>::value) {
     return ncclFloat32;
-  }
-  else if(std::is_same<value_t, double>::value) {
+  } else if (std::is_same<value_t, double>::value) {
     return ncclFloat64;
-  }
-  else {
+  } else {
     CUGRAPH_FAIL("unsupported type");
   }
 }
@@ -149,14 +126,11 @@ constexpr ncclDataType_t get_nccl_type() {
 constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) {
   if (reduce_op == ReduceOp::SUM) {
     return MPI_SUM;
-  }
-  else if (reduce_op == ReduceOp::MAX) {
+  } else if (reduce_op == ReduceOp::MAX) {
     return MPI_MAX;
-  }
-  else if (reduce_op == ReduceOp::MIN) {
+  } else if (reduce_op == ReduceOp::MIN) {
     return MPI_MIN;
-  }
-  else {
+  } else {
     CUGRAPH_FAIL("unsupported type");
   }
 }
@@ -164,14 +138,11 @@ constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) {
 constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) {
   if (reduce_op == ReduceOp::SUM) {
     return ncclSum;
-  }
-  else if (reduce_op == ReduceOp::MAX) {
+  } else if (reduce_op == ReduceOp::MAX) {
     return ncclMax;
-  }
-  else if (reduce_op == ReduceOp::MIN) {
+  } else if (reduce_op == ReduceOp::MIN) {
     return ncclMin;
-  }
-  else {
+  } else {
     CUGRAPH_FAIL("unsupported type");
   }
 }
@@ -195,25 +166,31 @@ Comm::Comm(int p) : _p{p} {
 
   MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &_rank));
   MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size));
-  CUGRAPH_EXPECTS( (_p == mpi_world_size), 
-                   "Invalid input arguments: p should match the number of MPI processes.");
+  CUGRAPH_EXPECTS(
+      (_p == mpi_world_size),
+      "Invalid input arguments: p should match the number of MPI processes.");
 
   _mpi_comm = MPI_COMM_WORLD;
 
   // CUDA
 
   CUDA_TRY(cudaGetDeviceCount(&_device_count));
-  _device_id = _rank % _device_count; // FixMe : assumes each node has the same number of GPUs
+  _device_id =
+      _rank %
+      _device_count; // FixMe : assumes each node has the same number of GPUs
   CUDA_TRY(cudaSetDevice(_device_id));
 
-  CUDA_TRY(
-    cudaDeviceGetAttribute(&_sm_count_per_device, cudaDevAttrMultiProcessorCount, _device_id));
-  CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, _device_id));
-  CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, _device_id));
-  CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, _device_id));
-  CUDA_TRY(
-    cudaDeviceGetAttribute(
-      &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_sm_count_per_device,
+                                  cudaDevAttrMultiProcessorCount, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX,
+                                  _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX,
+                                  _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize,
+                                  _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_shared_memory_size_per_sm,
+                                  cudaDevAttrMaxSharedMemoryPerMultiprocessor,
+                                  _device_id));
 
   // NCCL
 
@@ -221,30 +198,37 @@ Comm::Comm(int p) : _p{p} {
   if (get_rank() == 0) {
     NCCL_TRY(ncclGetUniqueId(&nccl_unique_id_p));
   }
-  MPI_TRY(MPI_Bcast(&nccl_unique_id_p, sizeof(ncclUniqueId), MPI_BYTE, 0, _mpi_comm));
-  NCCL_TRY(ncclCommInitRank(&_nccl_comm, get_p(), nccl_unique_id_p, get_rank()));
+  MPI_TRY(MPI_Bcast(&nccl_unique_id_p, sizeof(ncclUniqueId), MPI_BYTE, 0,
+                    _mpi_comm));
+  NCCL_TRY(
+      ncclCommInitRank(&_nccl_comm, get_p(), nccl_unique_id_p, get_rank()));
   _finalize_nccl = true;
 #endif
-
 }
 
 #if ENABLE_OPG
 Comm::Comm(ncclComm_t comm, int size, int rank)
-  : _nccl_comm(comm), _p(size), _rank(rank) {
+    : _nccl_comm(comm), _p(size), _rank(rank) {
 
   // CUDA
   CUDA_TRY(cudaGetDeviceCount(&_device_count));
-  _device_id = _rank % _device_count; // FixMe : assumes each node has the same number of GPUs
-  CUDA_TRY(cudaSetDevice(_device_id)); // FixMe : check if this is needed or if python takes care of this
+  _device_id =
+      _rank %
+      _device_count; // FixMe : assumes each node has the same number of GPUs
+  CUDA_TRY(cudaSetDevice(_device_id)); // FixMe : check if this is needed or if
+                                       // python takes care of this
 
-  CUDA_TRY(
-    cudaDeviceGetAttribute(&_sm_count_per_device, cudaDevAttrMultiProcessorCount, _device_id));
-  CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, _device_id));
-  CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, _device_id));
-  CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, _device_id));
-  CUDA_TRY(
-    cudaDeviceGetAttribute(
-      &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_sm_count_per_device,
+                                  cudaDevAttrMultiProcessorCount, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX,
+                                  _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX,
+                                  _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize,
+                                  _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_shared_memory_size_per_sm,
+                                  cudaDevAttrMaxSharedMemoryPerMultiprocessor,
+                                  _device_id));
 }
 #endif
 
@@ -267,25 +251,38 @@ void Comm::barrier() {
 }
 
 template <typename value_t>
-void Comm::allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const {
+void Comm::allgather(size_t size, value_t *sendbuff, value_t *recvbuff) const {
 #if ENABLE_OPG
-    NCCL_TRY(ncclAllGather((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type<value_t>(), _nccl_comm, cudaStreamDefault));
+  NCCL_TRY(ncclAllGather((const void *)sendbuff, (void *)recvbuff, size,
+                         get_nccl_type<value_t>(), _nccl_comm,
+                         cudaStreamDefault));
 #endif
 }
 
 template <typename value_t>
-void Comm::allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const {
+void Comm::allreduce(size_t size, value_t *sendbuff, value_t *recvbuff,
+                     ReduceOp reduce_op) const {
 #if ENABLE_OPG
-    NCCL_TRY(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type<value_t>(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault));
+  NCCL_TRY(ncclAllReduce(
+      (const void *)sendbuff, (void *)recvbuff, size, get_nccl_type<value_t>(),
+      get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault));
 #endif
 }
 
-//explicit
-template void Comm::allgather<int>(size_t size, int* sendbuff, int* recvbuff) const;
-template void Comm::allgather<float>(size_t size, float* sendbuff, float* recvbuff) const;
-template void Comm::allgather<double>(size_t size, double* sendbuff, double* recvbuff) const;
-template void Comm::allreduce<int>(size_t size, int* sendbuff, int* recvbuff, ReduceOp reduce_op) const;
-template void Comm::allreduce<float>(size_t size, float* sendbuff, float* recvbuff, ReduceOp reduce_op) const;
-template void Comm::allreduce<double>(size_t size, double* sendbuff, double* recvbuff, ReduceOp reduce_op) const;
-  
-} }//namespace
+// explicit
+template void Comm::allgather<int>(size_t size, int *sendbuff,
+                                   int *recvbuff) const;
+template void Comm::allgather<float>(size_t size, float *sendbuff,
+                                     float *recvbuff) const;
+template void Comm::allgather<double>(size_t size, double *sendbuff,
+                                      double *recvbuff) const;
+template void Comm::allreduce<int>(size_t size, int *sendbuff, int *recvbuff,
+                                   ReduceOp reduce_op) const;
+template void Comm::allreduce<float>(size_t size, float *sendbuff,
+                                     float *recvbuff, ReduceOp reduce_op) const;
+template void Comm::allreduce<double>(size_t size, double *sendbuff,
+                                      double *recvbuff,
+                                      ReduceOp reduce_op) const;
+
+} // namespace experimental
+} // namespace cugraph
diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu
index 26a67275d19..9c5b0e1a77e 100644
--- a/cpp/src/structure/graph.cu
+++ b/cpp/src/structure/graph.cu
@@ -1,4 +1,4 @@
- /*
+/*
  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  *
  * NVIDIA CORPORATION and its licensors retain all intellectual property
@@ -9,66 +9,63 @@
  *
  */
 
-#include <graph.hpp>
-#include "utilities/graph_utils.cuh"
-#include "utilities/error_utils.h"
 #include "utilities/cuda_utils.cuh"
+#include "utilities/error_utils.h"
+#include "utilities/graph_utils.cuh"
 
-
+#include <graph.hpp>
 
 namespace {
 
 template <typename vertex_t, typename edge_t>
-void degree_from_offsets(vertex_t number_of_vertices,
-                         edge_t const *offsets,
-                         edge_t *degree,
-                         cudaStream_t stream) {
+void degree_from_offsets(vertex_t number_of_vertices, edge_t const *offsets,
+                         edge_t *degree, cudaStream_t stream) {
 
   // Computes out-degree for x = 0 and x = 2
   thrust::for_each(rmm::exec_policy(stream)->on(stream),
                    thrust::make_counting_iterator<vertex_t>(0),
                    thrust::make_counting_iterator<vertex_t>(number_of_vertices),
-                   [offsets, degree] __device__ (vertex_t v) {
-                     degree[v] = offsets[v+1]-offsets[v];
+                   [offsets, degree] __device__(vertex_t v) {
+                     degree[v] = offsets[v + 1] - offsets[v];
                    });
 }
 
 template <typename vertex_t, typename edge_t>
-void degree_from_vertex_ids(const cugraph::experimental::Comm& comm,
-                            vertex_t number_of_vertices,
-                            edge_t number_of_edges,
-                            vertex_t const *indices,
-                            edge_t *degree,
+void degree_from_vertex_ids(const cugraph::experimental::Comm &comm,
+                            vertex_t number_of_vertices, edge_t number_of_edges,
+                            vertex_t const *indices, edge_t *degree,
                             cudaStream_t stream) {
 
   thrust::for_each(rmm::exec_policy(stream)->on(stream),
                    thrust::make_counting_iterator<edge_t>(0),
                    thrust::make_counting_iterator<edge_t>(number_of_edges),
-                   [indices, degree] __device__ (edge_t e) {
+                   [indices, degree] __device__(edge_t e) {
                      cugraph::atomicAdd(degree + indices[e], 1);
                    });
-  comm.allreduce(number_of_vertices, degree, degree, cugraph::experimental::ReduceOp::SUM);
+  comm.allreduce(number_of_vertices, degree, degree,
+                 cugraph::experimental::ReduceOp::SUM);
 }
 
-} //namespace anonymous
+} // namespace
 
 namespace cugraph {
 namespace experimental {
 
-
 template <typename VT, typename ET, typename WT>
-void GraphBase<VT,ET,WT>::get_vertex_identifiers(VT *identifiers) const {
+void GraphBase<VT, ET, WT>::get_vertex_identifiers(VT *identifiers) const {
   cugraph::detail::sequence<VT>(number_of_vertices, identifiers);
 }
 
 template <typename VT, typename ET, typename WT>
-void GraphCompressedSparseBase<VT,ET,WT>::get_source_indices(VT *src_indices) const {
-  CUGRAPH_EXPECTS( offsets != nullptr , "No graph specified");
-  cugraph::detail::offsets_to_indices<VT>(offsets, GraphBase<VT,ET,WT>::number_of_vertices, src_indices);
+void GraphCompressedSparseBase<VT, ET, WT>::get_source_indices(
+    VT *src_indices) const {
+  CUGRAPH_EXPECTS(offsets != nullptr, "No graph specified");
+  cugraph::detail::offsets_to_indices<VT>(
+      offsets, GraphBase<VT, ET, WT>::number_of_vertices, src_indices);
 }
 
 template <typename VT, typename ET, typename WT>
-void GraphCOO<VT,ET,WT>::degree(ET *degree, DegreeDirection direction) const {
+void GraphCOO<VT, ET, WT>::degree(ET *degree, DegreeDirection direction) const {
   //
   // NOTE:  We assume offsets/indices are a CSR.  If a CSC is passed
   //        in then x should be modified to reflect the expected direction.
@@ -78,18 +75,24 @@ void GraphCOO<VT,ET,WT>::degree(ET *degree, DegreeDirection direction) const {
   cudaStream_t stream{nullptr};
 
   if (direction != DegreeDirection::IN) {
-    if (GraphBase<VT,ET,WT>::comm.get_p()) // FixMe retrieve global source indexing for the allreduce work
+    if (GraphBase<VT, ET, WT>::comm.get_p()) // FixMe retrieve global source
+                                             // indexing for the allreduce work
       CUGRAPH_FAIL("OPG degree not implemented for OUT degree");
-    degree_from_vertex_ids(GraphBase<VT,ET,WT>::comm, GraphBase<VT,ET,WT>::number_of_vertices, GraphBase<VT,ET,WT>::number_of_edges, src_indices, degree, stream);
+    degree_from_vertex_ids(
+        GraphBase<VT, ET, WT>::comm, GraphBase<VT, ET, WT>::number_of_vertices,
+        GraphBase<VT, ET, WT>::number_of_edges, src_indices, degree, stream);
   }
 
   if (direction != DegreeDirection::OUT) {
-    degree_from_vertex_ids(GraphBase<VT,ET,WT>::comm, GraphBase<VT,ET,WT>::number_of_vertices, GraphBase<VT,ET,WT>::number_of_edges, dst_indices, degree, stream);
+    degree_from_vertex_ids(
+        GraphBase<VT, ET, WT>::comm, GraphBase<VT, ET, WT>::number_of_vertices,
+        GraphBase<VT, ET, WT>::number_of_edges, dst_indices, degree, stream);
   }
 }
 
 template <typename VT, typename ET, typename WT>
-void GraphCompressedSparseBase<VT,ET,WT>::degree(ET *degree, DegreeDirection direction) const {
+void GraphCompressedSparseBase<VT, ET, WT>::degree(
+    ET *degree, DegreeDirection direction) const {
   //
   // NOTE:  We assume offsets/indices are a CSR.  If a CSC is passed
   //        in then x should be modified to reflect the expected direction.
@@ -99,22 +102,28 @@ void GraphCompressedSparseBase<VT,ET,WT>::degree(ET *degree, DegreeDirection dir
   cudaStream_t stream{nullptr};
 
   if (direction != DegreeDirection::IN) {
-    if (GraphBase<VT,ET,WT>::comm.get_p())
-      CUGRAPH_FAIL("OPG degree not implemented for OUT degree"); // FixMe retrieve global source indexing for the allreduce to work
-    degree_from_offsets(GraphBase<VT,ET,WT>::number_of_vertices, offsets, degree, stream);
+    if (GraphBase<VT, ET, WT>::comm.get_p())
+      CUGRAPH_FAIL(
+          "OPG degree not implemented for OUT degree"); // FixMe retrieve global
+                                                        // source indexing for
+                                                        // the allreduce to work
+    degree_from_offsets(GraphBase<VT, ET, WT>::number_of_vertices, offsets,
+                        degree, stream);
   }
 
   if (direction != DegreeDirection::OUT) {
-    degree_from_vertex_ids(GraphBase<VT,ET,WT>::comm, GraphBase<VT,ET,WT>::number_of_vertices, GraphBase<VT,ET,WT>::number_of_edges, indices, degree, stream);
+    degree_from_vertex_ids(
+        GraphBase<VT, ET, WT>::comm, GraphBase<VT, ET, WT>::number_of_vertices,
+        GraphBase<VT, ET, WT>::number_of_edges, indices, degree, stream);
   }
 }
 
 // explicit instantiation
 template class GraphBase<int32_t, int32_t, float>;
 template class GraphBase<int32_t, int32_t, double>;
-template class GraphCOO<int32_t,int32_t,float>;
-template class GraphCOO<int32_t,int32_t,double>;
-template class GraphCompressedSparseBase<int32_t,int32_t,float>;
-template class GraphCompressedSparseBase<int32_t,int32_t,double>;
-}
-}
+template class GraphCOO<int32_t, int32_t, float>;
+template class GraphCOO<int32_t, int32_t, double>;
+template class GraphCompressedSparseBase<int32_t, int32_t, float>;
+template class GraphCompressedSparseBase<int32_t, int32_t, double>;
+} // namespace experimental
+} // namespace cugraph
diff --git a/cpp/src/utilities/cuda_utils.cuh b/cpp/src/utilities/cuda_utils.cuh
index fe581af914d..9dbf4568c97 100644
--- a/cpp/src/utilities/cuda_utils.cuh
+++ b/cpp/src/utilities/cuda_utils.cuh
@@ -15,50 +15,54 @@
  */
 #pragma once
 
+#include <thrust/extrema.h>
+
 namespace cugraph {
 //
 //  This should go into RAFT...
 //
-__device__ static __forceinline__ int64_t atomicMin(int64_t* addr, int64_t val) {
-  unsigned long long *addr_as_ull{reinterpret_cast<unsigned long long*>(addr)};
-  unsigned long long *val_addr_as_ull{reinterpret_cast<unsigned long long*>(&val)};
-  unsigned long long  old = *addr_as_ull;
-  unsigned long long  val_as_ull = *val_addr_as_ull;
-  int64_t            *p_old{reinterpret_cast<int64_t*>(&old)};
-  unsigned long long  expected;
+__device__ static __forceinline__ int64_t atomicMin(int64_t *addr,
+                                                    int64_t val) {
+  unsigned long long *addr_as_ull{reinterpret_cast<unsigned long long *>(addr)};
+  unsigned long long *val_addr_as_ull{
+      reinterpret_cast<unsigned long long *>(&val)};
+  unsigned long long old = *addr_as_ull;
+  unsigned long long val_as_ull = *val_addr_as_ull;
+  int64_t *p_old{reinterpret_cast<int64_t *>(&old)};
+  unsigned long long expected;
 
   do {
-      expected = old;
-      old = ::atomicCAS(addr_as_ull,
-                        expected,
-                        thrust::min(val_as_ull, expected));
-    } while (expected != old);
+    expected = old;
+    old = ::atomicCAS(addr_as_ull, expected, thrust::min(val_as_ull, expected));
+  } while (expected != old);
   return *p_old;
 }
 
-__device__ static __forceinline__ int32_t atomicMin(int32_t* addr, int32_t val) {
+__device__ static __forceinline__ int32_t atomicMin(int32_t *addr,
+                                                    int32_t val) {
   return ::atomicMin(addr, val);
 }
 
-__device__ static __forceinline__ int64_t atomicAdd(int64_t* addr, int64_t val) {
-  unsigned long long *addr_as_ull{reinterpret_cast<unsigned long long*>(addr)};
-  unsigned long long *val_addr_as_ull{reinterpret_cast<unsigned long long*>(&val)};
-  unsigned long long  old = *addr_as_ull;
-  unsigned long long  val_as_ull = *val_addr_as_ull;
-  int64_t            *p_old{reinterpret_cast<int64_t*>(&old)};
-  unsigned long long  expected;
+__device__ static __forceinline__ int64_t atomicAdd(int64_t *addr,
+                                                    int64_t val) {
+  unsigned long long *addr_as_ull{reinterpret_cast<unsigned long long *>(addr)};
+  unsigned long long *val_addr_as_ull{
+      reinterpret_cast<unsigned long long *>(&val)};
+  unsigned long long old = *addr_as_ull;
+  unsigned long long val_as_ull = *val_addr_as_ull;
+  int64_t *p_old{reinterpret_cast<int64_t *>(&old)};
+  unsigned long long expected;
 
   do {
-      expected = old;
-      old = ::atomicCAS(addr_as_ull,
-                        expected,
-                        (expected + val_as_ull));
-    } while (expected != old);
+    expected = old;
+    old = ::atomicCAS(addr_as_ull, expected, (expected + val_as_ull));
+  } while (expected != old);
   return *p_old;
 }
 
-__device__ static __forceinline__ int32_t atomicAdd(int32_t* addr, int32_t val) {
+__device__ static __forceinline__ int32_t atomicAdd(int32_t *addr,
+                                                    int32_t val) {
   return ::atomicAdd(addr, val);
 }
 
-} //namespace cugraph
+} // namespace cugraph
diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu
index 828ccbcb94b..5041cf94528 100644
--- a/cpp/tests/nccl/degree_test.cu
+++ b/cpp/tests/nccl/degree_test.cu
@@ -1,16 +1,15 @@
+#include "test_utils.h"
 #include "gtest/gtest.h"
+#include <comms_mpi.hpp>
 #include <cugraph.h>
-#include "test_utils.h"
+#include <graph.hpp>
 #include <string.h>
 #include <thrust/device_vector.h>
 #include <thrust/functional.h>
-#include <graph.hpp>
-#include <comms_mpi.hpp>
 
 // ref Degree on the host
-template<typename idx_t>
-void ref_degree_h(std::vector<idx_t> & ind_h,
-                  std::vector<idx_t> & degree) {
+template <typename idx_t>
+void ref_degree_h(std::vector<idx_t> &ind_h, std::vector<idx_t> &degree) {
   for (size_t i = 0; i < degree.size(); i++)
     degree[i] = 0;
   for (size_t i = 0; i < ind_h.size(); i++)
@@ -18,8 +17,7 @@ void ref_degree_h(std::vector<idx_t> & ind_h,
 }
 
 // global to local offsets by shifting all offsets by the first offset value
-template <typename T>
-void shift_by_front(std::vector<T> & v) {
+template <typename T> void shift_by_front(std::vector<T> &v) {
   auto start = v.front();
   for (auto i = size_t{0}; i < v.size(); ++i)
     v[i] -= start;
@@ -27,16 +25,17 @@ void shift_by_front(std::vector<T> & v) {
 
 // 1D partitioning such as each GPU has about the same number of edges
 template <typename T>
-void opg_edge_partioning(int r, int p, std::vector<T> & ind_h, std::vector<size_t> & part_offset,  size_t & e_loc) {
+void opg_edge_partioning(int r, int p, std::vector<T> &ind_h,
+                         std::vector<size_t> &part_offset, size_t &e_loc) {
 
-  //set first and last partition offsets
+  // set first and last partition offsets
   part_offset[0] = 0;
   part_offset[p] = ind_h.size();
-  //part_offset[p] = *(std::max_element(ind_h.begin(), ind_h.end()));
-  auto loc_nnz = ind_h.size()/p;
-  for (int i=1; i<p; i++) {
-    //get the first vertex ID of each partition
-    auto start_nnz = i*loc_nnz;
+  // part_offset[p] = *(std::max_element(ind_h.begin(), ind_h.end()));
+  auto loc_nnz = ind_h.size() / p;
+  for (int i = 1; i < p; i++) {
+    // get the first vertex ID of each partition
+    auto start_nnz = i * loc_nnz;
     auto start_v = 0;
     for (auto j = size_t{0}; j < ind_h.size(); ++j) {
       if (j >= start_nnz) {
@@ -46,20 +45,17 @@ void opg_edge_partioning(int r, int p, std::vector<T> & ind_h, std::vector<size_
     }
     part_offset[i] = start_v;
   }
-  e_loc = part_offset[r+1] - part_offset[r];
+  e_loc = part_offset[r + 1] - part_offset[r];
 }
-TEST(degree, success)
-{
+TEST(degree, success) {
   int v = 6;
 
-  //host
-  std::vector<int> src_h= {0, 0, 2, 2, 2, 3, 3, 4, 4, 5, 5}, 
-                   dest_h={1, 2, 0, 1, 4, 4, 5, 3, 5, 3, 1};
+  // host
+  std::vector<int> src_h = {0, 0, 2, 2, 2, 3, 3, 4, 4, 5, 5},
+                   dest_h = {1, 2, 0, 1, 4, 4, 5, 3, 5, 3, 1};
   std::vector<int> degree_h(v, 0.0), degree_ref(v, 0.0);
 
-
-
-  //MG
+  // MG
   int p;
   MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &p));
   cugraph::experimental::Comm comm(p);
@@ -68,49 +64,51 @@ TEST(degree, success)
   size_t e_loc;
 
   opg_edge_partioning(i, p, src_h, part_offset, e_loc);
-  #ifdef OPG_VERBOSE
+#ifdef OPG_VERBOSE
   sleep(i);
   for (auto j = part_offset.begin(); j != part_offset.end(); ++j)
-      std::cout << *j << ' ';
+    std::cout << *j << ' ';
   std::cout << std::endl;
-  std::cout<< "eloc: "<< e_loc <<std::endl;
-  #endif
-  std::vector<int> src_loc_h(src_h.begin()+part_offset[i], src_h.begin()+part_offset[i]+e_loc),
-                   dest_loc_h(dest_h.begin()+part_offset[i], dest_h.begin()+part_offset[i]+e_loc);
+  std::cout << "eloc: " << e_loc << std::endl;
+#endif
+  std::vector<int> src_loc_h(src_h.begin() + part_offset[i],
+                             src_h.begin() + part_offset[i] + e_loc),
+      dest_loc_h(dest_h.begin() + part_offset[i],
+                 dest_h.begin() + part_offset[i] + e_loc);
   shift_by_front(src_loc_h);
 
-
   // print mg info
-  printf("#   Rank %2d - Pid %6d - device %2d\n", comm.get_rank(), getpid(), comm.get_dev());
+  printf("#   Rank %2d - Pid %6d - device %2d\n", comm.get_rank(), getpid(),
+         comm.get_dev());
 
-  //local device
+  // local device
   thrust::device_vector<int> src_d(src_loc_h.begin(), src_loc_h.end());
   thrust::device_vector<int> dest_d(dest_loc_h.begin(), dest_loc_h.end());
   thrust::device_vector<int> degree_d(v);
 
   // load local chunck to cugraph
-  cugraph::experimental::GraphCOO<int,int,float> G(thrust::raw_pointer_cast(src_d.data()), 
-                                                   thrust::raw_pointer_cast(dest_d.data()), 
-                                                   nullptr, degree_h.size(), e_loc);
+  cugraph::experimental::GraphCOO<int, int, float> G(
+      thrust::raw_pointer_cast(src_d.data()),
+      thrust::raw_pointer_cast(dest_d.data()), nullptr, degree_h.size(), e_loc);
   G.set_communicator(comm);
 
   // OUT degree
-  G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::IN);
+  G.degree(thrust::raw_pointer_cast(degree_d.data()),
+           cugraph::experimental::DegreeDirection::IN);
   thrust::copy(degree_d.begin(), degree_d.end(), degree_h.begin());
   ref_degree_h(dest_h, degree_ref);
-  //sleep(i);
+  // sleep(i);
   for (size_t j = 0; j < degree_h.size(); ++j)
     EXPECT_EQ(degree_ref[j], degree_h[j]);
-    std::cout<< "Rank "<< i << " done checking." <<std::endl;
+  std::cout << "Rank " << i << " done checking." << std::endl;
 }
 
-int main( int argc, char** argv )
-{
-    testing::InitGoogleTest(&argc,argv);
-    MPI_Init(&argc, &argv);
-    rmmInitialize(nullptr);
-    int rc = RUN_ALL_TESTS();
-    rmmFinalize();
-    MPI_Finalize();
-    return rc;
+int main(int argc, char **argv) {
+  testing::InitGoogleTest(&argc, argv);
+  MPI_Init(&argc, &argv);
+  rmmInitialize(nullptr);
+  int rc = RUN_ALL_TESTS();
+  rmmFinalize();
+  MPI_Finalize();
+  return rc;
 }

From f4c407cb283b91820cd7a5ac0853aa6d3d6ade52 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Tue, 5 May 2020 15:38:44 -0500
Subject: [PATCH 17/19] fixmes and copyright

---
 cpp/include/comms_mpi.hpp       |  2 +-
 cpp/src/comms/mpi/comms_mpi.cpp | 10 +++++-----
 cpp/src/structure/graph.cu      | 21 +++++++++++++--------
 cpp/tests/nccl/degree_test.cu   | 17 +++++++++++++++++
 4 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/cpp/include/comms_mpi.hpp b/cpp/include/comms_mpi.hpp
index 68fbf4f27cc..dd32041f2f4 100644
--- a/cpp/include/comms_mpi.hpp
+++ b/cpp/include/comms_mpi.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp
index 2d761d2e2dc..f02d207789b 100644
--- a/cpp/src/comms/mpi/comms_mpi.cpp
+++ b/cpp/src/comms/mpi/comms_mpi.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -46,7 +46,7 @@ inline void throw_nccl_error(ncclResult_t error, const char *file,
     }                                                                          \
   }
 // MPI errors are expected to be fatal before reaching this.
-// Fix me : improve when adding raft comms
+// FIXME : improve when adding raft comms
 #define MPI_TRY(cmd)                                                           \
   {                                                                            \
     int e = cmd;                                                               \
@@ -177,7 +177,7 @@ Comm::Comm(int p) : _p{p} {
   CUDA_TRY(cudaGetDeviceCount(&_device_count));
   _device_id =
       _rank %
-      _device_count; // FixMe : assumes each node has the same number of GPUs
+      _device_count; // FIXME : assumes each node has the same number of GPUs
   CUDA_TRY(cudaSetDevice(_device_id));
 
   CUDA_TRY(cudaDeviceGetAttribute(&_sm_count_per_device,
@@ -214,8 +214,8 @@ Comm::Comm(ncclComm_t comm, int size, int rank)
   CUDA_TRY(cudaGetDeviceCount(&_device_count));
   _device_id =
       _rank %
-      _device_count; // FixMe : assumes each node has the same number of GPUs
-  CUDA_TRY(cudaSetDevice(_device_id)); // FixMe : check if this is needed or if
+      _device_count; // FIXME : assumes each node has the same number of GPUs
+  CUDA_TRY(cudaSetDevice(_device_id)); // FIXME : check if this is needed or if
                                        // python takes care of this
 
   CUDA_TRY(cudaDeviceGetAttribute(&_sm_count_per_device,
diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu
index 9c5b0e1a77e..fb3e8e23b89 100644
--- a/cpp/src/structure/graph.cu
+++ b/cpp/src/structure/graph.cu
@@ -1,12 +1,17 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2020, NVIDIA CORPORATION.
  *
- * NVIDIA CORPORATION and its licensors retain all intellectual property
- * and proprietary rights in and to this software, related documentation
- * and any modifications thereto.  Any use, reproduction, disclosure or
- * distribution of this software and related documentation without an express
- * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 #include "utilities/cuda_utils.cuh"
@@ -75,7 +80,7 @@ void GraphCOO<VT, ET, WT>::degree(ET *degree, DegreeDirection direction) const {
   cudaStream_t stream{nullptr};
 
   if (direction != DegreeDirection::IN) {
-    if (GraphBase<VT, ET, WT>::comm.get_p()) // FixMe retrieve global source
+    if (GraphBase<VT, ET, WT>::comm.get_p()) // FIXME retrieve global source
                                              // indexing for the allreduce work
       CUGRAPH_FAIL("OPG degree not implemented for OUT degree");
     degree_from_vertex_ids(
@@ -104,7 +109,7 @@ void GraphCompressedSparseBase<VT, ET, WT>::degree(
   if (direction != DegreeDirection::IN) {
     if (GraphBase<VT, ET, WT>::comm.get_p())
       CUGRAPH_FAIL(
-          "OPG degree not implemented for OUT degree"); // FixMe retrieve global
+          "OPG degree not implemented for OUT degree"); // FIXME retrieve global
                                                         // source indexing for
                                                         // the allreduce to work
     degree_from_offsets(GraphBase<VT, ET, WT>::number_of_vertices, offsets,
diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu
index 5041cf94528..619fcad7c62 100644
--- a/cpp/tests/nccl/degree_test.cu
+++ b/cpp/tests/nccl/degree_test.cu
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include "test_utils.h"
 #include "gtest/gtest.h"
 #include <comms_mpi.hpp>
@@ -11,6 +27,7 @@
 template <typename idx_t>
 void ref_degree_h(std::vector<idx_t> &ind_h, std::vector<idx_t> &degree) {
   for (size_t i = 0; i < degree.size(); i++)
+
     degree[i] = 0;
   for (size_t i = 0; i < ind_h.size(); i++)
     degree[ind_h[i]] += 1;

From 01ba016716e579e9c57511f33c33509e9c817514 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Tue, 5 May 2020 16:01:23 -0500
Subject: [PATCH 18/19] clang2

---
 cpp/include/comms_mpi.hpp       |  11 +-
 cpp/include/graph.hpp           |  95 +++++++++--------
 cpp/src/comms/mpi/comms_mpi.cpp | 183 +++++++++++++++-----------------
 cpp/src/structure/graph.cu      | 106 +++++++++---------
 cpp/tests/nccl/degree_test.cu   |  64 ++++++-----
 5 files changed, 231 insertions(+), 228 deletions(-)

diff --git a/cpp/include/comms_mpi.hpp b/cpp/include/comms_mpi.hpp
index dd32041f2f4..c414a043efa 100644
--- a/cpp/include/comms_mpi.hpp
+++ b/cpp/include/comms_mpi.hpp
@@ -27,7 +27,7 @@ enum class ReduceOp { SUM, MAX, MIN };
 
 // basic info about the snmg env setup
 class Comm {
-private:
+ private:
   int _p{0};
   int _rank{0};
   bool _finalize_mpi{false};
@@ -47,7 +47,7 @@ class Comm {
   ncclComm_t _nccl_comm{};
 #endif
 
-public:
+ public:
   Comm(){};
   Comm(int p);
 #if ENABLE_OPG
@@ -67,9 +67,8 @@ class Comm {
   void allgather(size_t size, value_t *sendbuff, value_t *recvbuff) const;
 
   template <typename value_t>
-  void allreduce(size_t size, value_t *sendbuff, value_t *recvbuff,
-                 ReduceOp reduce_op) const;
+  void allreduce(size_t size, value_t *sendbuff, value_t *recvbuff, ReduceOp reduce_op) const;
 };
 
-} // namespace experimental
-} // namespace cugraph
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp
index 1c11f17311a..33dd081361f 100644
--- a/cpp/include/graph.hpp
+++ b/cpp/include/graph.hpp
@@ -31,9 +31,9 @@ struct GraphProperties {
 };
 
 enum class DegreeDirection {
-  IN_PLUS_OUT = 0, ///> Compute sum of in and out degree
-  IN,              ///> Compute in degree
-  OUT,             ///> Compute out degree
+  IN_PLUS_OUT = 0,  ///> Compute sum of in and out degree
+  IN,               ///> Compute in degree
+  OUT,              ///> Compute out degree
   DEGREE_DIRECTION_COUNT
 };
 
@@ -44,10 +44,11 @@ enum class DegreeDirection {
  * @tparam ET   Type of edge id
  * @tparam WT   Type of weight
  */
-template <typename VT, typename ET, typename WT> class GraphBase {
-public:
+template <typename VT, typename ET, typename WT>
+class GraphBase {
+ public:
   Comm comm;
-  WT *edge_data; ///< edge weight
+  WT *edge_data;  ///< edge weight
   GraphProperties prop;
 
   VT number_of_vertices;
@@ -64,9 +65,13 @@ template <typename VT, typename ET, typename WT> class GraphBase {
   void set_communicator(Comm &comm_) { comm = comm_; }
 
   GraphBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_)
-      : edge_data(edge_data_), comm(), prop(),
-        number_of_vertices(number_of_vertices_),
-        number_of_edges(number_of_edges_) {}
+    : edge_data(edge_data_),
+      comm(),
+      prop(),
+      number_of_vertices(number_of_vertices_),
+      number_of_edges(number_of_edges_)
+  {
+  }
 };
 
 /**
@@ -78,9 +83,9 @@ template <typename VT, typename ET, typename WT> class GraphBase {
  */
 template <typename VT, typename ET, typename WT>
 class GraphCOO : public GraphBase<VT, ET, WT> {
-public:
-  VT *src_indices{nullptr}; ///< rowInd
-  VT *dst_indices{nullptr}; ///< colInd
+ public:
+  VT *src_indices{nullptr};  ///< rowInd
+  VT *dst_indices{nullptr};  ///< colInd
 
   /**
    * @brief     Computes degree(in, out, in+out) of all the nodes of a Graph
@@ -117,11 +122,13 @@ class GraphCOO : public GraphBase<VT, ET, WT> {
    * @param  number_of_vertices    The number of vertices in the graph
    * @param  number_of_edges       The number of edges in the graph
    */
-  GraphCOO(VT *src_indices_, VT *dst_indices_, WT *edge_data_,
-           VT number_of_vertices_, ET number_of_edges_)
-      : GraphBase<VT, ET, WT>(edge_data_, number_of_vertices_,
-                              number_of_edges_),
-        src_indices(src_indices_), dst_indices(dst_indices_) {}
+  GraphCOO(
+    VT *src_indices_, VT *dst_indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_)
+    : GraphBase<VT, ET, WT>(edge_data_, number_of_vertices_, number_of_edges_),
+      src_indices(src_indices_),
+      dst_indices(dst_indices_)
+  {
+  }
 };
 
 /**
@@ -134,9 +141,9 @@ class GraphCOO : public GraphBase<VT, ET, WT> {
  */
 template <typename VT, typename ET, typename WT>
 class GraphCompressedSparseBase : public GraphBase<VT, ET, WT> {
-public:
-  ET *offsets{nullptr}; ///< CSR offsets
-  VT *indices{nullptr}; ///< CSR indices
+ public:
+  ET *offsets{nullptr};  ///< CSR offsets
+  VT *indices{nullptr};  ///< CSR indices
 
   /**
    * @brief      Fill the identifiers in the array with the source vertex
@@ -176,11 +183,13 @@ class GraphCompressedSparseBase : public GraphBase<VT, ET, WT> {
    * @param  number_of_vertices    The number of vertices in the graph
    * @param  number_of_edges       The number of edges in the graph
    */
-  GraphCompressedSparseBase(ET *offsets_, VT *indices_, WT *edge_data_,
-                            VT number_of_vertices_, ET number_of_edges_)
-      : GraphBase<VT, ET, WT>(edge_data_, number_of_vertices_,
-                              number_of_edges_),
-        offsets{offsets_}, indices{indices_} {}
+  GraphCompressedSparseBase(
+    ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_)
+    : GraphBase<VT, ET, WT>(edge_data_, number_of_vertices_, number_of_edges_),
+      offsets{offsets_},
+      indices{indices_}
+  {
+  }
 };
 
 /**
@@ -192,13 +201,11 @@ class GraphCompressedSparseBase : public GraphBase<VT, ET, WT> {
  */
 template <typename VT, typename ET, typename WT>
 class GraphCSR : public GraphCompressedSparseBase<VT, ET, WT> {
-public:
+ public:
   /**
    * @brief      Default constructor
    */
-  GraphCSR()
-      : GraphCompressedSparseBase<VT, ET, WT>(nullptr, nullptr, nullptr, 0, 0) {
-  }
+  GraphCSR() : GraphCompressedSparseBase<VT, ET, WT>(nullptr, nullptr, nullptr, 0, 0) {}
 
   /**
    * @brief      Wrap existing arrays representing adjacency lists in a Graph.
@@ -216,11 +223,11 @@ class GraphCSR : public GraphCompressedSparseBase<VT, ET, WT> {
    * @param  number_of_vertices    The number of vertices in the graph
    * @param  number_of_edges       The number of edges in the graph
    */
-  GraphCSR(ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_,
-           ET number_of_edges_)
-      : GraphCompressedSparseBase<VT, ET, WT>(offsets_, indices_, edge_data_,
-                                              number_of_vertices_,
-                                              number_of_edges_) {}
+  GraphCSR(ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_)
+    : GraphCompressedSparseBase<VT, ET, WT>(
+        offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_)
+  {
+  }
 };
 
 /**
@@ -232,13 +239,11 @@ class GraphCSR : public GraphCompressedSparseBase<VT, ET, WT> {
  */
 template <typename VT, typename ET, typename WT>
 class GraphCSC : public GraphCompressedSparseBase<VT, ET, WT> {
-public:
+ public:
   /**
    * @brief      Default constructor
    */
-  GraphCSC()
-      : GraphCompressedSparseBase<VT, ET, WT>(nullptr, nullptr, nullptr, 0, 0) {
-  }
+  GraphCSC() : GraphCompressedSparseBase<VT, ET, WT>(nullptr, nullptr, nullptr, 0, 0) {}
 
   /**
    * @brief      Wrap existing arrays representing transposed adjacency lists in
@@ -256,12 +261,12 @@ class GraphCSC : public GraphCompressedSparseBase<VT, ET, WT> {
    * @param  number_of_vertices    The number of vertices in the graph
    * @param  number_of_edges       The number of edges in the graph
    */
-  GraphCSC(ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_,
-           ET number_of_edges_)
-      : GraphCompressedSparseBase<VT, ET, WT>(offsets_, indices_, edge_data_,
-                                              number_of_vertices_,
-                                              number_of_edges_) {}
+  GraphCSC(ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_)
+    : GraphCompressedSparseBase<VT, ET, WT>(
+        offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_)
+  {
+  }
 };
 
-} // namespace experimental
-} // namespace cugraph
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp
index f02d207789b..f473c0a1939 100644
--- a/cpp/src/comms/mpi/comms_mpi.cpp
+++ b/cpp/src/comms/mpi/comms_mpi.cpp
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "utilities/error_utils.h"
 #include <comms_mpi.hpp>
 #include <iostream>
 #include <vector>
+#include "utilities/error_utils.h"
 
 namespace cugraph {
 namespace experimental {
@@ -31,31 +31,28 @@ struct nccl_error : public std::runtime_error {
   nccl_error(std::string const &message) : std::runtime_error(message) {}
 };
 
-inline void throw_nccl_error(ncclResult_t error, const char *file,
-                             unsigned int line) {
-  throw nccl_error(
-      std::string{"NCCL error encountered at: " + std::string{file} + ":" +
-                  std::to_string(line) + ": " + ncclGetErrorString(error)});
+inline void throw_nccl_error(ncclResult_t error, const char *file, unsigned int line)
+{
+  throw nccl_error(std::string{"NCCL error encountered at: " + std::string{file} + ":" +
+                               std::to_string(line) + ": " + ncclGetErrorString(error)});
 }
 
-#define NCCL_TRY(call)                                                         \
-  {                                                                            \
-    ncclResult_t nccl_status = (call);                                         \
-    if (nccl_status != ncclSuccess) {                                          \
-      throw_nccl_error(nccl_status, __FILE__, __LINE__);                       \
-    }                                                                          \
+#define NCCL_TRY(call)                                                                     \
+  {                                                                                        \
+    ncclResult_t nccl_status = (call);                                                     \
+    if (nccl_status != ncclSuccess) { throw_nccl_error(nccl_status, __FILE__, __LINE__); } \
   }
 // MPI errors are expected to be fatal before reaching this.
 // FIXME : improve when adding raft comms
-#define MPI_TRY(cmd)                                                           \
-  {                                                                            \
-    int e = cmd;                                                               \
-    if (e != MPI_SUCCESS) {                                                    \
-      CUGRAPH_FAIL("Failed: MPI error");                                       \
-    }                                                                          \
+#define MPI_TRY(cmd)                                             \
+  {                                                              \
+    int e = cmd;                                                 \
+    if (e != MPI_SUCCESS) { CUGRAPH_FAIL("Failed: MPI error"); } \
   }
 
-template <typename value_t> constexpr MPI_Datatype get_mpi_type() {
+template <typename value_t>
+constexpr MPI_Datatype get_mpi_type()
+{
   if (std::is_integral<value_t>::value) {
     if (std::is_signed<value_t>::value) {
       if (sizeof(value_t) == 1) {
@@ -91,7 +88,9 @@ template <typename value_t> constexpr MPI_Datatype get_mpi_type() {
   }
 }
 
-template <typename value_t> constexpr ncclDataType_t get_nccl_type() {
+template <typename value_t>
+constexpr ncclDataType_t get_nccl_type()
+{
   if (std::is_integral<value_t>::value) {
     if (std::is_signed<value_t>::value) {
       if (sizeof(value_t) == 1) {
@@ -123,7 +122,8 @@ template <typename value_t> constexpr ncclDataType_t get_nccl_type() {
   }
 }
 
-constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) {
+constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op)
+{
   if (reduce_op == ReduceOp::SUM) {
     return MPI_SUM;
   } else if (reduce_op == ReduceOp::MAX) {
@@ -135,7 +135,8 @@ constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) {
   }
 }
 
-constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) {
+constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op)
+{
   if (reduce_op == ReduceOp::SUM) {
     return ncclSum;
   } else if (reduce_op == ReduceOp::MAX) {
@@ -148,7 +149,8 @@ constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) {
 }
 #endif
 
-Comm::Comm(int p) : _p{p} {
+Comm::Comm(int p) : _p{p}
+{
 #if ENABLE_OPG
   // MPI
   int flag{}, mpi_world_size;
@@ -158,131 +160,120 @@ Comm::Comm(int p) : _p{p} {
   if (flag == false) {
     int provided{};
     MPI_TRY(MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &provided));
-    if (provided != MPI_THREAD_MULTIPLE) {
-      MPI_TRY(MPI_ERR_OTHER);
-    }
+    if (provided != MPI_THREAD_MULTIPLE) { MPI_TRY(MPI_ERR_OTHER); }
     _finalize_mpi = true;
   }
 
   MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &_rank));
   MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size));
-  CUGRAPH_EXPECTS(
-      (_p == mpi_world_size),
-      "Invalid input arguments: p should match the number of MPI processes.");
+  CUGRAPH_EXPECTS((_p == mpi_world_size),
+                  "Invalid input arguments: p should match the number of MPI processes.");
 
   _mpi_comm = MPI_COMM_WORLD;
 
   // CUDA
 
   CUDA_TRY(cudaGetDeviceCount(&_device_count));
-  _device_id =
-      _rank %
-      _device_count; // FIXME : assumes each node has the same number of GPUs
+  _device_id = _rank % _device_count;  // FIXME : assumes each node has the same number of GPUs
   CUDA_TRY(cudaSetDevice(_device_id));
 
-  CUDA_TRY(cudaDeviceGetAttribute(&_sm_count_per_device,
-                                  cudaDevAttrMultiProcessorCount, _device_id));
-  CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX,
-                                  _device_id));
-  CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX,
-                                  _device_id));
-  CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize,
-                                  _device_id));
-  CUDA_TRY(cudaDeviceGetAttribute(&_shared_memory_size_per_sm,
-                                  cudaDevAttrMaxSharedMemoryPerMultiprocessor,
-                                  _device_id));
+  CUDA_TRY(
+    cudaDeviceGetAttribute(&_sm_count_per_device, cudaDevAttrMultiProcessorCount, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(
+    &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id));
 
   // NCCL
 
   ncclUniqueId nccl_unique_id_p{};
-  if (get_rank() == 0) {
-    NCCL_TRY(ncclGetUniqueId(&nccl_unique_id_p));
-  }
-  MPI_TRY(MPI_Bcast(&nccl_unique_id_p, sizeof(ncclUniqueId), MPI_BYTE, 0,
-                    _mpi_comm));
-  NCCL_TRY(
-      ncclCommInitRank(&_nccl_comm, get_p(), nccl_unique_id_p, get_rank()));
+  if (get_rank() == 0) { NCCL_TRY(ncclGetUniqueId(&nccl_unique_id_p)); }
+  MPI_TRY(MPI_Bcast(&nccl_unique_id_p, sizeof(ncclUniqueId), MPI_BYTE, 0, _mpi_comm));
+  NCCL_TRY(ncclCommInitRank(&_nccl_comm, get_p(), nccl_unique_id_p, get_rank()));
   _finalize_nccl = true;
 #endif
 }
 
 #if ENABLE_OPG
-Comm::Comm(ncclComm_t comm, int size, int rank)
-    : _nccl_comm(comm), _p(size), _rank(rank) {
-
+Comm::Comm(ncclComm_t comm, int size, int rank) : _nccl_comm(comm), _p(size), _rank(rank)
+{
   // CUDA
   CUDA_TRY(cudaGetDeviceCount(&_device_count));
-  _device_id =
-      _rank %
-      _device_count; // FIXME : assumes each node has the same number of GPUs
-  CUDA_TRY(cudaSetDevice(_device_id)); // FIXME : check if this is needed or if
-                                       // python takes care of this
+  _device_id = _rank % _device_count;   // FIXME : assumes each node has the same number of GPUs
+  CUDA_TRY(cudaSetDevice(_device_id));  // FIXME : check if this is needed or if
+                                        // python takes care of this
 
-  CUDA_TRY(cudaDeviceGetAttribute(&_sm_count_per_device,
-                                  cudaDevAttrMultiProcessorCount, _device_id));
-  CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX,
-                                  _device_id));
-  CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX,
-                                  _device_id));
-  CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize,
-                                  _device_id));
-  CUDA_TRY(cudaDeviceGetAttribute(&_shared_memory_size_per_sm,
-                                  cudaDevAttrMaxSharedMemoryPerMultiprocessor,
-                                  _device_id));
+  CUDA_TRY(
+    cudaDeviceGetAttribute(&_sm_count_per_device, cudaDevAttrMultiProcessorCount, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, _device_id));
+  CUDA_TRY(cudaDeviceGetAttribute(
+    &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id));
 }
 #endif
 
-Comm::~Comm() {
+Comm::~Comm()
+{
 #if ENABLE_OPG
   // NCCL
-  if (_finalize_nccl)
-    ncclCommDestroy(_nccl_comm);
+  if (_finalize_nccl) ncclCommDestroy(_nccl_comm);
 
-  if (_finalize_mpi) {
-    MPI_Finalize();
-  }
+  if (_finalize_mpi) { MPI_Finalize(); }
 #endif
 }
 
-void Comm::barrier() {
+void Comm::barrier()
+{
 #if ENABLE_OPG
   MPI_Barrier(MPI_COMM_WORLD);
 #endif
 }
 
 template <typename value_t>
-void Comm::allgather(size_t size, value_t *sendbuff, value_t *recvbuff) const {
+void Comm::allgather(size_t size, value_t *sendbuff, value_t *recvbuff) const
+{
 #if ENABLE_OPG
-  NCCL_TRY(ncclAllGather((const void *)sendbuff, (void *)recvbuff, size,
-                         get_nccl_type<value_t>(), _nccl_comm,
+  NCCL_TRY(ncclAllGather((const void *)sendbuff,
+                         (void *)recvbuff,
+                         size,
+                         get_nccl_type<value_t>(),
+                         _nccl_comm,
                          cudaStreamDefault));
 #endif
 }
 
 template <typename value_t>
-void Comm::allreduce(size_t size, value_t *sendbuff, value_t *recvbuff,
-                     ReduceOp reduce_op) const {
+void Comm::allreduce(size_t size, value_t *sendbuff, value_t *recvbuff, ReduceOp reduce_op) const
+{
 #if ENABLE_OPG
-  NCCL_TRY(ncclAllReduce(
-      (const void *)sendbuff, (void *)recvbuff, size, get_nccl_type<value_t>(),
-      get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault));
+  NCCL_TRY(ncclAllReduce((const void *)sendbuff,
+                         (void *)recvbuff,
+                         size,
+                         get_nccl_type<value_t>(),
+                         get_nccl_reduce_op(reduce_op),
+                         _nccl_comm,
+                         cudaStreamDefault));
 #endif
 }
 
 // explicit
-template void Comm::allgather<int>(size_t size, int *sendbuff,
-                                   int *recvbuff) const;
-template void Comm::allgather<float>(size_t size, float *sendbuff,
-                                     float *recvbuff) const;
-template void Comm::allgather<double>(size_t size, double *sendbuff,
-                                      double *recvbuff) const;
-template void Comm::allreduce<int>(size_t size, int *sendbuff, int *recvbuff,
+template void Comm::allgather<int>(size_t size, int *sendbuff, int *recvbuff) const;
+template void Comm::allgather<float>(size_t size, float *sendbuff, float *recvbuff) const;
+template void Comm::allgather<double>(size_t size, double *sendbuff, double *recvbuff) const;
+template void Comm::allreduce<int>(size_t size,
+                                   int *sendbuff,
+                                   int *recvbuff,
                                    ReduceOp reduce_op) const;
-template void Comm::allreduce<float>(size_t size, float *sendbuff,
-                                     float *recvbuff, ReduceOp reduce_op) const;
-template void Comm::allreduce<double>(size_t size, double *sendbuff,
+template void Comm::allreduce<float>(size_t size,
+                                     float *sendbuff,
+                                     float *recvbuff,
+                                     ReduceOp reduce_op) const;
+template void Comm::allreduce<double>(size_t size,
+                                      double *sendbuff,
                                       double *recvbuff,
                                       ReduceOp reduce_op) const;
 
-} // namespace experimental
-} // namespace cugraph
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu
index fb3e8e23b89..510e58e9d0c 100644
--- a/cpp/src/structure/graph.cu
+++ b/cpp/src/structure/graph.cu
@@ -23,54 +23,57 @@
 namespace {
 
 template <typename vertex_t, typename edge_t>
-void degree_from_offsets(vertex_t number_of_vertices, edge_t const *offsets,
-                         edge_t *degree, cudaStream_t stream) {
-
+void degree_from_offsets(vertex_t number_of_vertices,
+                         edge_t const *offsets,
+                         edge_t *degree,
+                         cudaStream_t stream)
+{
   // Computes out-degree for x = 0 and x = 2
-  thrust::for_each(rmm::exec_policy(stream)->on(stream),
-                   thrust::make_counting_iterator<vertex_t>(0),
-                   thrust::make_counting_iterator<vertex_t>(number_of_vertices),
-                   [offsets, degree] __device__(vertex_t v) {
-                     degree[v] = offsets[v + 1] - offsets[v];
-                   });
+  thrust::for_each(
+    rmm::exec_policy(stream)->on(stream),
+    thrust::make_counting_iterator<vertex_t>(0),
+    thrust::make_counting_iterator<vertex_t>(number_of_vertices),
+    [offsets, degree] __device__(vertex_t v) { degree[v] = offsets[v + 1] - offsets[v]; });
 }
 
 template <typename vertex_t, typename edge_t>
 void degree_from_vertex_ids(const cugraph::experimental::Comm &comm,
-                            vertex_t number_of_vertices, edge_t number_of_edges,
-                            vertex_t const *indices, edge_t *degree,
-                            cudaStream_t stream) {
-
-  thrust::for_each(rmm::exec_policy(stream)->on(stream),
-                   thrust::make_counting_iterator<edge_t>(0),
-                   thrust::make_counting_iterator<edge_t>(number_of_edges),
-                   [indices, degree] __device__(edge_t e) {
-                     cugraph::atomicAdd(degree + indices[e], 1);
-                   });
-  comm.allreduce(number_of_vertices, degree, degree,
-                 cugraph::experimental::ReduceOp::SUM);
+                            vertex_t number_of_vertices,
+                            edge_t number_of_edges,
+                            vertex_t const *indices,
+                            edge_t *degree,
+                            cudaStream_t stream)
+{
+  thrust::for_each(
+    rmm::exec_policy(stream)->on(stream),
+    thrust::make_counting_iterator<edge_t>(0),
+    thrust::make_counting_iterator<edge_t>(number_of_edges),
+    [indices, degree] __device__(edge_t e) { cugraph::atomicAdd(degree + indices[e], 1); });
+  comm.allreduce(number_of_vertices, degree, degree, cugraph::experimental::ReduceOp::SUM);
 }
 
-} // namespace
+}  // namespace
 
 namespace cugraph {
 namespace experimental {
 
 template <typename VT, typename ET, typename WT>
-void GraphBase<VT, ET, WT>::get_vertex_identifiers(VT *identifiers) const {
+void GraphBase<VT, ET, WT>::get_vertex_identifiers(VT *identifiers) const
+{
   cugraph::detail::sequence<VT>(number_of_vertices, identifiers);
 }
 
 template <typename VT, typename ET, typename WT>
-void GraphCompressedSparseBase<VT, ET, WT>::get_source_indices(
-    VT *src_indices) const {
+void GraphCompressedSparseBase<VT, ET, WT>::get_source_indices(VT *src_indices) const
+{
   CUGRAPH_EXPECTS(offsets != nullptr, "No graph specified");
   cugraph::detail::offsets_to_indices<VT>(
-      offsets, GraphBase<VT, ET, WT>::number_of_vertices, src_indices);
+    offsets, GraphBase<VT, ET, WT>::number_of_vertices, src_indices);
 }
 
 template <typename VT, typename ET, typename WT>
-void GraphCOO<VT, ET, WT>::degree(ET *degree, DegreeDirection direction) const {
+void GraphCOO<VT, ET, WT>::degree(ET *degree, DegreeDirection direction) const
+{
   //
   // NOTE:  We assume offsets/indices are a CSR.  If a CSC is passed
   //        in then x should be modified to reflect the expected direction.
@@ -80,24 +83,30 @@ void GraphCOO<VT, ET, WT>::degree(ET *degree, DegreeDirection direction) const {
   cudaStream_t stream{nullptr};
 
   if (direction != DegreeDirection::IN) {
-    if (GraphBase<VT, ET, WT>::comm.get_p()) // FIXME retrieve global source
-                                             // indexing for the allreduce work
+    if (GraphBase<VT, ET, WT>::comm.get_p())  // FIXME retrieve global source
+                                              // indexing for the allreduce work
       CUGRAPH_FAIL("OPG degree not implemented for OUT degree");
-    degree_from_vertex_ids(
-        GraphBase<VT, ET, WT>::comm, GraphBase<VT, ET, WT>::number_of_vertices,
-        GraphBase<VT, ET, WT>::number_of_edges, src_indices, degree, stream);
+    degree_from_vertex_ids(GraphBase<VT, ET, WT>::comm,
+                           GraphBase<VT, ET, WT>::number_of_vertices,
+                           GraphBase<VT, ET, WT>::number_of_edges,
+                           src_indices,
+                           degree,
+                           stream);
   }
 
   if (direction != DegreeDirection::OUT) {
-    degree_from_vertex_ids(
-        GraphBase<VT, ET, WT>::comm, GraphBase<VT, ET, WT>::number_of_vertices,
-        GraphBase<VT, ET, WT>::number_of_edges, dst_indices, degree, stream);
+    degree_from_vertex_ids(GraphBase<VT, ET, WT>::comm,
+                           GraphBase<VT, ET, WT>::number_of_vertices,
+                           GraphBase<VT, ET, WT>::number_of_edges,
+                           dst_indices,
+                           degree,
+                           stream);
   }
 }
 
 template <typename VT, typename ET, typename WT>
-void GraphCompressedSparseBase<VT, ET, WT>::degree(
-    ET *degree, DegreeDirection direction) const {
+void GraphCompressedSparseBase<VT, ET, WT>::degree(ET *degree, DegreeDirection direction) const
+{
   //
   // NOTE:  We assume offsets/indices are a CSR.  If a CSC is passed
   //        in then x should be modified to reflect the expected direction.
@@ -108,18 +117,19 @@ void GraphCompressedSparseBase<VT, ET, WT>::degree(
 
   if (direction != DegreeDirection::IN) {
     if (GraphBase<VT, ET, WT>::comm.get_p())
-      CUGRAPH_FAIL(
-          "OPG degree not implemented for OUT degree"); // FIXME retrieve global
-                                                        // source indexing for
-                                                        // the allreduce to work
-    degree_from_offsets(GraphBase<VT, ET, WT>::number_of_vertices, offsets,
-                        degree, stream);
+      CUGRAPH_FAIL("OPG degree not implemented for OUT degree");  // FIXME retrieve global
+                                                                  // source indexing for
+                                                                  // the allreduce to work
+    degree_from_offsets(GraphBase<VT, ET, WT>::number_of_vertices, offsets, degree, stream);
   }
 
   if (direction != DegreeDirection::OUT) {
-    degree_from_vertex_ids(
-        GraphBase<VT, ET, WT>::comm, GraphBase<VT, ET, WT>::number_of_vertices,
-        GraphBase<VT, ET, WT>::number_of_edges, indices, degree, stream);
+    degree_from_vertex_ids(GraphBase<VT, ET, WT>::comm,
+                           GraphBase<VT, ET, WT>::number_of_vertices,
+                           GraphBase<VT, ET, WT>::number_of_edges,
+                           indices,
+                           degree,
+                           stream);
   }
 }
 
@@ -130,5 +140,5 @@ template class GraphCOO<int32_t, int32_t, float>;
 template class GraphCOO<int32_t, int32_t, double>;
 template class GraphCompressedSparseBase<int32_t, int32_t, float>;
 template class GraphCompressedSparseBase<int32_t, int32_t, double>;
-} // namespace experimental
-} // namespace cugraph
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu
index 619fcad7c62..7683874939c 100644
--- a/cpp/tests/nccl/degree_test.cu
+++ b/cpp/tests/nccl/degree_test.cu
@@ -14,37 +14,36 @@
  * limitations under the License.
  */
 
-#include "test_utils.h"
-#include "gtest/gtest.h"
-#include <comms_mpi.hpp>
 #include <cugraph.h>
-#include <graph.hpp>
 #include <string.h>
 #include <thrust/device_vector.h>
 #include <thrust/functional.h>
+#include <comms_mpi.hpp>
+#include <graph.hpp>
+#include "gtest/gtest.h"
+#include "test_utils.h"
 
 // ref Degree on the host
 template <typename idx_t>
-void ref_degree_h(std::vector<idx_t> &ind_h, std::vector<idx_t> &degree) {
-  for (size_t i = 0; i < degree.size(); i++)
-
-    degree[i] = 0;
-  for (size_t i = 0; i < ind_h.size(); i++)
-    degree[ind_h[i]] += 1;
+void ref_degree_h(std::vector<idx_t> &ind_h, std::vector<idx_t> &degree)
+{
+  for (size_t i = 0; i < degree.size(); i++) degree[i] = 0;
+  for (size_t i = 0; i < ind_h.size(); i++) degree[ind_h[i]] += 1;
 }
 
 // global to local offsets by shifting all offsets by the first offset value
-template <typename T> void shift_by_front(std::vector<T> &v) {
+template <typename T>
+void shift_by_front(std::vector<T> &v)
+{
   auto start = v.front();
-  for (auto i = size_t{0}; i < v.size(); ++i)
-    v[i] -= start;
+  for (auto i = size_t{0}; i < v.size(); ++i) v[i] -= start;
 }
 
 // 1D partitioning such as each GPU has about the same number of edges
 template <typename T>
-void opg_edge_partioning(int r, int p, std::vector<T> &ind_h,
-                         std::vector<size_t> &part_offset, size_t &e_loc) {
-
+void opg_edge_partioning(
+  int r, int p, std::vector<T> &ind_h, std::vector<size_t> &part_offset, size_t &e_loc)
+{
   // set first and last partition offsets
   part_offset[0] = 0;
   part_offset[p] = ind_h.size();
@@ -53,7 +52,7 @@ void opg_edge_partioning(int r, int p, std::vector<T> &ind_h,
   for (int i = 1; i < p; i++) {
     // get the first vertex ID of each partition
     auto start_nnz = i * loc_nnz;
-    auto start_v = 0;
+    auto start_v   = 0;
     for (auto j = size_t{0}; j < ind_h.size(); ++j) {
       if (j >= start_nnz) {
         start_v = j;
@@ -64,11 +63,12 @@ void opg_edge_partioning(int r, int p, std::vector<T> &ind_h,
   }
   e_loc = part_offset[r + 1] - part_offset[r];
 }
-TEST(degree, success) {
+TEST(degree, success)
+{
   int v = 6;
 
   // host
-  std::vector<int> src_h = {0, 0, 2, 2, 2, 3, 3, 4, 4, 5, 5},
+  std::vector<int> src_h  = {0, 0, 2, 2, 2, 3, 3, 4, 4, 5, 5},
                    dest_h = {1, 2, 0, 1, 4, 4, 5, 3, 5, 3, 1};
   std::vector<int> degree_h(v, 0.0), degree_ref(v, 0.0);
 
@@ -83,20 +83,17 @@ TEST(degree, success) {
   opg_edge_partioning(i, p, src_h, part_offset, e_loc);
 #ifdef OPG_VERBOSE
   sleep(i);
-  for (auto j = part_offset.begin(); j != part_offset.end(); ++j)
-    std::cout << *j << ' ';
+  for (auto j = part_offset.begin(); j != part_offset.end(); ++j) std::cout << *j << ' ';
   std::cout << std::endl;
   std::cout << "eloc: " << e_loc << std::endl;
 #endif
   std::vector<int> src_loc_h(src_h.begin() + part_offset[i],
                              src_h.begin() + part_offset[i] + e_loc),
-      dest_loc_h(dest_h.begin() + part_offset[i],
-                 dest_h.begin() + part_offset[i] + e_loc);
+    dest_loc_h(dest_h.begin() + part_offset[i], dest_h.begin() + part_offset[i] + e_loc);
   shift_by_front(src_loc_h);
 
   // print mg info
-  printf("#   Rank %2d - Pid %6d - device %2d\n", comm.get_rank(), getpid(),
-         comm.get_dev());
+  printf("#   Rank %2d - Pid %6d - device %2d\n", comm.get_rank(), getpid(), comm.get_dev());
 
   // local device
   thrust::device_vector<int> src_d(src_loc_h.begin(), src_loc_h.end());
@@ -104,23 +101,24 @@ TEST(degree, success) {
   thrust::device_vector<int> degree_d(v);
 
   // load local chunck to cugraph
-  cugraph::experimental::GraphCOO<int, int, float> G(
-      thrust::raw_pointer_cast(src_d.data()),
-      thrust::raw_pointer_cast(dest_d.data()), nullptr, degree_h.size(), e_loc);
+  cugraph::experimental::GraphCOO<int, int, float> G(thrust::raw_pointer_cast(src_d.data()),
+                                                     thrust::raw_pointer_cast(dest_d.data()),
+                                                     nullptr,
+                                                     degree_h.size(),
+                                                     e_loc);
   G.set_communicator(comm);
 
   // OUT degree
-  G.degree(thrust::raw_pointer_cast(degree_d.data()),
-           cugraph::experimental::DegreeDirection::IN);
+  G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::IN);
   thrust::copy(degree_d.begin(), degree_d.end(), degree_h.begin());
   ref_degree_h(dest_h, degree_ref);
   // sleep(i);
-  for (size_t j = 0; j < degree_h.size(); ++j)
-    EXPECT_EQ(degree_ref[j], degree_h[j]);
+  for (size_t j = 0; j < degree_h.size(); ++j) EXPECT_EQ(degree_ref[j], degree_h[j]);
   std::cout << "Rank " << i << " done checking." << std::endl;
 }
 
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
   testing::InitGoogleTest(&argc, argv);
   MPI_Init(&argc, &argv);
   rmmInitialize(nullptr);

From 43096887507ba16d9143207007579ee2b1b04339 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Tue, 5 May 2020 16:29:28 -0500
Subject: [PATCH 19/19] fix for header issue showing up on CI

---
 cpp/include/comms_mpi.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/comms_mpi.hpp b/cpp/include/comms_mpi.hpp
index c414a043efa..7a17bdfea4c 100644
--- a/cpp/include/comms_mpi.hpp
+++ b/cpp/include/comms_mpi.hpp
@@ -19,7 +19,7 @@
 #include <mpi.h>
 #include <nccl.h>
 #endif
-
+#include <cstddef>
 namespace cugraph {
 namespace experimental {