From 012465e4568ff762c05cd2e4082240f3d80d917a Mon Sep 17 00:00:00 2001 From: afender Date: Tue, 14 Apr 2020 17:19:11 -0500 Subject: [PATCH 01/19] Added NCCL_TRY macro for throwing throwing erros --- cpp/src/utilities/error_utils.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cpp/src/utilities/error_utils.h b/cpp/src/utilities/error_utils.h index 644c29b295a..f8342c680d7 100644 --- a/cpp/src/utilities/error_utils.h +++ b/cpp/src/utilities/error_utils.h @@ -50,6 +50,14 @@ struct logic_error : public std::logic_error { struct cuda_error : public std::runtime_error { cuda_error(std::string const& message) : std::runtime_error(message) {} }; +/**---------------------------------------------------------------------------* + * @brief Exception thrown when a NCCL error is encountered. + * + *---------------------------------------------------------------------------**/ +struct nccl_error : public std::runtime_error { + nccl_error(std::string const& message) : std::runtime_error(message) {} +}; + } // namespace cugraph #define STRINGIFY_DETAIL(x) #x @@ -126,6 +134,13 @@ inline void throw_cuda_error(cudaError_t error, const char* file, cudaGetErrorName(error) + " " + cudaGetErrorString(error)}); } +inline void throw_nccl_error(ncclResult_t error, const char* file, + unsigned int line) { + throw cugraph::nccl_error( + std::string{"NCCL error encountered at: " + std::string{file} + ":" + + std::to_string(line) + ": " + ncclGetErrorString(error)}); +} + inline void check_stream(cudaStream_t stream, const char* file, unsigned int line) { cudaError_t error{cudaSuccess}; @@ -208,3 +223,13 @@ inline void check_stream(cudaStream_t stream, const char* file, #define CHECK_GRAPH(graph) \ CUGRAPH_EXPECTS(graph != nullptr, "Invalid API parameter: graph is NULL"); \ CUGRAPH_EXPECTS(graph->adjList != nullptr || graph->edgeList != nullptr, "Invalid API parameter: graph is empty"); + +#define NCCL_TRY(cmd) { \ + ncclResult_t nccl_status = cmd; \ + if (nccl_status!= ncclSuccess) { \ + printf("NCCL failure %s:%d '%s'\n", \ + __FILE__,__LINE__,ncclGetErrorString(nccl_status)); \ + FAIL(); \ + } \ + } +} \ No newline at end of file From 8aa34bcbffeb24b4f99e2b9bddc8d326c6b7559d Mon Sep 17 00:00:00 2001 From: afender Date: Wed, 15 Apr 2020 17:11:38 -0500 Subject: [PATCH 02/19] wip comm --- cpp/src/comms/mpi/comms_mpi.hpp | 236 ++++++++++++++++++++++++++++++++ cpp/src/structure/graph.cu | 15 +- 2 files changed, 246 insertions(+), 5 deletions(-) create mode 100644 cpp/src/comms/mpi/comms_mpi.hpp diff --git a/cpp/src/comms/mpi/comms_mpi.hpp b/cpp/src/comms/mpi/comms_mpi.hpp new file mode 100644 index 00000000000..983fd480ad7 --- /dev/null +++ b/cpp/src/comms/mpi/comms_mpi.hpp @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// snmg utils +// Author: Alex Fender afender@nvidia.com + +#pragma once +#include +#include +#include +#include +#include +#include "mem_utils.h" +#include "basic_kernels.cuh" + +#define USE_NCCL 1 + +namespace cugraph { +namespace opg { + +template +constexpr MPI_Datatype get_mpi_type() { + if (std::is_integral::value) { + if (std::is_signed::value) { + if (sizeof(value_t) == 1) { + return MPI_INT8_T; + } + else if (sizeof(value_t) == 2) { + return MPI_INT16_T; + } + else if (sizeof(value_t) == 4) { + return MPI_INT32_T; + } + else if (sizeof(value_t) == 8) { + return MPI_INT64_T; + } + else { + CUGRAPH_FAIL("unsupported type"); + } + } + else { + if (sizeof(value_t) == 1) { + return MPI_UINT8_T; + } + else if (sizeof(value_t) == 2) { + return MPI_UINT16_T; + } + else if (sizeof(value_t) == 4) { + return MPI_UINT32_T; + } + else if (sizeof(value_t) == 8) { + return MPI_UINT64_T; + } + else { + CUGRAPH_FAIL("unsupported type"); + } + } + } + else if(std::is_same::value) { + return MPI_FLOAT; + } + else if(std::is_same::value) { + return MPI_DOUBLE; + } + else { + CUGRAPH_FAIL("unsupported type"); + } +} +#if USE_NCCL +template +constexpr ncclDataType_t get_nccl_type() { + if (std::is_integral::value) { + if (std::is_signed::value) { + if (sizeof(value_t) == 1) { + return ncclInt8; + } + else if (sizeof(value_t) == 4) { + return ncclInt32; + } + else if (sizeof(value_t) == 8) { + return ncclInt64; + } + else { + CUGRAPH_FAIL("unsupported type"); + } + } + else { + if (sizeof(value_t) == 1) { + return ncclUint8; + } + else if (sizeof(value_t) == 4) { + return ncclUint32; + } + else if (sizeof(value_t) == 8) { + return ncclUint64; + } + else { + CUGRAPH_FAIL("unsupported type"); + } + } + } + else if(std::is_same::value) { + return ncclFloat32; + } + else if(std::is_same::value) { + return ncclFloat64; + } + else { + CUGRAPH_FAIL("unsupported type"); + } +} +#endif +enum class ReduceOp { SUM, MAX, MIN }; + +constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) { + if (reduce_op == ReduceOp::SUM) { + return MPI_SUM; + } + else if (reduce_op == ReduceOp::MAX) { + return MPI_MAX; + } + else if (reduce_op == ReduceOp::MIN) { + return MPI_MIN; + } + else { + CUGRAPH_FAIL("unsupported type"); + } +} + +#if USE_NCCL +constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) { + if (reduce_op == ReduceOp::SUM) { + return ncclSum; + } + else if (reduce_op == ReduceOp::MAX) { + return ncclMax; + } + else if (reduce_op == ReduceOp::MIN) { + return ncclMin; + } + else { + CUGRAPH_FAIL("unsupported type"); + } +} +#endif + +// basic info about the snmg env setup +class Comm +{ + private: + int _p_x{0}; + int _p_y{0}; + + int _mpi_world_rank{0}; + int _mpi_world_size{0}; + bool _finalize_mpi{false}; + + int _device_id{0}; + int _device_count{0}; + + std::vector _p_ipc_mems{}; + std::vector _local_ipc_mem_offsets{}; + + int _sm_count_per_device{0}; + int _max_grid_dim_1D{0}; + int _max_block_dim_1D{0}; + int _l2_cache_size{0}; + int _shared_memory_size_per_sm{0}; + int _cuda_stream_least_priority{0}; + int _cuda_stream_greatest_priority{0}; + + MPI_Comm _mpi_comm_p_x{}; + MPI_Comm _mpi_comm_p_y{}; + MPI_Comm _mpi_comm_p{}; + + cudaStream_t _default_stream{}; + std::vector _extra_streams{}; + + ncclComm_t _nccl_comm{}; + + public: + Comm(); + ~Comm(); + int get_rank() const { return _mpi_world_rank; } + int get_p() const { return _mpi_world_size; } + int get_dev() const { return _device_id; } + int get_dev_count() const { return _device_count; } + int get_sm_count() const { return _sm_count_per_device; } + bool is_master() const return { return (_mpi_world_rank == 0)? true : false; } + void init(); + + template + void allgather (size_t size, val_t* sendbuff, val_t* recvbuff); + + template + void allreduce (size_t size, val_t* sendbuff, val_t* recvbuff, ReduceOp reduce_op); + +}; + +// Wait for all host threads +void sync_all() { + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); +} + +template +void Comm::allgather (size_t size, val_t* sendbuff, val_t* recvbuff) { +#if USE_NCCL + if(typeid(val_t) == typeid(float)) + NCCL_TRY(ncclAllGather((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type(), _nccl_comm, cudaStreamDefault)); + else + CUGRAPH_FAIL("allgather needs floats"); +#endif +} + +template +void Comm::allreduce (size_t size, val_t* sendbuff, val_t* recvbuff, ReduceOp reduce_op) { +#if USE_NCCL + NCCL_TRY(ncclAllReduce(const void*)sendbuff, (void*)recvbuff, size, get_nccl_type(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault));); +#endif +} + +} } //namespace diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu index 883b35041c4..a8d7082f0ca 100644 --- a/cpp/src/structure/graph.cu +++ b/cpp/src/structure/graph.cu @@ -32,10 +32,12 @@ void degree_from_offsets(vertex_t number_of_vertices, } template -void degree_from_vertex_ids(edge_t number_of_edges, +void degree_from_vertex_ids(vertex_t number_of_vertices, + edge_t number_of_edges, vertex_t const *indices, edge_t *degree, - cudaStream_t stream) { + cudaStream_t stream, + cugraph::Comm env = 0) { thrust::for_each(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), @@ -43,6 +45,9 @@ void degree_from_vertex_ids(edge_t number_of_edges, [indices, degree] __device__ (edge_t e) { cugraph::atomicAdd(degree + indices[e], 1); }); + comm.allreduce(cugraph::Communicator::P_X, cugraph::Target::DEVICE, + degree, degree, d_out_degrees.size(), + cugraph::ReduceOp::SUM, env.get_default_cuda_stream()); } } //namespace anonymous @@ -72,11 +77,11 @@ void GraphCOO::degree(ET *degree, DegreeDirection direction) const { cudaStream_t stream{nullptr}; if (direction != DegreeDirection::IN) { - degree_from_vertex_ids(GraphBase::number_of_edges, src_indices, degree, stream); + degree_from_vertex_ids(GraphBase::number_of_vertices, GraphBase::number_of_edges, src_indices, degree, stream); } if (direction != DegreeDirection::OUT) { - degree_from_vertex_ids(GraphBase::number_of_edges, dst_indices, degree, stream); + degree_from_vertex_ids(GraphBase::number_of_vertices, GraphBase::number_of_edges, dst_indices, degree, stream); } } @@ -95,7 +100,7 @@ void GraphCompressedSparseBase::degree(ET *degree, DegreeDirection dir } if (direction != DegreeDirection::OUT) { - degree_from_vertex_ids(GraphBase::number_of_edges, indices, degree, stream); + degree_from_vertex_ids(GraphBase::number_of_vertices, GraphBase::number_of_edges, indices, degree, stream); } } From b23b5e5f2377933fa60e216ccbca1c0f0bee64bc Mon Sep 17 00:00:00 2001 From: afender Date: Thu, 23 Apr 2020 18:24:10 -0500 Subject: [PATCH 03/19] checkpoint --- cpp/CMakeLists.txt | 1 + cpp/include/graph.hpp | 7 +- cpp/src/comms/mpi/comms_mpi.cpp | 98 ++++++++++++++++++++++++ cpp/src/comms/mpi/comms_mpi.hpp | 128 +++++++++++++++++++------------- cpp/src/structure/graph.cu | 19 ++--- cpp/src/utilities/error_utils.h | 24 ------ 6 files changed, 192 insertions(+), 85 deletions(-) create mode 100644 cpp/src/comms/mpi/comms_mpi.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6c81f03d387..db1dda9cfcf 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -330,6 +330,7 @@ link_directories( "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}") add_library(cugraph SHARED + src/comms/mpi/comms_mpi.cpp src/ktruss/ktruss.cu src/db/db_object.cu src/db/db_parser_integration_test.cu diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp index 8b7a163239e..3838fe3dc92 100644 --- a/cpp/include/graph.hpp +++ b/cpp/include/graph.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ #pragma once - +#include "comms/mpi/comms_mpi.hpp" namespace cugraph { namespace experimental { @@ -47,8 +47,8 @@ enum class DegreeDirection { template class GraphBase { public: + Comm comm; WT *edge_data; ///< edge weight - GraphProperties prop; VT number_of_vertices; @@ -61,8 +61,11 @@ class GraphBase { */ void get_vertex_identifiers(VT *identifiers) const; + void setCommunicator(Comm& comm_) {comm = comm_;} + GraphBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_): edge_data(edge_data_), + comm(0), prop(), number_of_vertices(number_of_vertices_), number_of_edges(number_of_edges_) diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp new file mode 100644 index 00000000000..1f561cb0ea7 --- /dev/null +++ b/cpp/src/comms/mpi/comms_mpi.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "comms/mpi/comms_mpi.hpp" + +#include +#include + +namespace cugraph { +namespace experimental { + +Comm::Comm(int p) : _p{p} { +#if USE_NCCL + // MPI + int flag{}; + + MPI_TRY(MPI_Initialized(&flag)); + + if (flag == false) { + int provided{}; + MPI_TRY(MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &provided)); + if (provided != MPI_THREAD_MULTIPLE) { + MPI_TRY(MPI_ERR_OTHER); + } + _finalize_mpi = true; + } + + MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &_mpi_world_rank)); + MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &_mpi_world_size)); + CUGRAPH_EXPECTS( + _p == _mpi_world_size, + "Invalid input arguments: p should match the number of MPI processes."); + + _mpi_comm = MPI_COMM_WORLD; + + // CUDA + + CUDA_TRY(cudaGetDeviceCount(&_device_count)); + _device_id = _mpi_world_rank % _device_count; + CUDA_TRY(cudaSetDevice(_device_id)); + + CUDA_TRY( + cudaDeviceGetAttribute(&_sm_count_per_device, cudaDevAttrMultiProcessorCount, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, _device_id)); + CUDA_TRY( + cudaDeviceGetAttribute( + &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id)); + int supported{0}; + CUDA_TRY(cudaDeviceGetAttribute(&supported, cudaDevAttrStreamPrioritiesSupported, _device_id)); + CUDA_TRY(cudaDeviceGetStreamPriorityRange(&_cuda_stream_least_priority, &_cuda_stream_greatest_priority)); + + CUDA_TRY(cudaStreamCreate(&_default_stream)); + + // NCCL + + ncclUniqueId nccl_unique_id_p{}; + if (get_rank() == 0) { + NCCL_TRY(ncclGetUniqueId(&nccl_unique_id_p)); + } + MPI_TRY(MPI_Bcast(&nccl_unique_id_p, sizeof(ncclUniqueId), MPI_BYTE, 0, _mpi_comm)); + + NCCL_TRY(ncclCommInitRank(&_nccl_comm, get_p(), nccl_unique_id_p, get_rank())); +#endif + +} + +Comm::~Comm() { +#if USE_NCCL + // NCCL + ncclCommDestroy(_nccl_comm); + + // CUDA + for (auto& stream : _extra_streams) { + cudaStreamDestroy(stream); + } + cudaStreamDestroy(_default_stream); + + if (_finalize_mpi) { + MPI_Finalize(); + } +#endif +} +} }//namespace diff --git a/cpp/src/comms/mpi/comms_mpi.hpp b/cpp/src/comms/mpi/comms_mpi.hpp index 983fd480ad7..1e80c2285f8 100644 --- a/cpp/src/comms/mpi/comms_mpi.hpp +++ b/cpp/src/comms/mpi/comms_mpi.hpp @@ -14,22 +14,55 @@ * limitations under the License. */ -// snmg utils -// Author: Alex Fender afender@nvidia.com - + #pragma once + +#define USE_NCCL 1 + +#if USE_NCCL #include #include +#endif + #include #include #include -#include "mem_utils.h" -#include "basic_kernels.cuh" - -#define USE_NCCL 1 +#include "utilities/error_utils.h" namespace cugraph { -namespace opg { +namespace experimental { + +/**---------------------------------------------------------------------------* + * @brief Exception thrown when a NCCL error is encountered. + * + *---------------------------------------------------------------------------**/ +struct nccl_error : public std::runtime_error { + nccl_error(std::string const& message) : std::runtime_error(message) {} +}; + +inline void throw_nccl_error(ncclResult_t error, const char* file, + unsigned int line) { + throw nccl_error( + std::string{"NCCL error encountered at: " + std::string{file} + ":" + + std::to_string(line) + ": " + ncclGetErrorString(error)}); +} + +#if USE_NCCL +#define NCCL_TRY(call) { \ + ncclResult_t nccl_status = (call); \ + if (nccl_status!= ncclSuccess) { \ + throw_nccl_error(nccl_status, __FILE__, __LINE__); \ + } \ +} + +// MPI errors are expected to be fatal before reaching this. +// Fix me : improve when adding raft comms +#define MPI_TRY(cmd) { \ + int e = cmd; \ + if ( e != MPI_SUCCESS ) { \ + CUGRAPH_FAIL("Failed: MPI error"); \ + } \ +} template constexpr MPI_Datatype get_mpi_type() { @@ -79,7 +112,7 @@ constexpr MPI_Datatype get_mpi_type() { CUGRAPH_FAIL("unsupported type"); } } -#if USE_NCCL + template constexpr ncclDataType_t get_nccl_type() { if (std::is_integral::value) { @@ -122,7 +155,7 @@ constexpr ncclDataType_t get_nccl_type() { CUGRAPH_FAIL("unsupported type"); } } -#endif + enum class ReduceOp { SUM, MAX, MIN }; constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) { @@ -140,7 +173,6 @@ constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) { } } -#if USE_NCCL constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) { if (reduce_op == ReduceOp::SUM) { return ncclSum; @@ -161,75 +193,71 @@ constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) { class Comm { private: - int _p_x{0}; - int _p_y{0}; - - int _mpi_world_rank{0}; - int _mpi_world_size{0}; - bool _finalize_mpi{false}; + int _p{0}; - int _device_id{0}; - int _device_count{0}; + int _mpi_world_rank{0}; + int _mpi_world_size{0}; + bool _finalize_mpi{false}; - std::vector _p_ipc_mems{}; - std::vector _local_ipc_mem_offsets{}; + int _device_id{0}; + int _device_count{0}; - int _sm_count_per_device{0}; - int _max_grid_dim_1D{0}; - int _max_block_dim_1D{0}; - int _l2_cache_size{0}; - int _shared_memory_size_per_sm{0}; - int _cuda_stream_least_priority{0}; - int _cuda_stream_greatest_priority{0}; + std::vector _p_ipc_mems{}; + std::vector _local_ipc_mem_offsets{}; - MPI_Comm _mpi_comm_p_x{}; - MPI_Comm _mpi_comm_p_y{}; - MPI_Comm _mpi_comm_p{}; + int _sm_count_per_device{0}; + int _max_grid_dim_1D{0}; + int _max_block_dim_1D{0}; + int _l2_cache_size{0}; + int _shared_memory_size_per_sm{0}; + int _cuda_stream_least_priority{0}; + int _cuda_stream_greatest_priority{0}; - cudaStream_t _default_stream{}; - std::vector _extra_streams{}; + cudaStream_t _default_stream{}; + std::vector _extra_streams{}; - ncclComm_t _nccl_comm{}; - +#if USE_NCCL + MPI_Comm _mpi_comm{}; + ncclComm_t _nccl_comm{}; + #endif + public: - Comm(); + Comm(int p); ~Comm(); int get_rank() const { return _mpi_world_rank; } int get_p() const { return _mpi_world_size; } int get_dev() const { return _device_id; } int get_dev_count() const { return _device_count; } int get_sm_count() const { return _sm_count_per_device; } - bool is_master() const return { return (_mpi_world_rank == 0)? true : false; } - void init(); + bool is_master() const { return (_mpi_world_rank == 0)? true : false; } - template - void allgather (size_t size, val_t* sendbuff, val_t* recvbuff); + template + void allgather (size_t size, value_t* sendbuff, value_t* recvbuff); - template - void allreduce (size_t size, val_t* sendbuff, val_t* recvbuff, ReduceOp reduce_op); + template + void allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op); }; // Wait for all host threads void sync_all() { cudaDeviceSynchronize(); +#if USE_NCCL MPI_Barrier(MPI_COMM_WORLD); +#endif } -template -void Comm::allgather (size_t size, val_t* sendbuff, val_t* recvbuff) { +template +void Comm::allgather (size_t size, value_t* sendbuff, value_t* recvbuff) { #if USE_NCCL - if(typeid(val_t) == typeid(float)) NCCL_TRY(ncclAllGather((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type(), _nccl_comm, cudaStreamDefault)); - else - CUGRAPH_FAIL("allgather needs floats"); #endif } -template -void Comm::allreduce (size_t size, val_t* sendbuff, val_t* recvbuff, ReduceOp reduce_op) { +template +void Comm::allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) { #if USE_NCCL - NCCL_TRY(ncclAllReduce(const void*)sendbuff, (void*)recvbuff, size, get_nccl_type(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault));); + NCCL_TRY(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault)); #endif } diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu index a8d7082f0ca..2a27faa6236 100644 --- a/cpp/src/structure/graph.cu +++ b/cpp/src/structure/graph.cu @@ -14,6 +14,8 @@ #include "utilities/error_utils.h" #include "utilities/cuda_utils.cuh" + + namespace { template @@ -32,12 +34,12 @@ void degree_from_offsets(vertex_t number_of_vertices, } template -void degree_from_vertex_ids(vertex_t number_of_vertices, +void degree_from_vertex_ids(cugraph::experimental::Comm& comm, + vertex_t number_of_vertices, edge_t number_of_edges, vertex_t const *indices, edge_t *degree, - cudaStream_t stream, - cugraph::Comm env = 0) { + cudaStream_t stream) { thrust::for_each(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), @@ -45,9 +47,7 @@ void degree_from_vertex_ids(vertex_t number_of_vertices, [indices, degree] __device__ (edge_t e) { cugraph::atomicAdd(degree + indices[e], 1); }); - comm.allreduce(cugraph::Communicator::P_X, cugraph::Target::DEVICE, - degree, degree, d_out_degrees.size(), - cugraph::ReduceOp::SUM, env.get_default_cuda_stream()); + comm.allreduce(degree, degree, number_of_vertices, cugraph::ReduceOp::SUM); } } //namespace anonymous @@ -55,6 +55,7 @@ void degree_from_vertex_ids(vertex_t number_of_vertices, namespace cugraph { namespace experimental { + template void GraphBase::get_vertex_identifiers(VT *identifiers) const { cugraph::detail::sequence(number_of_vertices, identifiers); @@ -77,11 +78,11 @@ void GraphCOO::degree(ET *degree, DegreeDirection direction) const { cudaStream_t stream{nullptr}; if (direction != DegreeDirection::IN) { - degree_from_vertex_ids(GraphBase::number_of_vertices, GraphBase::number_of_edges, src_indices, degree, stream); + degree_from_vertex_ids(GraphBase::comm, GraphBase::number_of_vertices, GraphBase::number_of_edges, src_indices, degree, stream); } if (direction != DegreeDirection::OUT) { - degree_from_vertex_ids(GraphBase::number_of_vertices, GraphBase::number_of_edges, dst_indices, degree, stream); + degree_from_vertex_ids(GraphBase::comm, GraphBase::number_of_vertices, GraphBase::number_of_edges, dst_indices, degree, stream); } } @@ -100,7 +101,7 @@ void GraphCompressedSparseBase::degree(ET *degree, DegreeDirection dir } if (direction != DegreeDirection::OUT) { - degree_from_vertex_ids(GraphBase::number_of_vertices, GraphBase::number_of_edges, indices, degree, stream); + degree_from_vertex_ids(GraphBase::comm, GraphBase::number_of_vertices, GraphBase::number_of_edges, indices, degree, stream); } } diff --git a/cpp/src/utilities/error_utils.h b/cpp/src/utilities/error_utils.h index f8342c680d7..f18716a3a34 100644 --- a/cpp/src/utilities/error_utils.h +++ b/cpp/src/utilities/error_utils.h @@ -50,14 +50,6 @@ struct logic_error : public std::logic_error { struct cuda_error : public std::runtime_error { cuda_error(std::string const& message) : std::runtime_error(message) {} }; -/**---------------------------------------------------------------------------* - * @brief Exception thrown when a NCCL error is encountered. - * - *---------------------------------------------------------------------------**/ -struct nccl_error : public std::runtime_error { - nccl_error(std::string const& message) : std::runtime_error(message) {} -}; - } // namespace cugraph #define STRINGIFY_DETAIL(x) #x @@ -134,13 +126,6 @@ inline void throw_cuda_error(cudaError_t error, const char* file, cudaGetErrorName(error) + " " + cudaGetErrorString(error)}); } -inline void throw_nccl_error(ncclResult_t error, const char* file, - unsigned int line) { - throw cugraph::nccl_error( - std::string{"NCCL error encountered at: " + std::string{file} + ":" + - std::to_string(line) + ": " + ncclGetErrorString(error)}); -} - inline void check_stream(cudaStream_t stream, const char* file, unsigned int line) { cudaError_t error{cudaSuccess}; @@ -224,12 +209,3 @@ inline void check_stream(cudaStream_t stream, const char* file, CUGRAPH_EXPECTS(graph != nullptr, "Invalid API parameter: graph is NULL"); \ CUGRAPH_EXPECTS(graph->adjList != nullptr || graph->edgeList != nullptr, "Invalid API parameter: graph is empty"); -#define NCCL_TRY(cmd) { \ - ncclResult_t nccl_status = cmd; \ - if (nccl_status!= ncclSuccess) { \ - printf("NCCL failure %s:%d '%s'\n", \ - __FILE__,__LINE__,ncclGetErrorString(nccl_status)); \ - FAIL(); \ - } \ - } -} \ No newline at end of file From a89328ec582225d830a6e739d016f2556e4357ad Mon Sep 17 00:00:00 2001 From: afender Date: Fri, 24 Apr 2020 17:15:19 -0500 Subject: [PATCH 04/19] builds --- cpp/src/comms/mpi/comms_mpi.cpp | 7 +++++++ cpp/src/comms/mpi/comms_mpi.hpp | 18 ++++++------------ cpp/src/structure/graph.cu | 4 ++-- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp index 1f561cb0ea7..67ed76d36bc 100644 --- a/cpp/src/comms/mpi/comms_mpi.cpp +++ b/cpp/src/comms/mpi/comms_mpi.cpp @@ -95,4 +95,11 @@ Comm::~Comm() { } #endif } + +void Comm::barrier() { + cudaDeviceSynchronize(); +#if USE_NCCL + MPI_Barrier(MPI_COMM_WORLD); +#endif +} } }//namespace diff --git a/cpp/src/comms/mpi/comms_mpi.hpp b/cpp/src/comms/mpi/comms_mpi.hpp index 1e80c2285f8..22afc234b8d 100644 --- a/cpp/src/comms/mpi/comms_mpi.hpp +++ b/cpp/src/comms/mpi/comms_mpi.hpp @@ -231,31 +231,25 @@ class Comm int get_sm_count() const { return _sm_count_per_device; } bool is_master() const { return (_mpi_world_rank == 0)? true : false; } + void barrier(); + template - void allgather (size_t size, value_t* sendbuff, value_t* recvbuff); + void allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const; template - void allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op); + void allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const; }; -// Wait for all host threads -void sync_all() { - cudaDeviceSynchronize(); -#if USE_NCCL - MPI_Barrier(MPI_COMM_WORLD); -#endif -} - template -void Comm::allgather (size_t size, value_t* sendbuff, value_t* recvbuff) { +void Comm::allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const { #if USE_NCCL NCCL_TRY(ncclAllGather((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type(), _nccl_comm, cudaStreamDefault)); #endif } template -void Comm::allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) { +void Comm::allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const { #if USE_NCCL NCCL_TRY(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault)); #endif diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu index 2a27faa6236..391c6538be2 100644 --- a/cpp/src/structure/graph.cu +++ b/cpp/src/structure/graph.cu @@ -34,7 +34,7 @@ void degree_from_offsets(vertex_t number_of_vertices, } template -void degree_from_vertex_ids(cugraph::experimental::Comm& comm, +void degree_from_vertex_ids(const cugraph::experimental::Comm& comm, vertex_t number_of_vertices, edge_t number_of_edges, vertex_t const *indices, @@ -47,7 +47,7 @@ void degree_from_vertex_ids(cugraph::experimental::Comm& comm, [indices, degree] __device__ (edge_t e) { cugraph::atomicAdd(degree + indices[e], 1); }); - comm.allreduce(degree, degree, number_of_vertices, cugraph::ReduceOp::SUM); + comm.allreduce(number_of_vertices, degree, degree, cugraph::experimental::ReduceOp::SUM); } } //namespace anonymous From f5bc95969fb95830e1be5c8e91710e1a53d4c332 Mon Sep 17 00:00:00 2001 From: afender Date: Fri, 24 Apr 2020 18:15:59 -0500 Subject: [PATCH 05/19] test checkpoint --- cpp/include/graph.hpp | 2 +- cpp/tests/CMakeLists.txt | 5 +++ cpp/tests/nccl/degree_test.cu | 74 +++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 cpp/tests/nccl/degree_test.cu diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp index 3838fe3dc92..0cb70093b01 100644 --- a/cpp/include/graph.hpp +++ b/cpp/include/graph.hpp @@ -61,7 +61,7 @@ class GraphBase { */ void get_vertex_identifiers(VT *identifiers) const; - void setCommunicator(Comm& comm_) {comm = comm_;} + void set_communicator(Comm& comm_) {comm = comm_;} GraphBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_): edge_data(edge_data_), diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8c850924730..5b37fea735f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -285,6 +285,11 @@ if (BUILD_MPI) "${CMAKE_CURRENT_SOURCE_DIR}/nccl/nccl_test.cu") ConfigureTest(NCCL_TEST "${NCCL_TEST_SRC}" "") + + set(NCCL_DEGREE_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/nccl/degree_test.cu") + + ConfigureTest(NCCL_DEGREE_TEST "${NCCL_DEGREE_TEST_SRC}" "") endif(BUILD_MPI) ################################################################################################### diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu new file mode 100644 index 00000000000..2397a487d18 --- /dev/null +++ b/cpp/tests/nccl/degree_test.cu @@ -0,0 +1,74 @@ +#include "gtest/gtest.h" +#include +#include "test_utils.h" +#include +#include +#include +#include +#include +#include +#include "comms/mpi/comms_mpi.hpp" + +// ref Degree on the host +template +void ref_degree_h(std::vector & ind_h, + std::vector & degree) { + for (size_t i = 0; i < degree.size(); i++) + degree[i] = 0; + for (size_t i = 0; i < ind_h.size(); i++) + degree[ind_h[i]] += 1; +} + +TEST(degree, success) +{ + int v = 6; + + //host + std::vector src_h= {0, 0, 2, 2, 2, 3, 3, 4, 4, 5}, + dest_h={1, 2, 0, 1, 4, 4, 5, 3, 5, 3}; + std::vector degree_h(v, 0.0), degree_ref(v, 0.0); + + //device + thrust::device_vector src_d(src_h.begin(), src_h.begin()+src_h.size()); + thrust::device_vector dest_d(dest_h.begin(), dest_h.begin()+dest_h.size()); + thrust::device_vector degree_d(v); + + //MG + int p; + MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &p)); + cugraph::experimental::Comm comm(p); + + // print mg info + printf("# Rank %2d - Pid %6d - device %2d\n", comm.get_rank(), getpid(), comm.get_dev()); + + // load cugraph (fix me : split per process) + cugraph::experimental::GraphCOO G(thrust::raw_pointer_cast(src_d.data()), + thrust::raw_pointer_cast(dest_d.data()), + nullptr, degree_h.size(), dest_h.size()); + G.set_communicator(comm); + + // IN degree + G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::IN); + thrust::copy(degree_d.begin(), degree_d.end(), degree_h.begin()); + ref_degree_h(dest_h, degree_ref); + for (size_t j = 0; j < degree_h.size(); ++j) + EXPECT_EQ(degree_ref[j], degree_h[j]); + + // OUT degree + G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::OUT); + thrust::copy(degree_d.begin(), degree_d.end(), degree_h.begin()); + ref_degree_h(src_h, degree_ref); + for (size_t j = 0; j < degree_h.size(); ++j) + EXPECT_EQ(degree_ref[j], degree_h[j]); +} + +int main( int argc, char** argv ) +{ + testing::InitGoogleTest(&argc,argv); + MPI_Init(&argc, &argv); + rmmInitialize(nullptr); + int rc = RUN_ALL_TESTS(); + rmmFinalize(); + MPI_Finalize(); + return rc; +} \ No newline at end of file From dd29ec99fd4aee7f2da5e8176c8e228ff7974f06 Mon Sep 17 00:00:00 2001 From: afender Date: Mon, 27 Apr 2020 19:15:19 -0500 Subject: [PATCH 06/19] checkpoint np 1 passes --- cpp/include/graph.hpp | 2 +- cpp/src/comms/mpi/comms_mpi.cpp | 21 +++++---------------- cpp/src/comms/mpi/comms_mpi.hpp | 8 +++----- cpp/tests/nccl/degree_test.cu | 4 ++++ 4 files changed, 13 insertions(+), 22 deletions(-) diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp index 0cb70093b01..ee8d6e95fc0 100644 --- a/cpp/include/graph.hpp +++ b/cpp/include/graph.hpp @@ -65,7 +65,7 @@ class GraphBase { GraphBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_): edge_data(edge_data_), - comm(0), + comm(), prop(), number_of_vertices(number_of_vertices_), number_of_edges(number_of_edges_) diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp index 67ed76d36bc..167594a783c 100644 --- a/cpp/src/comms/mpi/comms_mpi.cpp +++ b/cpp/src/comms/mpi/comms_mpi.cpp @@ -40,9 +40,8 @@ Comm::Comm(int p) : _p{p} { MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &_mpi_world_rank)); MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &_mpi_world_size)); - CUGRAPH_EXPECTS( - _p == _mpi_world_size, - "Invalid input arguments: p should match the number of MPI processes."); + CUGRAPH_EXPECTS( (_p == _mpi_world_size), + "Invalid input arguments: p should match the number of MPI processes."); _mpi_comm = MPI_COMM_WORLD; @@ -60,11 +59,6 @@ Comm::Comm(int p) : _p{p} { CUDA_TRY( cudaDeviceGetAttribute( &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id)); - int supported{0}; - CUDA_TRY(cudaDeviceGetAttribute(&supported, cudaDevAttrStreamPrioritiesSupported, _device_id)); - CUDA_TRY(cudaDeviceGetStreamPriorityRange(&_cuda_stream_least_priority, &_cuda_stream_greatest_priority)); - - CUDA_TRY(cudaStreamCreate(&_default_stream)); // NCCL @@ -73,8 +67,8 @@ Comm::Comm(int p) : _p{p} { NCCL_TRY(ncclGetUniqueId(&nccl_unique_id_p)); } MPI_TRY(MPI_Bcast(&nccl_unique_id_p, sizeof(ncclUniqueId), MPI_BYTE, 0, _mpi_comm)); - NCCL_TRY(ncclCommInitRank(&_nccl_comm, get_p(), nccl_unique_id_p, get_rank())); + _finalize_nccl = true; #endif } @@ -82,13 +76,8 @@ Comm::Comm(int p) : _p{p} { Comm::~Comm() { #if USE_NCCL // NCCL - ncclCommDestroy(_nccl_comm); - - // CUDA - for (auto& stream : _extra_streams) { - cudaStreamDestroy(stream); - } - cudaStreamDestroy(_default_stream); + if (_finalize_nccl) + ncclCommDestroy(_nccl_comm); if (_finalize_mpi) { MPI_Finalize(); diff --git a/cpp/src/comms/mpi/comms_mpi.hpp b/cpp/src/comms/mpi/comms_mpi.hpp index 22afc234b8d..3521c9abae7 100644 --- a/cpp/src/comms/mpi/comms_mpi.hpp +++ b/cpp/src/comms/mpi/comms_mpi.hpp @@ -198,6 +198,8 @@ class Comm int _mpi_world_rank{0}; int _mpi_world_size{0}; bool _finalize_mpi{false}; + bool _finalize_nccl{false}; + int _device_id{0}; int _device_count{0}; @@ -210,11 +212,6 @@ class Comm int _max_block_dim_1D{0}; int _l2_cache_size{0}; int _shared_memory_size_per_sm{0}; - int _cuda_stream_least_priority{0}; - int _cuda_stream_greatest_priority{0}; - - cudaStream_t _default_stream{}; - std::vector _extra_streams{}; #if USE_NCCL MPI_Comm _mpi_comm{}; @@ -222,6 +219,7 @@ class Comm #endif public: + Comm(){}; Comm(int p); ~Comm(); int get_rank() const { return _mpi_world_rank; } diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu index 2397a487d18..bbb5a006eb6 100644 --- a/cpp/tests/nccl/degree_test.cu +++ b/cpp/tests/nccl/degree_test.cu @@ -49,10 +49,14 @@ TEST(degree, success) // IN degree G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::IN); + + std::cout<< "passed"< Date: Tue, 28 Apr 2020 18:16:25 -0500 Subject: [PATCH 07/19] added edge list partitioning of test input and fixes --- cpp/src/structure/graph.cu | 1 + cpp/tests/nccl/degree_test.cu | 75 ++++++++++++++++++++++++++--------- 2 files changed, 58 insertions(+), 18 deletions(-) diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu index 391c6538be2..1cfe6d56e2a 100644 --- a/cpp/src/structure/graph.cu +++ b/cpp/src/structure/graph.cu @@ -47,6 +47,7 @@ void degree_from_vertex_ids(const cugraph::experimental::Comm& comm, [indices, degree] __device__ (edge_t e) { cugraph::atomicAdd(degree + indices[e], 1); }); + std::cout<< number_of_vertices<<" "<< number_of_edges< & ind_h, degree[ind_h[i]] += 1; } +// global to local offsets by shifting all offsets by the first offset value +template +void shift_by_front(std::vector & v) { + auto start = v.front(); + for (auto i = size_t{0}; i < v.size(); ++i) + v[i] -= start; +} + +// 1D partitioning such as each GPU has about the same number of edges +template +void opg_edge_partioning(int r, int p, std::vector & ind_h, std::vector & part_offset, size_t & e_loc) { + + //set first and last partition offsets + part_offset[0] = 0; + part_offset[p] = ind_h.size(); + //part_offset[p] = *(std::max_element(ind_h.begin(), ind_h.end())); + auto loc_nnz = ind_h.size()/p; + for (int i=1; i= start_nnz) { + start_v = j; + break; + } + } + part_offset[i] = start_v; + } + e_loc = part_offset[r+1] - part_offset[r]; +} TEST(degree, success) { int v = 6; //host - std::vector src_h= {0, 0, 2, 2, 2, 3, 3, 4, 4, 5}, - dest_h={1, 2, 0, 1, 4, 4, 5, 3, 5, 3}; + std::vector src_h= {0, 0, 2, 2, 2, 3, 3, 4, 4, 5, 5}, + dest_h={1, 2, 0, 1, 4, 4, 5, 3, 5, 3, 1}; std::vector degree_h(v, 0.0), degree_ref(v, 0.0); - //device - thrust::device_vector src_d(src_h.begin(), src_h.begin()+src_h.size()); - thrust::device_vector dest_d(dest_h.begin(), dest_h.begin()+dest_h.size()); - thrust::device_vector degree_d(v); + //MG int p; MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &p)); cugraph::experimental::Comm comm(p); + std::vector part_offset(p + 1); + auto i = comm.get_rank(); + size_t e_loc; + + opg_edge_partioning(i, p, src_h, part_offset, e_loc); + sleep(i); + for (auto j = part_offset.begin(); j != part_offset.end(); ++j) + std::cout << *j << ' '; + std::cout << std::endl; + std::cout<< "eloc: "<< e_loc < src_loc_h(src_h.begin()+part_offset[i], src_h.begin()+part_offset[i]+e_loc), + dest_loc_h(dest_h.begin()+part_offset[i], dest_h.begin()+part_offset[i]+e_loc); + shift_by_front(src_loc_h); + // print mg info printf("# Rank %2d - Pid %6d - device %2d\n", comm.get_rank(), getpid(), comm.get_dev()); + //local device + thrust::device_vector src_d(src_loc_h.begin(), src_loc_h.end()); + thrust::device_vector dest_d(dest_loc_h.begin(), dest_loc_h.end()); + thrust::device_vector degree_d(v); + // load cugraph (fix me : split per process) cugraph::experimental::GraphCOO G(thrust::raw_pointer_cast(src_d.data()), thrust::raw_pointer_cast(dest_d.data()), - nullptr, degree_h.size(), dest_h.size()); + nullptr, degree_h.size(), e_loc); G.set_communicator(comm); - // IN degree - G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::IN); - - std::cout<< "passed"< Date: Tue, 28 Apr 2020 18:32:38 -0500 Subject: [PATCH 08/19] more fixes and cleanup --- cpp/src/structure/graph.cu | 5 ++++- cpp/tests/nccl/degree_test.cu | 9 +++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu index 1cfe6d56e2a..d0ade029462 100644 --- a/cpp/src/structure/graph.cu +++ b/cpp/src/structure/graph.cu @@ -47,7 +47,6 @@ void degree_from_vertex_ids(const cugraph::experimental::Comm& comm, [indices, degree] __device__ (edge_t e) { cugraph::atomicAdd(degree + indices[e], 1); }); - std::cout<< number_of_vertices<<" "<< number_of_edges<::degree(ET *degree, DegreeDirection direction) const { cudaStream_t stream{nullptr}; if (direction != DegreeDirection::IN) { + if (GraphBase::comm.get_p()); // FixMe retrieve global source indexing for the allreduce work + CUGRAPH_FAIL("OPG degree not implemented for OUT degree"); degree_from_vertex_ids(GraphBase::comm, GraphBase::number_of_vertices, GraphBase::number_of_edges, src_indices, degree, stream); } @@ -98,6 +99,8 @@ void GraphCompressedSparseBase::degree(ET *degree, DegreeDirection dir cudaStream_t stream{nullptr}; if (direction != DegreeDirection::IN) { + if (GraphBase::comm.get_p()); + CUGRAPH_FAIL("OPG degree not implemented for OUT degree"); // FixMe retrieve global source indexing for the allreduce to work degree_from_offsets(GraphBase::number_of_vertices, offsets, degree, stream); } diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu index 15b0a751520..3b44ed1ce86 100644 --- a/cpp/tests/nccl/degree_test.cu +++ b/cpp/tests/nccl/degree_test.cu @@ -70,12 +70,13 @@ TEST(degree, success) size_t e_loc; opg_edge_partioning(i, p, src_h, part_offset, e_loc); + #ifdef OPG_VERBOSE sleep(i); for (auto j = part_offset.begin(); j != part_offset.end(); ++j) std::cout << *j << ' '; std::cout << std::endl; std::cout<< "eloc: "<< e_loc < src_loc_h(src_h.begin()+part_offset[i], src_h.begin()+part_offset[i]+e_loc), dest_loc_h(dest_h.begin()+part_offset[i], dest_h.begin()+part_offset[i]+e_loc); shift_by_front(src_loc_h); @@ -96,10 +97,10 @@ TEST(degree, success) G.set_communicator(comm); // OUT degree - G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::OUT); + G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::IN); thrust::copy(degree_d.begin(), degree_d.end(), degree_h.begin()); - ref_degree_h(src_h, degree_ref); - sleep(i); + ref_degree_h(dest_h, degree_ref); + //sleep(i); for (size_t j = 0; j < degree_h.size(); ++j) EXPECT_EQ(degree_ref[j], degree_h[j]); std::cout<< "Rank "<< i << " done checking." < Date: Tue, 28 Apr 2020 18:39:51 -0500 Subject: [PATCH 09/19] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index afe42a6a3b7..3c99b94d2bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # cuGraph 0.14.0 (Date TBD) ## New Features +- PR #840 OPG degree ## Improvements - PR #764 Updated sssp and bfs with GraphCSR, removed gdf_column, added nullptr weights test for sssp From 18094e1f5ef5d6f2ef65347f19b57e7423225b04 Mon Sep 17 00:00:00 2001 From: Alex Fender Date: Tue, 28 Apr 2020 18:42:17 -0500 Subject: [PATCH 10/19] fixed comment --- cpp/tests/nccl/degree_test.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu index 3b44ed1ce86..83910e73c24 100644 --- a/cpp/tests/nccl/degree_test.cu +++ b/cpp/tests/nccl/degree_test.cu @@ -90,7 +90,7 @@ TEST(degree, success) thrust::device_vector dest_d(dest_loc_h.begin(), dest_loc_h.end()); thrust::device_vector degree_d(v); - // load cugraph (fix me : split per process) + // load local chunck to cugraph cugraph::experimental::GraphCOO G(thrust::raw_pointer_cast(src_d.data()), thrust::raw_pointer_cast(dest_d.data()), nullptr, degree_h.size(), e_loc); @@ -115,4 +115,4 @@ int main( int argc, char** argv ) rmmFinalize(); MPI_Finalize(); return rc; -} \ No newline at end of file +} From 008327f13d4fb77d39d56a8fe4454507037fc8a0 Mon Sep 17 00:00:00 2001 From: afender Date: Wed, 29 Apr 2020 12:51:56 -0500 Subject: [PATCH 11/19] fixes --- cpp/CMakeLists.txt | 4 ++++ cpp/src/comms/mpi/comms_mpi.cpp | 1 - cpp/src/comms/mpi/comms_mpi.hpp | 8 +++----- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index db1dda9cfcf..6031dc8ccef 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -398,6 +398,10 @@ add_library(cugraph SHARED # add_dependencies(cugraph cugunrock) +if (BUILD_MPI) + add_compile_definitions(USE_NCCL=1) +endif (BUILD_MPI) + ################################################################################################### # - include paths --------------------------------------------------------------------------------- target_include_directories(cugraph diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp index 167594a783c..24112048509 100644 --- a/cpp/src/comms/mpi/comms_mpi.cpp +++ b/cpp/src/comms/mpi/comms_mpi.cpp @@ -86,7 +86,6 @@ Comm::~Comm() { } void Comm::barrier() { - cudaDeviceSynchronize(); #if USE_NCCL MPI_Barrier(MPI_COMM_WORLD); #endif diff --git a/cpp/src/comms/mpi/comms_mpi.hpp b/cpp/src/comms/mpi/comms_mpi.hpp index 3521c9abae7..9b1ca8c3126 100644 --- a/cpp/src/comms/mpi/comms_mpi.hpp +++ b/cpp/src/comms/mpi/comms_mpi.hpp @@ -17,8 +17,6 @@ #pragma once -#define USE_NCCL 1 - #if USE_NCCL #include #include @@ -32,6 +30,9 @@ namespace cugraph { namespace experimental { +enum class ReduceOp { SUM, MAX, MIN }; + +#if USE_NCCL /**---------------------------------------------------------------------------* * @brief Exception thrown when a NCCL error is encountered. * @@ -47,7 +48,6 @@ inline void throw_nccl_error(ncclResult_t error, const char* file, std::to_string(line) + ": " + ncclGetErrorString(error)}); } -#if USE_NCCL #define NCCL_TRY(call) { \ ncclResult_t nccl_status = (call); \ if (nccl_status!= ncclSuccess) { \ @@ -156,8 +156,6 @@ constexpr ncclDataType_t get_nccl_type() { } } -enum class ReduceOp { SUM, MAX, MIN }; - constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) { if (reduce_op == ReduceOp::SUM) { return MPI_SUM; From 40b448f3cbfce1becd271106b15389de3715acce Mon Sep 17 00:00:00 2001 From: afender Date: Wed, 29 Apr 2020 15:52:00 -0500 Subject: [PATCH 12/19] non-mpi path --- cpp/tests/nccl/degree_test.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu index 83910e73c24..1c7221076d4 100644 --- a/cpp/tests/nccl/degree_test.cu +++ b/cpp/tests/nccl/degree_test.cu @@ -2,8 +2,6 @@ #include #include "test_utils.h" #include -#include -#include #include #include #include From 1c6b26720bbf39ad03ad0f7a67783d0ab0d5aa5c Mon Sep 17 00:00:00 2001 From: afender Date: Thu, 30 Apr 2020 17:01:18 -0500 Subject: [PATCH 13/19] headers reorg for comms deployment --- cpp/include/comms_mpi.hpp | 77 ++++++++++ cpp/include/graph.hpp | 2 +- cpp/src/comms/mpi/comms_mpi.cpp | 182 ++++++++++++++++++++++- cpp/src/comms/mpi/comms_mpi.hpp | 254 -------------------------------- cpp/tests/nccl/degree_test.cu | 2 +- 5 files changed, 259 insertions(+), 258 deletions(-) create mode 100644 cpp/include/comms_mpi.hpp delete mode 100644 cpp/src/comms/mpi/comms_mpi.hpp diff --git a/cpp/include/comms_mpi.hpp b/cpp/include/comms_mpi.hpp new file mode 100644 index 00000000000..c6cb5339f54 --- /dev/null +++ b/cpp/include/comms_mpi.hpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#if USE_NCCL +#include +#include +#endif + +namespace cugraph { +namespace experimental { + +enum class ReduceOp { SUM, MAX, MIN }; + +// basic info about the snmg env setup +class Comm +{ + private: + int _p{0}; + + int _mpi_world_rank{0}; + int _mpi_world_size{0}; + bool _finalize_mpi{false}; + bool _finalize_nccl{false}; + + + int _device_id{0}; + int _device_count{0}; + + int _sm_count_per_device{0}; + int _max_grid_dim_1D{0}; + int _max_block_dim_1D{0}; + int _l2_cache_size{0}; + int _shared_memory_size_per_sm{0}; + +#if USE_NCCL + MPI_Comm _mpi_comm{}; + ncclComm_t _nccl_comm{}; + #endif + + public: + Comm(){}; + Comm(int p); + ~Comm(); + int get_rank() const { return _mpi_world_rank; } + int get_p() const { return _mpi_world_size; } + int get_dev() const { return _device_id; } + int get_dev_count() const { return _device_count; } + int get_sm_count() const { return _sm_count_per_device; } + bool is_master() const { return (_mpi_world_rank == 0)? true : false; } + + void barrier(); + + template + void allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const; + + template + void allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const; + +}; + +} } //namespace diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp index ee8d6e95fc0..aac5e9116a1 100644 --- a/cpp/include/graph.hpp +++ b/cpp/include/graph.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ #pragma once -#include "comms/mpi/comms_mpi.hpp" +#include namespace cugraph { namespace experimental { diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp index 24112048509..6a1846ee35b 100644 --- a/cpp/src/comms/mpi/comms_mpi.cpp +++ b/cpp/src/comms/mpi/comms_mpi.cpp @@ -14,13 +14,168 @@ * limitations under the License. */ -#include "comms/mpi/comms_mpi.hpp" #include -#include +#include +#include +#include "utilities/error_utils.h" namespace cugraph { namespace experimental { +#if USE_NCCL + +/**---------------------------------------------------------------------------* + * @brief Exception thrown when a NCCL error is encountered. + * + *---------------------------------------------------------------------------**/ +struct nccl_error : public std::runtime_error { + nccl_error(std::string const& message) : std::runtime_error(message) {} +}; + +inline void throw_nccl_error(ncclResult_t error, const char* file, + unsigned int line) { + throw nccl_error( + std::string{"NCCL error encountered at: " + std::string{file} + ":" + + std::to_string(line) + ": " + ncclGetErrorString(error)}); +} + +#define NCCL_TRY(call) { \ + ncclResult_t nccl_status = (call); \ + if (nccl_status!= ncclSuccess) { \ + throw_nccl_error(nccl_status, __FILE__, __LINE__); \ + } \ +} +// MPI errors are expected to be fatal before reaching this. +// Fix me : improve when adding raft comms +#define MPI_TRY(cmd) { \ + int e = cmd; \ + if ( e != MPI_SUCCESS ) { \ + CUGRAPH_FAIL("Failed: MPI error"); \ + } \ +} + +template +constexpr MPI_Datatype get_mpi_type() { + if (std::is_integral::value) { + if (std::is_signed::value) { + if (sizeof(value_t) == 1) { + return MPI_INT8_T; + } + else if (sizeof(value_t) == 2) { + return MPI_INT16_T; + } + else if (sizeof(value_t) == 4) { + return MPI_INT32_T; + } + else if (sizeof(value_t) == 8) { + return MPI_INT64_T; + } + else { + CUGRAPH_FAIL("unsupported type"); + } + } + else { + if (sizeof(value_t) == 1) { + return MPI_UINT8_T; + } + else if (sizeof(value_t) == 2) { + return MPI_UINT16_T; + } + else if (sizeof(value_t) == 4) { + return MPI_UINT32_T; + } + else if (sizeof(value_t) == 8) { + return MPI_UINT64_T; + } + else { + CUGRAPH_FAIL("unsupported type"); + } + } + } + else if(std::is_same::value) { + return MPI_FLOAT; + } + else if(std::is_same::value) { + return MPI_DOUBLE; + } + else { + CUGRAPH_FAIL("unsupported type"); + } +} + +template +constexpr ncclDataType_t get_nccl_type() { + if (std::is_integral::value) { + if (std::is_signed::value) { + if (sizeof(value_t) == 1) { + return ncclInt8; + } + else if (sizeof(value_t) == 4) { + return ncclInt32; + } + else if (sizeof(value_t) == 8) { + return ncclInt64; + } + else { + CUGRAPH_FAIL("unsupported type"); + } + } + else { + if (sizeof(value_t) == 1) { + return ncclUint8; + } + else if (sizeof(value_t) == 4) { + return ncclUint32; + } + else if (sizeof(value_t) == 8) { + return ncclUint64; + } + else { + CUGRAPH_FAIL("unsupported type"); + } + } + } + else if(std::is_same::value) { + return ncclFloat32; + } + else if(std::is_same::value) { + return ncclFloat64; + } + else { + CUGRAPH_FAIL("unsupported type"); + } +} + +constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) { + if (reduce_op == ReduceOp::SUM) { + return MPI_SUM; + } + else if (reduce_op == ReduceOp::MAX) { + return MPI_MAX; + } + else if (reduce_op == ReduceOp::MIN) { + return MPI_MIN; + } + else { + CUGRAPH_FAIL("unsupported type"); + } +} + +constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) { + if (reduce_op == ReduceOp::SUM) { + return ncclSum; + } + else if (reduce_op == ReduceOp::MAX) { + return ncclMax; + } + else if (reduce_op == ReduceOp::MIN) { + return ncclMin; + } + else { + CUGRAPH_FAIL("unsupported type"); + } +} +#endif Comm::Comm(int p) : _p{p} { #if USE_NCCL @@ -90,4 +245,27 @@ void Comm::barrier() { MPI_Barrier(MPI_COMM_WORLD); #endif } + +template +void Comm::allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const { +#if USE_NCCL + NCCL_TRY(ncclAllGather((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type(), _nccl_comm, cudaStreamDefault)); +#endif +} + +template +void Comm::allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const { +#if USE_NCCL + NCCL_TRY(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault)); +#endif +} + +//explicit +template void Comm::allgather(size_t size, int* sendbuff, int* recvbuff) const; +template void Comm::allgather(size_t size, float* sendbuff, float* recvbuff) const; +template void Comm::allgather(size_t size, double* sendbuff, double* recvbuff) const; +template void Comm::allreduce(size_t size, int* sendbuff, int* recvbuff, ReduceOp reduce_op) const; +template void Comm::allreduce(size_t size, float* sendbuff, float* recvbuff, ReduceOp reduce_op) const; +template void Comm::allreduce(size_t size, double* sendbuff, double* recvbuff, ReduceOp reduce_op) const; + } }//namespace diff --git a/cpp/src/comms/mpi/comms_mpi.hpp b/cpp/src/comms/mpi/comms_mpi.hpp deleted file mode 100644 index 9b1ca8c3126..00000000000 --- a/cpp/src/comms/mpi/comms_mpi.hpp +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#if USE_NCCL -#include -#include -#endif - -#include -#include -#include -#include "utilities/error_utils.h" - -namespace cugraph { -namespace experimental { - -enum class ReduceOp { SUM, MAX, MIN }; - -#if USE_NCCL -/**---------------------------------------------------------------------------* - * @brief Exception thrown when a NCCL error is encountered. - * - *---------------------------------------------------------------------------**/ -struct nccl_error : public std::runtime_error { - nccl_error(std::string const& message) : std::runtime_error(message) {} -}; - -inline void throw_nccl_error(ncclResult_t error, const char* file, - unsigned int line) { - throw nccl_error( - std::string{"NCCL error encountered at: " + std::string{file} + ":" + - std::to_string(line) + ": " + ncclGetErrorString(error)}); -} - -#define NCCL_TRY(call) { \ - ncclResult_t nccl_status = (call); \ - if (nccl_status!= ncclSuccess) { \ - throw_nccl_error(nccl_status, __FILE__, __LINE__); \ - } \ -} - -// MPI errors are expected to be fatal before reaching this. -// Fix me : improve when adding raft comms -#define MPI_TRY(cmd) { \ - int e = cmd; \ - if ( e != MPI_SUCCESS ) { \ - CUGRAPH_FAIL("Failed: MPI error"); \ - } \ -} - -template -constexpr MPI_Datatype get_mpi_type() { - if (std::is_integral::value) { - if (std::is_signed::value) { - if (sizeof(value_t) == 1) { - return MPI_INT8_T; - } - else if (sizeof(value_t) == 2) { - return MPI_INT16_T; - } - else if (sizeof(value_t) == 4) { - return MPI_INT32_T; - } - else if (sizeof(value_t) == 8) { - return MPI_INT64_T; - } - else { - CUGRAPH_FAIL("unsupported type"); - } - } - else { - if (sizeof(value_t) == 1) { - return MPI_UINT8_T; - } - else if (sizeof(value_t) == 2) { - return MPI_UINT16_T; - } - else if (sizeof(value_t) == 4) { - return MPI_UINT32_T; - } - else if (sizeof(value_t) == 8) { - return MPI_UINT64_T; - } - else { - CUGRAPH_FAIL("unsupported type"); - } - } - } - else if(std::is_same::value) { - return MPI_FLOAT; - } - else if(std::is_same::value) { - return MPI_DOUBLE; - } - else { - CUGRAPH_FAIL("unsupported type"); - } -} - -template -constexpr ncclDataType_t get_nccl_type() { - if (std::is_integral::value) { - if (std::is_signed::value) { - if (sizeof(value_t) == 1) { - return ncclInt8; - } - else if (sizeof(value_t) == 4) { - return ncclInt32; - } - else if (sizeof(value_t) == 8) { - return ncclInt64; - } - else { - CUGRAPH_FAIL("unsupported type"); - } - } - else { - if (sizeof(value_t) == 1) { - return ncclUint8; - } - else if (sizeof(value_t) == 4) { - return ncclUint32; - } - else if (sizeof(value_t) == 8) { - return ncclUint64; - } - else { - CUGRAPH_FAIL("unsupported type"); - } - } - } - else if(std::is_same::value) { - return ncclFloat32; - } - else if(std::is_same::value) { - return ncclFloat64; - } - else { - CUGRAPH_FAIL("unsupported type"); - } -} - -constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) { - if (reduce_op == ReduceOp::SUM) { - return MPI_SUM; - } - else if (reduce_op == ReduceOp::MAX) { - return MPI_MAX; - } - else if (reduce_op == ReduceOp::MIN) { - return MPI_MIN; - } - else { - CUGRAPH_FAIL("unsupported type"); - } -} - -constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) { - if (reduce_op == ReduceOp::SUM) { - return ncclSum; - } - else if (reduce_op == ReduceOp::MAX) { - return ncclMax; - } - else if (reduce_op == ReduceOp::MIN) { - return ncclMin; - } - else { - CUGRAPH_FAIL("unsupported type"); - } -} -#endif - -// basic info about the snmg env setup -class Comm -{ - private: - int _p{0}; - - int _mpi_world_rank{0}; - int _mpi_world_size{0}; - bool _finalize_mpi{false}; - bool _finalize_nccl{false}; - - - int _device_id{0}; - int _device_count{0}; - - std::vector _p_ipc_mems{}; - std::vector _local_ipc_mem_offsets{}; - - int _sm_count_per_device{0}; - int _max_grid_dim_1D{0}; - int _max_block_dim_1D{0}; - int _l2_cache_size{0}; - int _shared_memory_size_per_sm{0}; - -#if USE_NCCL - MPI_Comm _mpi_comm{}; - ncclComm_t _nccl_comm{}; - #endif - - public: - Comm(){}; - Comm(int p); - ~Comm(); - int get_rank() const { return _mpi_world_rank; } - int get_p() const { return _mpi_world_size; } - int get_dev() const { return _device_id; } - int get_dev_count() const { return _device_count; } - int get_sm_count() const { return _sm_count_per_device; } - bool is_master() const { return (_mpi_world_rank == 0)? true : false; } - - void barrier(); - - template - void allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const; - - template - void allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const; - -}; - -template -void Comm::allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const { -#if USE_NCCL - NCCL_TRY(ncclAllGather((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type(), _nccl_comm, cudaStreamDefault)); -#endif -} - -template -void Comm::allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const { -#if USE_NCCL - NCCL_TRY(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault)); -#endif -} - -} } //namespace diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu index 1c7221076d4..828ccbcb94b 100644 --- a/cpp/tests/nccl/degree_test.cu +++ b/cpp/tests/nccl/degree_test.cu @@ -5,7 +5,7 @@ #include #include #include -#include "comms/mpi/comms_mpi.hpp" +#include // ref Degree on the host template From 1643eeebb34bdeec77ee13fbda27130d9ac2c37a Mon Sep 17 00:00:00 2001 From: afender Date: Fri, 1 May 2020 16:41:46 -0500 Subject: [PATCH 14/19] constructor for python and fixes --- cpp/include/comms_mpi.hpp | 14 +++++++------- cpp/src/comms/mpi/comms_mpi.cpp | 30 +++++++++++++++++++++++++----- cpp/src/structure/graph.cu | 4 ++-- 3 files changed, 34 insertions(+), 14 deletions(-) diff --git a/cpp/include/comms_mpi.hpp b/cpp/include/comms_mpi.hpp index c6cb5339f54..8019ebc809c 100644 --- a/cpp/include/comms_mpi.hpp +++ b/cpp/include/comms_mpi.hpp @@ -32,13 +32,10 @@ class Comm { private: int _p{0}; - - int _mpi_world_rank{0}; - int _mpi_world_size{0}; + int _rank{0}; bool _finalize_mpi{false}; bool _finalize_nccl{false}; - int _device_id{0}; int _device_count{0}; @@ -56,13 +53,16 @@ class Comm public: Comm(){}; Comm(int p); + #if USE_NCCL + Comm(ncclComm_t comm, int size, int rank); + #endif ~Comm(); - int get_rank() const { return _mpi_world_rank; } - int get_p() const { return _mpi_world_size; } + int get_rank() const { return _rank; } + int get_p() const { return _p; } int get_dev() const { return _device_id; } int get_dev_count() const { return _device_count; } int get_sm_count() const { return _sm_count_per_device; } - bool is_master() const { return (_mpi_world_rank == 0)? true : false; } + bool is_master() const { return (_rank == 0)? true : false; } void barrier(); diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp index 6a1846ee35b..5b7390abfc4 100644 --- a/cpp/src/comms/mpi/comms_mpi.cpp +++ b/cpp/src/comms/mpi/comms_mpi.cpp @@ -180,7 +180,7 @@ constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) { Comm::Comm(int p) : _p{p} { #if USE_NCCL // MPI - int flag{}; + int flag{}, mpi_world_size; MPI_TRY(MPI_Initialized(&flag)); @@ -193,9 +193,9 @@ Comm::Comm(int p) : _p{p} { _finalize_mpi = true; } - MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &_mpi_world_rank)); - MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &_mpi_world_size)); - CUGRAPH_EXPECTS( (_p == _mpi_world_size), + MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &_rank)); + MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size)); + CUGRAPH_EXPECTS( (_p == mpi_world_size), "Invalid input arguments: p should match the number of MPI processes."); _mpi_comm = MPI_COMM_WORLD; @@ -203,7 +203,7 @@ Comm::Comm(int p) : _p{p} { // CUDA CUDA_TRY(cudaGetDeviceCount(&_device_count)); - _device_id = _mpi_world_rank % _device_count; + _device_id = _rank % _device_count; // FixMe : assumes each node has the same number of GPUs CUDA_TRY(cudaSetDevice(_device_id)); CUDA_TRY( @@ -228,6 +228,26 @@ Comm::Comm(int p) : _p{p} { } +#if USE_NCCL +Comm::Comm(ncclComm_t comm, int size, int rank) + : _nccl_comm(comm), _p(size), _rank(rank) { + + // CUDA + CUDA_TRY(cudaGetDeviceCount(&_device_count)); + _device_id = _rank % _device_count; // FixMe : assumes each node has the same number of GPUs + CUDA_TRY(cudaSetDevice(_device_id)); // FixMe : check if this is needed or if python takes care of this + + CUDA_TRY( + cudaDeviceGetAttribute(&_sm_count_per_device, cudaDevAttrMultiProcessorCount, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, _device_id)); + CUDA_TRY( + cudaDeviceGetAttribute( + &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id)); +} +#endif + Comm::~Comm() { #if USE_NCCL // NCCL diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu index d0ade029462..26a67275d19 100644 --- a/cpp/src/structure/graph.cu +++ b/cpp/src/structure/graph.cu @@ -78,7 +78,7 @@ void GraphCOO::degree(ET *degree, DegreeDirection direction) const { cudaStream_t stream{nullptr}; if (direction != DegreeDirection::IN) { - if (GraphBase::comm.get_p()); // FixMe retrieve global source indexing for the allreduce work + if (GraphBase::comm.get_p()) // FixMe retrieve global source indexing for the allreduce work CUGRAPH_FAIL("OPG degree not implemented for OUT degree"); degree_from_vertex_ids(GraphBase::comm, GraphBase::number_of_vertices, GraphBase::number_of_edges, src_indices, degree, stream); } @@ -99,7 +99,7 @@ void GraphCompressedSparseBase::degree(ET *degree, DegreeDirection dir cudaStream_t stream{nullptr}; if (direction != DegreeDirection::IN) { - if (GraphBase::comm.get_p()); + if (GraphBase::comm.get_p()) CUGRAPH_FAIL("OPG degree not implemented for OUT degree"); // FixMe retrieve global source indexing for the allreduce to work degree_from_offsets(GraphBase::number_of_vertices, offsets, degree, stream); } From 7ac19b1a1f17f71e9ff884ea2882ca3956045fb8 Mon Sep 17 00:00:00 2001 From: afender Date: Fri, 1 May 2020 16:52:42 -0500 Subject: [PATCH 15/19] naming --- cpp/CMakeLists.txt | 2 +- cpp/include/comms_mpi.hpp | 6 +++--- cpp/src/comms/mpi/comms_mpi.cpp | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6031dc8ccef..d1d56e94318 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -399,7 +399,7 @@ add_library(cugraph SHARED add_dependencies(cugraph cugunrock) if (BUILD_MPI) - add_compile_definitions(USE_NCCL=1) + add_compile_definitions(ENABLE_OPG=1) endif (BUILD_MPI) ################################################################################################### diff --git a/cpp/include/comms_mpi.hpp b/cpp/include/comms_mpi.hpp index 8019ebc809c..27944aea103 100644 --- a/cpp/include/comms_mpi.hpp +++ b/cpp/include/comms_mpi.hpp @@ -17,7 +17,7 @@ #pragma once -#if USE_NCCL +#if ENABLE_OPG #include #include #endif @@ -45,7 +45,7 @@ class Comm int _l2_cache_size{0}; int _shared_memory_size_per_sm{0}; -#if USE_NCCL +#if ENABLE_OPG MPI_Comm _mpi_comm{}; ncclComm_t _nccl_comm{}; #endif @@ -53,7 +53,7 @@ class Comm public: Comm(){}; Comm(int p); - #if USE_NCCL + #if ENABLE_OPG Comm(ncclComm_t comm, int size, int rank); #endif ~Comm(); diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp index 5b7390abfc4..b2fdda1a00c 100644 --- a/cpp/src/comms/mpi/comms_mpi.cpp +++ b/cpp/src/comms/mpi/comms_mpi.cpp @@ -22,7 +22,7 @@ namespace cugraph { namespace experimental { -#if USE_NCCL +#if ENABLE_OPG /**---------------------------------------------------------------------------* * @brief Exception thrown when a NCCL error is encountered. @@ -178,7 +178,7 @@ constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) { #endif Comm::Comm(int p) : _p{p} { -#if USE_NCCL +#if ENABLE_OPG // MPI int flag{}, mpi_world_size; @@ -228,7 +228,7 @@ Comm::Comm(int p) : _p{p} { } -#if USE_NCCL +#if ENABLE_OPG Comm::Comm(ncclComm_t comm, int size, int rank) : _nccl_comm(comm), _p(size), _rank(rank) { @@ -249,7 +249,7 @@ Comm::Comm(ncclComm_t comm, int size, int rank) #endif Comm::~Comm() { -#if USE_NCCL +#if ENABLE_OPG // NCCL if (_finalize_nccl) ncclCommDestroy(_nccl_comm); @@ -261,21 +261,21 @@ Comm::~Comm() { } void Comm::barrier() { -#if USE_NCCL +#if ENABLE_OPG MPI_Barrier(MPI_COMM_WORLD); #endif } template void Comm::allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const { -#if USE_NCCL +#if ENABLE_OPG NCCL_TRY(ncclAllGather((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type(), _nccl_comm, cudaStreamDefault)); #endif } template void Comm::allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const { -#if USE_NCCL +#if ENABLE_OPG NCCL_TRY(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault)); #endif } From 5b45a5c582acff7465e9c8d03e7df293d3856264 Mon Sep 17 00:00:00 2001 From: afender Date: Tue, 5 May 2020 15:20:07 -0500 Subject: [PATCH 16/19] Clang formating --- cpp/include/comms_mpi.hpp | 78 +++++------ cpp/include/graph.hpp | 214 +++++++++++++++-------------- cpp/src/comms/mpi/comms_mpi.cpp | 229 +++++++++++++++---------------- cpp/src/structure/graph.cu | 87 ++++++------ cpp/src/utilities/cuda_utils.cuh | 58 ++++---- cpp/tests/nccl/degree_test.cu | 94 +++++++------ 6 files changed, 390 insertions(+), 370 deletions(-) diff --git a/cpp/include/comms_mpi.hpp b/cpp/include/comms_mpi.hpp index 27944aea103..68fbf4f27cc 100644 --- a/cpp/include/comms_mpi.hpp +++ b/cpp/include/comms_mpi.hpp @@ -14,64 +14,62 @@ * limitations under the License. */ - #pragma once - #if ENABLE_OPG #include #include #endif -namespace cugraph { +namespace cugraph { namespace experimental { enum class ReduceOp { SUM, MAX, MIN }; // basic info about the snmg env setup -class Comm -{ - private: - int _p{0}; - int _rank{0}; - bool _finalize_mpi{false}; - bool _finalize_nccl{false}; +class Comm { +private: + int _p{0}; + int _rank{0}; + bool _finalize_mpi{false}; + bool _finalize_nccl{false}; - int _device_id{0}; - int _device_count{0}; + int _device_id{0}; + int _device_count{0}; - int _sm_count_per_device{0}; - int _max_grid_dim_1D{0}; - int _max_block_dim_1D{0}; - int _l2_cache_size{0}; - int _shared_memory_size_per_sm{0}; + int _sm_count_per_device{0}; + int _max_grid_dim_1D{0}; + int _max_block_dim_1D{0}; + int _l2_cache_size{0}; + int _shared_memory_size_per_sm{0}; #if ENABLE_OPG - MPI_Comm _mpi_comm{}; - ncclComm_t _nccl_comm{}; - #endif - - public: - Comm(){}; - Comm(int p); - #if ENABLE_OPG - Comm(ncclComm_t comm, int size, int rank); - #endif - ~Comm(); - int get_rank() const { return _rank; } - int get_p() const { return _p; } - int get_dev() const { return _device_id; } - int get_dev_count() const { return _device_count; } - int get_sm_count() const { return _sm_count_per_device; } - bool is_master() const { return (_rank == 0)? true : false; } + MPI_Comm _mpi_comm{}; + ncclComm_t _nccl_comm{}; +#endif - void barrier(); +public: + Comm(){}; + Comm(int p); +#if ENABLE_OPG + Comm(ncclComm_t comm, int size, int rank); +#endif + ~Comm(); + int get_rank() const { return _rank; } + int get_p() const { return _p; } + int get_dev() const { return _device_id; } + int get_dev_count() const { return _device_count; } + int get_sm_count() const { return _sm_count_per_device; } + bool is_master() const { return (_rank == 0) ? true : false; } - template - void allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const; + void barrier(); - template - void allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const; + template + void allgather(size_t size, value_t *sendbuff, value_t *recvbuff) const; + template + void allreduce(size_t size, value_t *sendbuff, value_t *recvbuff, + ReduceOp reduce_op) const; }; -} } //namespace +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp index aac5e9116a1..1c11f17311a 100644 --- a/cpp/include/graph.hpp +++ b/cpp/include/graph.hpp @@ -18,7 +18,7 @@ namespace cugraph { namespace experimental { -enum class PropType{PROP_UNDEF, PROP_FALSE, PROP_TRUE}; +enum class PropType { PROP_UNDEF, PROP_FALSE, PROP_TRUE }; struct GraphProperties { bool directed{false}; @@ -31,10 +31,10 @@ struct GraphProperties { }; enum class DegreeDirection { - IN_PLUS_OUT = 0, ///> Compute sum of in and out degree - IN, ///> Compute in degree - OUT, ///> Compute out degree - DEGREE_DIRECTION_COUNT + IN_PLUS_OUT = 0, ///> Compute sum of in and out degree + IN, ///> Compute in degree + OUT, ///> Compute out degree + DEGREE_DIRECTION_COUNT }; /** @@ -44,32 +44,29 @@ enum class DegreeDirection { * @tparam ET Type of edge id * @tparam WT Type of weight */ -template -class GraphBase { +template class GraphBase { public: Comm comm; - WT *edge_data; ///< edge weight - GraphProperties prop; + WT *edge_data; ///< edge weight + GraphProperties prop; - VT number_of_vertices; - ET number_of_edges; + VT number_of_vertices; + ET number_of_edges; /** * @brief Fill the identifiers array with the vertex identifiers. * - * @param[out] identifier Pointer to device memory to store the vertex identifiers + * @param[out] identifier Pointer to device memory to store the vertex + * identifiers */ void get_vertex_identifiers(VT *identifiers) const; - void set_communicator(Comm& comm_) {comm = comm_;} - - GraphBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_): - edge_data(edge_data_), - comm(), - prop(), - number_of_vertices(number_of_vertices_), - number_of_edges(number_of_edges_) - {} + void set_communicator(Comm &comm_) { comm = comm_; } + + GraphBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + : edge_data(edge_data_), comm(), prop(), + number_of_vertices(number_of_vertices_), + number_of_edges(number_of_edges_) {} }; /** @@ -80,66 +77,73 @@ class GraphBase { * @tparam WT Type of weight */ template -class GraphCOO: public GraphBase { +class GraphCOO : public GraphBase { public: - VT *src_indices{nullptr}; ///< rowInd - VT *dst_indices{nullptr}; ///< colInd + VT *src_indices{nullptr}; ///< rowInd + VT *dst_indices{nullptr}; ///< colInd /** * @brief Computes degree(in, out, in+out) of all the nodes of a Graph * * @throws cugraph::logic_error when an error occurs. * - * @param[out] degree Device array of size V (V is number of vertices) initialized to zeros. - * Will contain the computed degree of every vertex. + * @param[out] degree Device array of size V (V is number of + * vertices) initialized to zeros. Will contain the computed degree of every + * vertex. * @param[in] direction IN_PLUS_OUT, IN or OUT */ void degree(ET *degree, DegreeDirection direction) const; - + /** * @brief Default constructor */ - GraphCOO(): GraphBase(nullptr, 0, 0) {} - + GraphCOO() : GraphBase(nullptr, 0, 0) {} + /** * @brief Wrap existing arrays representing an edge list in a Graph. * - * GraphCOO does not own the memory used to represent this graph. This - * function does not allocate memory. + * GraphCOO does not own the memory used to represent this graph. + * This function does not allocate memory. * - * @param source_indices This array of size E (number of edges) contains the index of the source for each edge. - * Indices must be in the range [0, V-1]. - * @param destination_indices This array of size E (number of edges) contains the index of the destination for each edge. - * Indices must be in the range [0, V-1]. - * @param edge_data This array size E (number of edges) contains the weight for each edge. This array can be null - * in which case the graph is considered unweighted. + * @param source_indices This array of size E (number of edges) + * contains the index of the source for each edge. Indices must be in the + * range [0, V-1]. + * @param destination_indices This array of size E (number of edges) + * contains the index of the destination for each edge. Indices must be in the + * range [0, V-1]. + * @param edge_data This array size E (number of edges) contains + * the weight for each edge. This array can be null in which case the graph + * is considered unweighted. * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ GraphCOO(VT *src_indices_, VT *dst_indices_, WT *edge_data_, - VT number_of_vertices_, ET number_of_edges_): - GraphBase(edge_data_, number_of_vertices_, number_of_edges_), - src_indices(src_indices_), dst_indices(dst_indices_) - {} + VT number_of_vertices_, ET number_of_edges_) + : GraphBase(edge_data_, number_of_vertices_, + number_of_edges_), + src_indices(src_indices_), dst_indices(dst_indices_) {} }; /** - * @brief Base class for graph stored in CSR (Compressed Sparse Row) format or CSC (Compressed Sparse Column) format + * @brief Base class for graph stored in CSR (Compressed Sparse Row) + * format or CSC (Compressed Sparse Column) format * * @tparam VT Type of vertex id * @tparam ET Type of edge id * @tparam WT Type of weight */ template -class GraphCompressedSparseBase: public GraphBase { +class GraphCompressedSparseBase : public GraphBase { public: - ET *offsets{nullptr}; ///< CSR offsets - VT *indices{nullptr}; ///< CSR indices + ET *offsets{nullptr}; ///< CSR offsets + VT *indices{nullptr}; ///< CSR indices /** - * @brief Fill the identifiers in the array with the source vertex identifiers + * @brief Fill the identifiers in the array with the source vertex + * identifiers * - * @param[out] src_indices Pointer to device memory to store the source vertex identifiers + * @param[out] src_indices Pointer to device memory to store the + * source vertex identifiers */ void get_source_indices(VT *src_indices) const; @@ -148,35 +152,35 @@ class GraphCompressedSparseBase: public GraphBase { * * @throws cugraph::logic_error when an error occurs. * - * @param[out] degree Device array of size V (V is number of vertices) initialized to zeros. - * Will contain the computed degree of every vertex. - * @param[in] x Integer value indicating type of degree calculation - * 0 : in+out degree - * 1 : in-degree - * 2 : out-degree + * @param[out] degree Device array of size V (V is number of + * vertices) initialized to zeros. Will contain the computed degree of every + * vertex. + * @param[in] x Integer value indicating type of degree + * calculation 0 : in+out degree 1 : in-degree 2 : out-degree */ void degree(ET *degree, DegreeDirection direction) const; - + /** * @brief Wrap existing arrays representing adjacency lists in a Graph. - * GraphCSR does not own the memory used to represent this graph. This - * function does not allocate memory. + * GraphCSR does not own the memory used to represent this graph. + * This function does not allocate memory. * - * @param offsets This array of size V+1 (V is number of vertices) contains the offset of adjacency lists of every vertex. - * Offsets must be in the range [0, E] (number of edges). - * @param indices This array of size E contains the index of the destination for each edge. - * Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for each edge. This - * array can be null in which case the graph is considered unweighted. + * @param offsets This array of size V+1 (V is number of + * vertices) contains the offset of adjacency lists of every vertex. Offsets + * must be in the range [0, E] (number of edges). + * @param indices This array of size E contains the index of + * the destination for each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array of size E (number of edges) + * contains the weight for each edge. This array can be null in which case + * the graph is considered unweighted. * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ GraphCompressedSparseBase(ET *offsets_, VT *indices_, WT *edge_data_, - VT number_of_vertices_, ET number_of_edges_): - GraphBase(edge_data_, number_of_vertices_, number_of_edges_), - offsets{offsets_}, - indices{indices_} - {} + VT number_of_vertices_, ET number_of_edges_) + : GraphBase(edge_data_, number_of_vertices_, + number_of_edges_), + offsets{offsets_}, indices{indices_} {} }; /** @@ -187,31 +191,36 @@ class GraphCompressedSparseBase: public GraphBase { * @tparam WT Type of weight */ template -class GraphCSR: public GraphCompressedSparseBase { +class GraphCSR : public GraphCompressedSparseBase { public: /** * @brief Default constructor */ - GraphCSR(): GraphCompressedSparseBase(nullptr, nullptr, nullptr, 0, 0) {} - + GraphCSR() + : GraphCompressedSparseBase(nullptr, nullptr, nullptr, 0, 0) { + } + /** * @brief Wrap existing arrays representing adjacency lists in a Graph. - * GraphCSR does not own the memory used to represent this graph. This - * function does not allocate memory. + * GraphCSR does not own the memory used to represent this graph. + * This function does not allocate memory. * - * @param offsets This array of size V+1 (V is number of vertices) contains the offset of adjacency lists of every vertex. - * Offsets must be in the range [0, E] (number of edges). - * @param indices This array of size E contains the index of the destination for each edge. - * Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for each edge. This - * array can be null in which case the graph is considered unweighted. + * @param offsets This array of size V+1 (V is number of + * vertices) contains the offset of adjacency lists of every vertex. Offsets + * must be in the range [0, E] (number of edges). + * @param indices This array of size E contains the index of + * the destination for each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array of size E (number of edges) + * contains the weight for each edge. This array can be null in which case + * the graph is considered unweighted. * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCSR(ET *offsets_, VT *indices_, WT *edge_data_, - VT number_of_vertices_, ET number_of_edges_): - GraphCompressedSparseBase(offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) - {} + GraphCSR(ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, + ET number_of_edges_) + : GraphCompressedSparseBase(offsets_, indices_, edge_data_, + number_of_vertices_, + number_of_edges_) {} }; /** @@ -222,32 +231,37 @@ class GraphCSR: public GraphCompressedSparseBase { * @tparam WT Type of weight */ template -class GraphCSC: public GraphCompressedSparseBase { +class GraphCSC : public GraphCompressedSparseBase { public: /** * @brief Default constructor */ - GraphCSC(): GraphCompressedSparseBase(nullptr, nullptr, nullptr, 0, 0) {} - + GraphCSC() + : GraphCompressedSparseBase(nullptr, nullptr, nullptr, 0, 0) { + } + /** - * @brief Wrap existing arrays representing transposed adjacency lists in a Graph. - * GraphCSC does not own the memory used to represent this graph. This - * function does not allocate memory. + * @brief Wrap existing arrays representing transposed adjacency lists in + * a Graph. GraphCSC does not own the memory used to represent this graph. + * This function does not allocate memory. * - * @param offsets This array of size V+1 (V is number of vertices) contains the offset of adjacency lists of every vertex. - * Offsets must be in the range [0, E] (number of edges). - * @param indices This array of size E contains the index of the destination for each edge. - * Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for each edge. This array - * can be null in which case the graph is considered unweighted. + * @param offsets This array of size V+1 (V is number of + * vertices) contains the offset of adjacency lists of every vertex. Offsets + * must be in the range [0, E] (number of edges). + * @param indices This array of size E contains the index of + * the destination for each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array of size E (number of edges) + * contains the weight for each edge. This array can be null in which case + * the graph is considered unweighted. * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCSC(ET *offsets_, VT *indices_, WT *edge_data_, - VT number_of_vertices_, ET number_of_edges_): - GraphCompressedSparseBase(offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) - {} + GraphCSC(ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, + ET number_of_edges_) + : GraphCompressedSparseBase(offsets_, indices_, edge_data_, + number_of_vertices_, + number_of_edges_) {} }; -} //namespace experimental -} //namespace cugraph +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp index b2fdda1a00c..2d761d2e2dc 100644 --- a/cpp/src/comms/mpi/comms_mpi.cpp +++ b/cpp/src/comms/mpi/comms_mpi.cpp @@ -14,13 +14,12 @@ * limitations under the License. */ - -#include +#include "utilities/error_utils.h" #include +#include #include -#include "utilities/error_utils.h" -namespace cugraph { +namespace cugraph { namespace experimental { #if ENABLE_OPG @@ -29,119 +28,97 @@ namespace experimental { * *---------------------------------------------------------------------------**/ struct nccl_error : public std::runtime_error { - nccl_error(std::string const& message) : std::runtime_error(message) {} + nccl_error(std::string const &message) : std::runtime_error(message) {} }; -inline void throw_nccl_error(ncclResult_t error, const char* file, +inline void throw_nccl_error(ncclResult_t error, const char *file, unsigned int line) { throw nccl_error( std::string{"NCCL error encountered at: " + std::string{file} + ":" + std::to_string(line) + ": " + ncclGetErrorString(error)}); } -#define NCCL_TRY(call) { \ - ncclResult_t nccl_status = (call); \ - if (nccl_status!= ncclSuccess) { \ - throw_nccl_error(nccl_status, __FILE__, __LINE__); \ - } \ -} +#define NCCL_TRY(call) \ + { \ + ncclResult_t nccl_status = (call); \ + if (nccl_status != ncclSuccess) { \ + throw_nccl_error(nccl_status, __FILE__, __LINE__); \ + } \ + } // MPI errors are expected to be fatal before reaching this. // Fix me : improve when adding raft comms -#define MPI_TRY(cmd) { \ - int e = cmd; \ - if ( e != MPI_SUCCESS ) { \ - CUGRAPH_FAIL("Failed: MPI error"); \ - } \ -} +#define MPI_TRY(cmd) \ + { \ + int e = cmd; \ + if (e != MPI_SUCCESS) { \ + CUGRAPH_FAIL("Failed: MPI error"); \ + } \ + } -template -constexpr MPI_Datatype get_mpi_type() { +template constexpr MPI_Datatype get_mpi_type() { if (std::is_integral::value) { if (std::is_signed::value) { if (sizeof(value_t) == 1) { return MPI_INT8_T; - } - else if (sizeof(value_t) == 2) { + } else if (sizeof(value_t) == 2) { return MPI_INT16_T; - } - else if (sizeof(value_t) == 4) { + } else if (sizeof(value_t) == 4) { return MPI_INT32_T; - } - else if (sizeof(value_t) == 8) { + } else if (sizeof(value_t) == 8) { return MPI_INT64_T; - } - else { + } else { CUGRAPH_FAIL("unsupported type"); } - } - else { + } else { if (sizeof(value_t) == 1) { return MPI_UINT8_T; - } - else if (sizeof(value_t) == 2) { + } else if (sizeof(value_t) == 2) { return MPI_UINT16_T; - } - else if (sizeof(value_t) == 4) { + } else if (sizeof(value_t) == 4) { return MPI_UINT32_T; - } - else if (sizeof(value_t) == 8) { + } else if (sizeof(value_t) == 8) { return MPI_UINT64_T; - } - else { + } else { CUGRAPH_FAIL("unsupported type"); } } - } - else if(std::is_same::value) { + } else if (std::is_same::value) { return MPI_FLOAT; - } - else if(std::is_same::value) { + } else if (std::is_same::value) { return MPI_DOUBLE; - } - else { + } else { CUGRAPH_FAIL("unsupported type"); } } -template -constexpr ncclDataType_t get_nccl_type() { +template constexpr ncclDataType_t get_nccl_type() { if (std::is_integral::value) { if (std::is_signed::value) { if (sizeof(value_t) == 1) { return ncclInt8; - } - else if (sizeof(value_t) == 4) { + } else if (sizeof(value_t) == 4) { return ncclInt32; - } - else if (sizeof(value_t) == 8) { + } else if (sizeof(value_t) == 8) { return ncclInt64; - } - else { + } else { CUGRAPH_FAIL("unsupported type"); } - } - else { + } else { if (sizeof(value_t) == 1) { return ncclUint8; - } - else if (sizeof(value_t) == 4) { + } else if (sizeof(value_t) == 4) { return ncclUint32; - } - else if (sizeof(value_t) == 8) { + } else if (sizeof(value_t) == 8) { return ncclUint64; - } - else { + } else { CUGRAPH_FAIL("unsupported type"); } } - } - else if(std::is_same::value) { + } else if (std::is_same::value) { return ncclFloat32; - } - else if(std::is_same::value) { + } else if (std::is_same::value) { return ncclFloat64; - } - else { + } else { CUGRAPH_FAIL("unsupported type"); } } @@ -149,14 +126,11 @@ constexpr ncclDataType_t get_nccl_type() { constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) { if (reduce_op == ReduceOp::SUM) { return MPI_SUM; - } - else if (reduce_op == ReduceOp::MAX) { + } else if (reduce_op == ReduceOp::MAX) { return MPI_MAX; - } - else if (reduce_op == ReduceOp::MIN) { + } else if (reduce_op == ReduceOp::MIN) { return MPI_MIN; - } - else { + } else { CUGRAPH_FAIL("unsupported type"); } } @@ -164,14 +138,11 @@ constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) { constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) { if (reduce_op == ReduceOp::SUM) { return ncclSum; - } - else if (reduce_op == ReduceOp::MAX) { + } else if (reduce_op == ReduceOp::MAX) { return ncclMax; - } - else if (reduce_op == ReduceOp::MIN) { + } else if (reduce_op == ReduceOp::MIN) { return ncclMin; - } - else { + } else { CUGRAPH_FAIL("unsupported type"); } } @@ -195,25 +166,31 @@ Comm::Comm(int p) : _p{p} { MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &_rank)); MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size)); - CUGRAPH_EXPECTS( (_p == mpi_world_size), - "Invalid input arguments: p should match the number of MPI processes."); + CUGRAPH_EXPECTS( + (_p == mpi_world_size), + "Invalid input arguments: p should match the number of MPI processes."); _mpi_comm = MPI_COMM_WORLD; // CUDA CUDA_TRY(cudaGetDeviceCount(&_device_count)); - _device_id = _rank % _device_count; // FixMe : assumes each node has the same number of GPUs + _device_id = + _rank % + _device_count; // FixMe : assumes each node has the same number of GPUs CUDA_TRY(cudaSetDevice(_device_id)); - CUDA_TRY( - cudaDeviceGetAttribute(&_sm_count_per_device, cudaDevAttrMultiProcessorCount, _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, _device_id)); - CUDA_TRY( - cudaDeviceGetAttribute( - &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_sm_count_per_device, + cudaDevAttrMultiProcessorCount, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, + _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, + _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, + _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_shared_memory_size_per_sm, + cudaDevAttrMaxSharedMemoryPerMultiprocessor, + _device_id)); // NCCL @@ -221,30 +198,37 @@ Comm::Comm(int p) : _p{p} { if (get_rank() == 0) { NCCL_TRY(ncclGetUniqueId(&nccl_unique_id_p)); } - MPI_TRY(MPI_Bcast(&nccl_unique_id_p, sizeof(ncclUniqueId), MPI_BYTE, 0, _mpi_comm)); - NCCL_TRY(ncclCommInitRank(&_nccl_comm, get_p(), nccl_unique_id_p, get_rank())); + MPI_TRY(MPI_Bcast(&nccl_unique_id_p, sizeof(ncclUniqueId), MPI_BYTE, 0, + _mpi_comm)); + NCCL_TRY( + ncclCommInitRank(&_nccl_comm, get_p(), nccl_unique_id_p, get_rank())); _finalize_nccl = true; #endif - } #if ENABLE_OPG Comm::Comm(ncclComm_t comm, int size, int rank) - : _nccl_comm(comm), _p(size), _rank(rank) { + : _nccl_comm(comm), _p(size), _rank(rank) { // CUDA CUDA_TRY(cudaGetDeviceCount(&_device_count)); - _device_id = _rank % _device_count; // FixMe : assumes each node has the same number of GPUs - CUDA_TRY(cudaSetDevice(_device_id)); // FixMe : check if this is needed or if python takes care of this + _device_id = + _rank % + _device_count; // FixMe : assumes each node has the same number of GPUs + CUDA_TRY(cudaSetDevice(_device_id)); // FixMe : check if this is needed or if + // python takes care of this - CUDA_TRY( - cudaDeviceGetAttribute(&_sm_count_per_device, cudaDevAttrMultiProcessorCount, _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, _device_id)); - CUDA_TRY( - cudaDeviceGetAttribute( - &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_sm_count_per_device, + cudaDevAttrMultiProcessorCount, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, + _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, + _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, + _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_shared_memory_size_per_sm, + cudaDevAttrMaxSharedMemoryPerMultiprocessor, + _device_id)); } #endif @@ -267,25 +251,38 @@ void Comm::barrier() { } template -void Comm::allgather (size_t size, value_t* sendbuff, value_t* recvbuff) const { +void Comm::allgather(size_t size, value_t *sendbuff, value_t *recvbuff) const { #if ENABLE_OPG - NCCL_TRY(ncclAllGather((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type(), _nccl_comm, cudaStreamDefault)); + NCCL_TRY(ncclAllGather((const void *)sendbuff, (void *)recvbuff, size, + get_nccl_type(), _nccl_comm, + cudaStreamDefault)); #endif } template -void Comm::allreduce (size_t size, value_t* sendbuff, value_t* recvbuff, ReduceOp reduce_op) const { +void Comm::allreduce(size_t size, value_t *sendbuff, value_t *recvbuff, + ReduceOp reduce_op) const { #if ENABLE_OPG - NCCL_TRY(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, get_nccl_type(), get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault)); + NCCL_TRY(ncclAllReduce( + (const void *)sendbuff, (void *)recvbuff, size, get_nccl_type(), + get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault)); #endif } -//explicit -template void Comm::allgather(size_t size, int* sendbuff, int* recvbuff) const; -template void Comm::allgather(size_t size, float* sendbuff, float* recvbuff) const; -template void Comm::allgather(size_t size, double* sendbuff, double* recvbuff) const; -template void Comm::allreduce(size_t size, int* sendbuff, int* recvbuff, ReduceOp reduce_op) const; -template void Comm::allreduce(size_t size, float* sendbuff, float* recvbuff, ReduceOp reduce_op) const; -template void Comm::allreduce(size_t size, double* sendbuff, double* recvbuff, ReduceOp reduce_op) const; - -} }//namespace +// explicit +template void Comm::allgather(size_t size, int *sendbuff, + int *recvbuff) const; +template void Comm::allgather(size_t size, float *sendbuff, + float *recvbuff) const; +template void Comm::allgather(size_t size, double *sendbuff, + double *recvbuff) const; +template void Comm::allreduce(size_t size, int *sendbuff, int *recvbuff, + ReduceOp reduce_op) const; +template void Comm::allreduce(size_t size, float *sendbuff, + float *recvbuff, ReduceOp reduce_op) const; +template void Comm::allreduce(size_t size, double *sendbuff, + double *recvbuff, + ReduceOp reduce_op) const; + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu index 26a67275d19..9c5b0e1a77e 100644 --- a/cpp/src/structure/graph.cu +++ b/cpp/src/structure/graph.cu @@ -1,4 +1,4 @@ - /* +/* * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. * * NVIDIA CORPORATION and its licensors retain all intellectual property @@ -9,66 +9,63 @@ * */ -#include -#include "utilities/graph_utils.cuh" -#include "utilities/error_utils.h" #include "utilities/cuda_utils.cuh" +#include "utilities/error_utils.h" +#include "utilities/graph_utils.cuh" - +#include namespace { template -void degree_from_offsets(vertex_t number_of_vertices, - edge_t const *offsets, - edge_t *degree, - cudaStream_t stream) { +void degree_from_offsets(vertex_t number_of_vertices, edge_t const *offsets, + edge_t *degree, cudaStream_t stream) { // Computes out-degree for x = 0 and x = 2 thrust::for_each(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(number_of_vertices), - [offsets, degree] __device__ (vertex_t v) { - degree[v] = offsets[v+1]-offsets[v]; + [offsets, degree] __device__(vertex_t v) { + degree[v] = offsets[v + 1] - offsets[v]; }); } template -void degree_from_vertex_ids(const cugraph::experimental::Comm& comm, - vertex_t number_of_vertices, - edge_t number_of_edges, - vertex_t const *indices, - edge_t *degree, +void degree_from_vertex_ids(const cugraph::experimental::Comm &comm, + vertex_t number_of_vertices, edge_t number_of_edges, + vertex_t const *indices, edge_t *degree, cudaStream_t stream) { thrust::for_each(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(number_of_edges), - [indices, degree] __device__ (edge_t e) { + [indices, degree] __device__(edge_t e) { cugraph::atomicAdd(degree + indices[e], 1); }); - comm.allreduce(number_of_vertices, degree, degree, cugraph::experimental::ReduceOp::SUM); + comm.allreduce(number_of_vertices, degree, degree, + cugraph::experimental::ReduceOp::SUM); } -} //namespace anonymous +} // namespace namespace cugraph { namespace experimental { - template -void GraphBase::get_vertex_identifiers(VT *identifiers) const { +void GraphBase::get_vertex_identifiers(VT *identifiers) const { cugraph::detail::sequence(number_of_vertices, identifiers); } template -void GraphCompressedSparseBase::get_source_indices(VT *src_indices) const { - CUGRAPH_EXPECTS( offsets != nullptr , "No graph specified"); - cugraph::detail::offsets_to_indices(offsets, GraphBase::number_of_vertices, src_indices); +void GraphCompressedSparseBase::get_source_indices( + VT *src_indices) const { + CUGRAPH_EXPECTS(offsets != nullptr, "No graph specified"); + cugraph::detail::offsets_to_indices( + offsets, GraphBase::number_of_vertices, src_indices); } template -void GraphCOO::degree(ET *degree, DegreeDirection direction) const { +void GraphCOO::degree(ET *degree, DegreeDirection direction) const { // // NOTE: We assume offsets/indices are a CSR. If a CSC is passed // in then x should be modified to reflect the expected direction. @@ -78,18 +75,24 @@ void GraphCOO::degree(ET *degree, DegreeDirection direction) const { cudaStream_t stream{nullptr}; if (direction != DegreeDirection::IN) { - if (GraphBase::comm.get_p()) // FixMe retrieve global source indexing for the allreduce work + if (GraphBase::comm.get_p()) // FixMe retrieve global source + // indexing for the allreduce work CUGRAPH_FAIL("OPG degree not implemented for OUT degree"); - degree_from_vertex_ids(GraphBase::comm, GraphBase::number_of_vertices, GraphBase::number_of_edges, src_indices, degree, stream); + degree_from_vertex_ids( + GraphBase::comm, GraphBase::number_of_vertices, + GraphBase::number_of_edges, src_indices, degree, stream); } if (direction != DegreeDirection::OUT) { - degree_from_vertex_ids(GraphBase::comm, GraphBase::number_of_vertices, GraphBase::number_of_edges, dst_indices, degree, stream); + degree_from_vertex_ids( + GraphBase::comm, GraphBase::number_of_vertices, + GraphBase::number_of_edges, dst_indices, degree, stream); } } template -void GraphCompressedSparseBase::degree(ET *degree, DegreeDirection direction) const { +void GraphCompressedSparseBase::degree( + ET *degree, DegreeDirection direction) const { // // NOTE: We assume offsets/indices are a CSR. If a CSC is passed // in then x should be modified to reflect the expected direction. @@ -99,22 +102,28 @@ void GraphCompressedSparseBase::degree(ET *degree, DegreeDirection dir cudaStream_t stream{nullptr}; if (direction != DegreeDirection::IN) { - if (GraphBase::comm.get_p()) - CUGRAPH_FAIL("OPG degree not implemented for OUT degree"); // FixMe retrieve global source indexing for the allreduce to work - degree_from_offsets(GraphBase::number_of_vertices, offsets, degree, stream); + if (GraphBase::comm.get_p()) + CUGRAPH_FAIL( + "OPG degree not implemented for OUT degree"); // FixMe retrieve global + // source indexing for + // the allreduce to work + degree_from_offsets(GraphBase::number_of_vertices, offsets, + degree, stream); } if (direction != DegreeDirection::OUT) { - degree_from_vertex_ids(GraphBase::comm, GraphBase::number_of_vertices, GraphBase::number_of_edges, indices, degree, stream); + degree_from_vertex_ids( + GraphBase::comm, GraphBase::number_of_vertices, + GraphBase::number_of_edges, indices, degree, stream); } } // explicit instantiation template class GraphBase; template class GraphBase; -template class GraphCOO; -template class GraphCOO; -template class GraphCompressedSparseBase; -template class GraphCompressedSparseBase; -} -} +template class GraphCOO; +template class GraphCOO; +template class GraphCompressedSparseBase; +template class GraphCompressedSparseBase; +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/utilities/cuda_utils.cuh b/cpp/src/utilities/cuda_utils.cuh index fe581af914d..9dbf4568c97 100644 --- a/cpp/src/utilities/cuda_utils.cuh +++ b/cpp/src/utilities/cuda_utils.cuh @@ -15,50 +15,54 @@ */ #pragma once +#include + namespace cugraph { // // This should go into RAFT... // -__device__ static __forceinline__ int64_t atomicMin(int64_t* addr, int64_t val) { - unsigned long long *addr_as_ull{reinterpret_cast(addr)}; - unsigned long long *val_addr_as_ull{reinterpret_cast(&val)}; - unsigned long long old = *addr_as_ull; - unsigned long long val_as_ull = *val_addr_as_ull; - int64_t *p_old{reinterpret_cast(&old)}; - unsigned long long expected; +__device__ static __forceinline__ int64_t atomicMin(int64_t *addr, + int64_t val) { + unsigned long long *addr_as_ull{reinterpret_cast(addr)}; + unsigned long long *val_addr_as_ull{ + reinterpret_cast(&val)}; + unsigned long long old = *addr_as_ull; + unsigned long long val_as_ull = *val_addr_as_ull; + int64_t *p_old{reinterpret_cast(&old)}; + unsigned long long expected; do { - expected = old; - old = ::atomicCAS(addr_as_ull, - expected, - thrust::min(val_as_ull, expected)); - } while (expected != old); + expected = old; + old = ::atomicCAS(addr_as_ull, expected, thrust::min(val_as_ull, expected)); + } while (expected != old); return *p_old; } -__device__ static __forceinline__ int32_t atomicMin(int32_t* addr, int32_t val) { +__device__ static __forceinline__ int32_t atomicMin(int32_t *addr, + int32_t val) { return ::atomicMin(addr, val); } -__device__ static __forceinline__ int64_t atomicAdd(int64_t* addr, int64_t val) { - unsigned long long *addr_as_ull{reinterpret_cast(addr)}; - unsigned long long *val_addr_as_ull{reinterpret_cast(&val)}; - unsigned long long old = *addr_as_ull; - unsigned long long val_as_ull = *val_addr_as_ull; - int64_t *p_old{reinterpret_cast(&old)}; - unsigned long long expected; +__device__ static __forceinline__ int64_t atomicAdd(int64_t *addr, + int64_t val) { + unsigned long long *addr_as_ull{reinterpret_cast(addr)}; + unsigned long long *val_addr_as_ull{ + reinterpret_cast(&val)}; + unsigned long long old = *addr_as_ull; + unsigned long long val_as_ull = *val_addr_as_ull; + int64_t *p_old{reinterpret_cast(&old)}; + unsigned long long expected; do { - expected = old; - old = ::atomicCAS(addr_as_ull, - expected, - (expected + val_as_ull)); - } while (expected != old); + expected = old; + old = ::atomicCAS(addr_as_ull, expected, (expected + val_as_ull)); + } while (expected != old); return *p_old; } -__device__ static __forceinline__ int32_t atomicAdd(int32_t* addr, int32_t val) { +__device__ static __forceinline__ int32_t atomicAdd(int32_t *addr, + int32_t val) { return ::atomicAdd(addr, val); } -} //namespace cugraph +} // namespace cugraph diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu index 828ccbcb94b..5041cf94528 100644 --- a/cpp/tests/nccl/degree_test.cu +++ b/cpp/tests/nccl/degree_test.cu @@ -1,16 +1,15 @@ +#include "test_utils.h" #include "gtest/gtest.h" +#include #include -#include "test_utils.h" +#include #include #include #include -#include -#include // ref Degree on the host -template -void ref_degree_h(std::vector & ind_h, - std::vector & degree) { +template +void ref_degree_h(std::vector &ind_h, std::vector °ree) { for (size_t i = 0; i < degree.size(); i++) degree[i] = 0; for (size_t i = 0; i < ind_h.size(); i++) @@ -18,8 +17,7 @@ void ref_degree_h(std::vector & ind_h, } // global to local offsets by shifting all offsets by the first offset value -template -void shift_by_front(std::vector & v) { +template void shift_by_front(std::vector &v) { auto start = v.front(); for (auto i = size_t{0}; i < v.size(); ++i) v[i] -= start; @@ -27,16 +25,17 @@ void shift_by_front(std::vector & v) { // 1D partitioning such as each GPU has about the same number of edges template -void opg_edge_partioning(int r, int p, std::vector & ind_h, std::vector & part_offset, size_t & e_loc) { +void opg_edge_partioning(int r, int p, std::vector &ind_h, + std::vector &part_offset, size_t &e_loc) { - //set first and last partition offsets + // set first and last partition offsets part_offset[0] = 0; part_offset[p] = ind_h.size(); - //part_offset[p] = *(std::max_element(ind_h.begin(), ind_h.end())); - auto loc_nnz = ind_h.size()/p; - for (int i=1; i= start_nnz) { @@ -46,20 +45,17 @@ void opg_edge_partioning(int r, int p, std::vector & ind_h, std::vector src_h= {0, 0, 2, 2, 2, 3, 3, 4, 4, 5, 5}, - dest_h={1, 2, 0, 1, 4, 4, 5, 3, 5, 3, 1}; + // host + std::vector src_h = {0, 0, 2, 2, 2, 3, 3, 4, 4, 5, 5}, + dest_h = {1, 2, 0, 1, 4, 4, 5, 3, 5, 3, 1}; std::vector degree_h(v, 0.0), degree_ref(v, 0.0); - - - //MG + // MG int p; MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &p)); cugraph::experimental::Comm comm(p); @@ -68,49 +64,51 @@ TEST(degree, success) size_t e_loc; opg_edge_partioning(i, p, src_h, part_offset, e_loc); - #ifdef OPG_VERBOSE +#ifdef OPG_VERBOSE sleep(i); for (auto j = part_offset.begin(); j != part_offset.end(); ++j) - std::cout << *j << ' '; + std::cout << *j << ' '; std::cout << std::endl; - std::cout<< "eloc: "<< e_loc < src_loc_h(src_h.begin()+part_offset[i], src_h.begin()+part_offset[i]+e_loc), - dest_loc_h(dest_h.begin()+part_offset[i], dest_h.begin()+part_offset[i]+e_loc); + std::cout << "eloc: " << e_loc << std::endl; +#endif + std::vector src_loc_h(src_h.begin() + part_offset[i], + src_h.begin() + part_offset[i] + e_loc), + dest_loc_h(dest_h.begin() + part_offset[i], + dest_h.begin() + part_offset[i] + e_loc); shift_by_front(src_loc_h); - // print mg info - printf("# Rank %2d - Pid %6d - device %2d\n", comm.get_rank(), getpid(), comm.get_dev()); + printf("# Rank %2d - Pid %6d - device %2d\n", comm.get_rank(), getpid(), + comm.get_dev()); - //local device + // local device thrust::device_vector src_d(src_loc_h.begin(), src_loc_h.end()); thrust::device_vector dest_d(dest_loc_h.begin(), dest_loc_h.end()); thrust::device_vector degree_d(v); // load local chunck to cugraph - cugraph::experimental::GraphCOO G(thrust::raw_pointer_cast(src_d.data()), - thrust::raw_pointer_cast(dest_d.data()), - nullptr, degree_h.size(), e_loc); + cugraph::experimental::GraphCOO G( + thrust::raw_pointer_cast(src_d.data()), + thrust::raw_pointer_cast(dest_d.data()), nullptr, degree_h.size(), e_loc); G.set_communicator(comm); // OUT degree - G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::IN); + G.degree(thrust::raw_pointer_cast(degree_d.data()), + cugraph::experimental::DegreeDirection::IN); thrust::copy(degree_d.begin(), degree_d.end(), degree_h.begin()); ref_degree_h(dest_h, degree_ref); - //sleep(i); + // sleep(i); for (size_t j = 0; j < degree_h.size(); ++j) EXPECT_EQ(degree_ref[j], degree_h[j]); - std::cout<< "Rank "<< i << " done checking." < Date: Tue, 5 May 2020 15:38:44 -0500 Subject: [PATCH 17/19] fixmes and copyright --- cpp/include/comms_mpi.hpp | 2 +- cpp/src/comms/mpi/comms_mpi.cpp | 10 +++++----- cpp/src/structure/graph.cu | 21 +++++++++++++-------- cpp/tests/nccl/degree_test.cu | 17 +++++++++++++++++ 4 files changed, 36 insertions(+), 14 deletions(-) diff --git a/cpp/include/comms_mpi.hpp b/cpp/include/comms_mpi.hpp index 68fbf4f27cc..dd32041f2f4 100644 --- a/cpp/include/comms_mpi.hpp +++ b/cpp/include/comms_mpi.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp index 2d761d2e2dc..f02d207789b 100644 --- a/cpp/src/comms/mpi/comms_mpi.cpp +++ b/cpp/src/comms/mpi/comms_mpi.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,7 +46,7 @@ inline void throw_nccl_error(ncclResult_t error, const char *file, } \ } // MPI errors are expected to be fatal before reaching this. -// Fix me : improve when adding raft comms +// FIXME : improve when adding raft comms #define MPI_TRY(cmd) \ { \ int e = cmd; \ @@ -177,7 +177,7 @@ Comm::Comm(int p) : _p{p} { CUDA_TRY(cudaGetDeviceCount(&_device_count)); _device_id = _rank % - _device_count; // FixMe : assumes each node has the same number of GPUs + _device_count; // FIXME : assumes each node has the same number of GPUs CUDA_TRY(cudaSetDevice(_device_id)); CUDA_TRY(cudaDeviceGetAttribute(&_sm_count_per_device, @@ -214,8 +214,8 @@ Comm::Comm(ncclComm_t comm, int size, int rank) CUDA_TRY(cudaGetDeviceCount(&_device_count)); _device_id = _rank % - _device_count; // FixMe : assumes each node has the same number of GPUs - CUDA_TRY(cudaSetDevice(_device_id)); // FixMe : check if this is needed or if + _device_count; // FIXME : assumes each node has the same number of GPUs + CUDA_TRY(cudaSetDevice(_device_id)); // FIXME : check if this is needed or if // python takes care of this CUDA_TRY(cudaDeviceGetAttribute(&_sm_count_per_device, diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu index 9c5b0e1a77e..fb3e8e23b89 100644 --- a/cpp/src/structure/graph.cu +++ b/cpp/src/structure/graph.cu @@ -1,12 +1,17 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2020, NVIDIA CORPORATION. * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "utilities/cuda_utils.cuh" @@ -75,7 +80,7 @@ void GraphCOO::degree(ET *degree, DegreeDirection direction) const { cudaStream_t stream{nullptr}; if (direction != DegreeDirection::IN) { - if (GraphBase::comm.get_p()) // FixMe retrieve global source + if (GraphBase::comm.get_p()) // FIXME retrieve global source // indexing for the allreduce work CUGRAPH_FAIL("OPG degree not implemented for OUT degree"); degree_from_vertex_ids( @@ -104,7 +109,7 @@ void GraphCompressedSparseBase::degree( if (direction != DegreeDirection::IN) { if (GraphBase::comm.get_p()) CUGRAPH_FAIL( - "OPG degree not implemented for OUT degree"); // FixMe retrieve global + "OPG degree not implemented for OUT degree"); // FIXME retrieve global // source indexing for // the allreduce to work degree_from_offsets(GraphBase::number_of_vertices, offsets, diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu index 5041cf94528..619fcad7c62 100644 --- a/cpp/tests/nccl/degree_test.cu +++ b/cpp/tests/nccl/degree_test.cu @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include "test_utils.h" #include "gtest/gtest.h" #include @@ -11,6 +27,7 @@ template void ref_degree_h(std::vector &ind_h, std::vector °ree) { for (size_t i = 0; i < degree.size(); i++) + degree[i] = 0; for (size_t i = 0; i < ind_h.size(); i++) degree[ind_h[i]] += 1; From 01ba016716e579e9c57511f33c33509e9c817514 Mon Sep 17 00:00:00 2001 From: afender Date: Tue, 5 May 2020 16:01:23 -0500 Subject: [PATCH 18/19] clang2 --- cpp/include/comms_mpi.hpp | 11 +- cpp/include/graph.hpp | 95 +++++++++-------- cpp/src/comms/mpi/comms_mpi.cpp | 183 +++++++++++++++----------------- cpp/src/structure/graph.cu | 106 +++++++++--------- cpp/tests/nccl/degree_test.cu | 64 ++++++----- 5 files changed, 231 insertions(+), 228 deletions(-) diff --git a/cpp/include/comms_mpi.hpp b/cpp/include/comms_mpi.hpp index dd32041f2f4..c414a043efa 100644 --- a/cpp/include/comms_mpi.hpp +++ b/cpp/include/comms_mpi.hpp @@ -27,7 +27,7 @@ enum class ReduceOp { SUM, MAX, MIN }; // basic info about the snmg env setup class Comm { -private: + private: int _p{0}; int _rank{0}; bool _finalize_mpi{false}; @@ -47,7 +47,7 @@ class Comm { ncclComm_t _nccl_comm{}; #endif -public: + public: Comm(){}; Comm(int p); #if ENABLE_OPG @@ -67,9 +67,8 @@ class Comm { void allgather(size_t size, value_t *sendbuff, value_t *recvbuff) const; template - void allreduce(size_t size, value_t *sendbuff, value_t *recvbuff, - ReduceOp reduce_op) const; + void allreduce(size_t size, value_t *sendbuff, value_t *recvbuff, ReduceOp reduce_op) const; }; -} // namespace experimental -} // namespace cugraph +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp index 1c11f17311a..33dd081361f 100644 --- a/cpp/include/graph.hpp +++ b/cpp/include/graph.hpp @@ -31,9 +31,9 @@ struct GraphProperties { }; enum class DegreeDirection { - IN_PLUS_OUT = 0, ///> Compute sum of in and out degree - IN, ///> Compute in degree - OUT, ///> Compute out degree + IN_PLUS_OUT = 0, ///> Compute sum of in and out degree + IN, ///> Compute in degree + OUT, ///> Compute out degree DEGREE_DIRECTION_COUNT }; @@ -44,10 +44,11 @@ enum class DegreeDirection { * @tparam ET Type of edge id * @tparam WT Type of weight */ -template class GraphBase { -public: +template +class GraphBase { + public: Comm comm; - WT *edge_data; ///< edge weight + WT *edge_data; ///< edge weight GraphProperties prop; VT number_of_vertices; @@ -64,9 +65,13 @@ template class GraphBase { void set_communicator(Comm &comm_) { comm = comm_; } GraphBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) - : edge_data(edge_data_), comm(), prop(), - number_of_vertices(number_of_vertices_), - number_of_edges(number_of_edges_) {} + : edge_data(edge_data_), + comm(), + prop(), + number_of_vertices(number_of_vertices_), + number_of_edges(number_of_edges_) + { + } }; /** @@ -78,9 +83,9 @@ template class GraphBase { */ template class GraphCOO : public GraphBase { -public: - VT *src_indices{nullptr}; ///< rowInd - VT *dst_indices{nullptr}; ///< colInd + public: + VT *src_indices{nullptr}; ///< rowInd + VT *dst_indices{nullptr}; ///< colInd /** * @brief Computes degree(in, out, in+out) of all the nodes of a Graph @@ -117,11 +122,13 @@ class GraphCOO : public GraphBase { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCOO(VT *src_indices_, VT *dst_indices_, WT *edge_data_, - VT number_of_vertices_, ET number_of_edges_) - : GraphBase(edge_data_, number_of_vertices_, - number_of_edges_), - src_indices(src_indices_), dst_indices(dst_indices_) {} + GraphCOO( + VT *src_indices_, VT *dst_indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + : GraphBase(edge_data_, number_of_vertices_, number_of_edges_), + src_indices(src_indices_), + dst_indices(dst_indices_) + { + } }; /** @@ -134,9 +141,9 @@ class GraphCOO : public GraphBase { */ template class GraphCompressedSparseBase : public GraphBase { -public: - ET *offsets{nullptr}; ///< CSR offsets - VT *indices{nullptr}; ///< CSR indices + public: + ET *offsets{nullptr}; ///< CSR offsets + VT *indices{nullptr}; ///< CSR indices /** * @brief Fill the identifiers in the array with the source vertex @@ -176,11 +183,13 @@ class GraphCompressedSparseBase : public GraphBase { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCompressedSparseBase(ET *offsets_, VT *indices_, WT *edge_data_, - VT number_of_vertices_, ET number_of_edges_) - : GraphBase(edge_data_, number_of_vertices_, - number_of_edges_), - offsets{offsets_}, indices{indices_} {} + GraphCompressedSparseBase( + ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + : GraphBase(edge_data_, number_of_vertices_, number_of_edges_), + offsets{offsets_}, + indices{indices_} + { + } }; /** @@ -192,13 +201,11 @@ class GraphCompressedSparseBase : public GraphBase { */ template class GraphCSR : public GraphCompressedSparseBase { -public: + public: /** * @brief Default constructor */ - GraphCSR() - : GraphCompressedSparseBase(nullptr, nullptr, nullptr, 0, 0) { - } + GraphCSR() : GraphCompressedSparseBase(nullptr, nullptr, nullptr, 0, 0) {} /** * @brief Wrap existing arrays representing adjacency lists in a Graph. @@ -216,11 +223,11 @@ class GraphCSR : public GraphCompressedSparseBase { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCSR(ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, - ET number_of_edges_) - : GraphCompressedSparseBase(offsets_, indices_, edge_data_, - number_of_vertices_, - number_of_edges_) {} + GraphCSR(ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + : GraphCompressedSparseBase( + offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) + { + } }; /** @@ -232,13 +239,11 @@ class GraphCSR : public GraphCompressedSparseBase { */ template class GraphCSC : public GraphCompressedSparseBase { -public: + public: /** * @brief Default constructor */ - GraphCSC() - : GraphCompressedSparseBase(nullptr, nullptr, nullptr, 0, 0) { - } + GraphCSC() : GraphCompressedSparseBase(nullptr, nullptr, nullptr, 0, 0) {} /** * @brief Wrap existing arrays representing transposed adjacency lists in @@ -256,12 +261,12 @@ class GraphCSC : public GraphCompressedSparseBase { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCSC(ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, - ET number_of_edges_) - : GraphCompressedSparseBase(offsets_, indices_, edge_data_, - number_of_vertices_, - number_of_edges_) {} + GraphCSC(ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + : GraphCompressedSparseBase( + offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) + { + } }; -} // namespace experimental -} // namespace cugraph +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp index f02d207789b..f473c0a1939 100644 --- a/cpp/src/comms/mpi/comms_mpi.cpp +++ b/cpp/src/comms/mpi/comms_mpi.cpp @@ -14,10 +14,10 @@ * limitations under the License. */ -#include "utilities/error_utils.h" #include #include #include +#include "utilities/error_utils.h" namespace cugraph { namespace experimental { @@ -31,31 +31,28 @@ struct nccl_error : public std::runtime_error { nccl_error(std::string const &message) : std::runtime_error(message) {} }; -inline void throw_nccl_error(ncclResult_t error, const char *file, - unsigned int line) { - throw nccl_error( - std::string{"NCCL error encountered at: " + std::string{file} + ":" + - std::to_string(line) + ": " + ncclGetErrorString(error)}); +inline void throw_nccl_error(ncclResult_t error, const char *file, unsigned int line) +{ + throw nccl_error(std::string{"NCCL error encountered at: " + std::string{file} + ":" + + std::to_string(line) + ": " + ncclGetErrorString(error)}); } -#define NCCL_TRY(call) \ - { \ - ncclResult_t nccl_status = (call); \ - if (nccl_status != ncclSuccess) { \ - throw_nccl_error(nccl_status, __FILE__, __LINE__); \ - } \ +#define NCCL_TRY(call) \ + { \ + ncclResult_t nccl_status = (call); \ + if (nccl_status != ncclSuccess) { throw_nccl_error(nccl_status, __FILE__, __LINE__); } \ } // MPI errors are expected to be fatal before reaching this. // FIXME : improve when adding raft comms -#define MPI_TRY(cmd) \ - { \ - int e = cmd; \ - if (e != MPI_SUCCESS) { \ - CUGRAPH_FAIL("Failed: MPI error"); \ - } \ +#define MPI_TRY(cmd) \ + { \ + int e = cmd; \ + if (e != MPI_SUCCESS) { CUGRAPH_FAIL("Failed: MPI error"); } \ } -template constexpr MPI_Datatype get_mpi_type() { +template +constexpr MPI_Datatype get_mpi_type() +{ if (std::is_integral::value) { if (std::is_signed::value) { if (sizeof(value_t) == 1) { @@ -91,7 +88,9 @@ template constexpr MPI_Datatype get_mpi_type() { } } -template constexpr ncclDataType_t get_nccl_type() { +template +constexpr ncclDataType_t get_nccl_type() +{ if (std::is_integral::value) { if (std::is_signed::value) { if (sizeof(value_t) == 1) { @@ -123,7 +122,8 @@ template constexpr ncclDataType_t get_nccl_type() { } } -constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) { +constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) +{ if (reduce_op == ReduceOp::SUM) { return MPI_SUM; } else if (reduce_op == ReduceOp::MAX) { @@ -135,7 +135,8 @@ constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) { } } -constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) { +constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) +{ if (reduce_op == ReduceOp::SUM) { return ncclSum; } else if (reduce_op == ReduceOp::MAX) { @@ -148,7 +149,8 @@ constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) { } #endif -Comm::Comm(int p) : _p{p} { +Comm::Comm(int p) : _p{p} +{ #if ENABLE_OPG // MPI int flag{}, mpi_world_size; @@ -158,131 +160,120 @@ Comm::Comm(int p) : _p{p} { if (flag == false) { int provided{}; MPI_TRY(MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &provided)); - if (provided != MPI_THREAD_MULTIPLE) { - MPI_TRY(MPI_ERR_OTHER); - } + if (provided != MPI_THREAD_MULTIPLE) { MPI_TRY(MPI_ERR_OTHER); } _finalize_mpi = true; } MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &_rank)); MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size)); - CUGRAPH_EXPECTS( - (_p == mpi_world_size), - "Invalid input arguments: p should match the number of MPI processes."); + CUGRAPH_EXPECTS((_p == mpi_world_size), + "Invalid input arguments: p should match the number of MPI processes."); _mpi_comm = MPI_COMM_WORLD; // CUDA CUDA_TRY(cudaGetDeviceCount(&_device_count)); - _device_id = - _rank % - _device_count; // FIXME : assumes each node has the same number of GPUs + _device_id = _rank % _device_count; // FIXME : assumes each node has the same number of GPUs CUDA_TRY(cudaSetDevice(_device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_sm_count_per_device, - cudaDevAttrMultiProcessorCount, _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, - _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, - _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, - _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_shared_memory_size_per_sm, - cudaDevAttrMaxSharedMemoryPerMultiprocessor, - _device_id)); + CUDA_TRY( + cudaDeviceGetAttribute(&_sm_count_per_device, cudaDevAttrMultiProcessorCount, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute( + &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id)); // NCCL ncclUniqueId nccl_unique_id_p{}; - if (get_rank() == 0) { - NCCL_TRY(ncclGetUniqueId(&nccl_unique_id_p)); - } - MPI_TRY(MPI_Bcast(&nccl_unique_id_p, sizeof(ncclUniqueId), MPI_BYTE, 0, - _mpi_comm)); - NCCL_TRY( - ncclCommInitRank(&_nccl_comm, get_p(), nccl_unique_id_p, get_rank())); + if (get_rank() == 0) { NCCL_TRY(ncclGetUniqueId(&nccl_unique_id_p)); } + MPI_TRY(MPI_Bcast(&nccl_unique_id_p, sizeof(ncclUniqueId), MPI_BYTE, 0, _mpi_comm)); + NCCL_TRY(ncclCommInitRank(&_nccl_comm, get_p(), nccl_unique_id_p, get_rank())); _finalize_nccl = true; #endif } #if ENABLE_OPG -Comm::Comm(ncclComm_t comm, int size, int rank) - : _nccl_comm(comm), _p(size), _rank(rank) { - +Comm::Comm(ncclComm_t comm, int size, int rank) : _nccl_comm(comm), _p(size), _rank(rank) +{ // CUDA CUDA_TRY(cudaGetDeviceCount(&_device_count)); - _device_id = - _rank % - _device_count; // FIXME : assumes each node has the same number of GPUs - CUDA_TRY(cudaSetDevice(_device_id)); // FIXME : check if this is needed or if - // python takes care of this + _device_id = _rank % _device_count; // FIXME : assumes each node has the same number of GPUs + CUDA_TRY(cudaSetDevice(_device_id)); // FIXME : check if this is needed or if + // python takes care of this - CUDA_TRY(cudaDeviceGetAttribute(&_sm_count_per_device, - cudaDevAttrMultiProcessorCount, _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, - _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, - _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, - _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_shared_memory_size_per_sm, - cudaDevAttrMaxSharedMemoryPerMultiprocessor, - _device_id)); + CUDA_TRY( + cudaDeviceGetAttribute(&_sm_count_per_device, cudaDevAttrMultiProcessorCount, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, _device_id)); + CUDA_TRY(cudaDeviceGetAttribute( + &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id)); } #endif -Comm::~Comm() { +Comm::~Comm() +{ #if ENABLE_OPG // NCCL - if (_finalize_nccl) - ncclCommDestroy(_nccl_comm); + if (_finalize_nccl) ncclCommDestroy(_nccl_comm); - if (_finalize_mpi) { - MPI_Finalize(); - } + if (_finalize_mpi) { MPI_Finalize(); } #endif } -void Comm::barrier() { +void Comm::barrier() +{ #if ENABLE_OPG MPI_Barrier(MPI_COMM_WORLD); #endif } template -void Comm::allgather(size_t size, value_t *sendbuff, value_t *recvbuff) const { +void Comm::allgather(size_t size, value_t *sendbuff, value_t *recvbuff) const +{ #if ENABLE_OPG - NCCL_TRY(ncclAllGather((const void *)sendbuff, (void *)recvbuff, size, - get_nccl_type(), _nccl_comm, + NCCL_TRY(ncclAllGather((const void *)sendbuff, + (void *)recvbuff, + size, + get_nccl_type(), + _nccl_comm, cudaStreamDefault)); #endif } template -void Comm::allreduce(size_t size, value_t *sendbuff, value_t *recvbuff, - ReduceOp reduce_op) const { +void Comm::allreduce(size_t size, value_t *sendbuff, value_t *recvbuff, ReduceOp reduce_op) const +{ #if ENABLE_OPG - NCCL_TRY(ncclAllReduce( - (const void *)sendbuff, (void *)recvbuff, size, get_nccl_type(), - get_nccl_reduce_op(reduce_op), _nccl_comm, cudaStreamDefault)); + NCCL_TRY(ncclAllReduce((const void *)sendbuff, + (void *)recvbuff, + size, + get_nccl_type(), + get_nccl_reduce_op(reduce_op), + _nccl_comm, + cudaStreamDefault)); #endif } // explicit -template void Comm::allgather(size_t size, int *sendbuff, - int *recvbuff) const; -template void Comm::allgather(size_t size, float *sendbuff, - float *recvbuff) const; -template void Comm::allgather(size_t size, double *sendbuff, - double *recvbuff) const; -template void Comm::allreduce(size_t size, int *sendbuff, int *recvbuff, +template void Comm::allgather(size_t size, int *sendbuff, int *recvbuff) const; +template void Comm::allgather(size_t size, float *sendbuff, float *recvbuff) const; +template void Comm::allgather(size_t size, double *sendbuff, double *recvbuff) const; +template void Comm::allreduce(size_t size, + int *sendbuff, + int *recvbuff, ReduceOp reduce_op) const; -template void Comm::allreduce(size_t size, float *sendbuff, - float *recvbuff, ReduceOp reduce_op) const; -template void Comm::allreduce(size_t size, double *sendbuff, +template void Comm::allreduce(size_t size, + float *sendbuff, + float *recvbuff, + ReduceOp reduce_op) const; +template void Comm::allreduce(size_t size, + double *sendbuff, double *recvbuff, ReduceOp reduce_op) const; -} // namespace experimental -} // namespace cugraph +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu index fb3e8e23b89..510e58e9d0c 100644 --- a/cpp/src/structure/graph.cu +++ b/cpp/src/structure/graph.cu @@ -23,54 +23,57 @@ namespace { template -void degree_from_offsets(vertex_t number_of_vertices, edge_t const *offsets, - edge_t *degree, cudaStream_t stream) { - +void degree_from_offsets(vertex_t number_of_vertices, + edge_t const *offsets, + edge_t *degree, + cudaStream_t stream) +{ // Computes out-degree for x = 0 and x = 2 - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(number_of_vertices), - [offsets, degree] __device__(vertex_t v) { - degree[v] = offsets[v + 1] - offsets[v]; - }); + thrust::for_each( + rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(number_of_vertices), + [offsets, degree] __device__(vertex_t v) { degree[v] = offsets[v + 1] - offsets[v]; }); } template void degree_from_vertex_ids(const cugraph::experimental::Comm &comm, - vertex_t number_of_vertices, edge_t number_of_edges, - vertex_t const *indices, edge_t *degree, - cudaStream_t stream) { - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(number_of_edges), - [indices, degree] __device__(edge_t e) { - cugraph::atomicAdd(degree + indices[e], 1); - }); - comm.allreduce(number_of_vertices, degree, degree, - cugraph::experimental::ReduceOp::SUM); + vertex_t number_of_vertices, + edge_t number_of_edges, + vertex_t const *indices, + edge_t *degree, + cudaStream_t stream) +{ + thrust::for_each( + rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(number_of_edges), + [indices, degree] __device__(edge_t e) { cugraph::atomicAdd(degree + indices[e], 1); }); + comm.allreduce(number_of_vertices, degree, degree, cugraph::experimental::ReduceOp::SUM); } -} // namespace +} // namespace namespace cugraph { namespace experimental { template -void GraphBase::get_vertex_identifiers(VT *identifiers) const { +void GraphBase::get_vertex_identifiers(VT *identifiers) const +{ cugraph::detail::sequence(number_of_vertices, identifiers); } template -void GraphCompressedSparseBase::get_source_indices( - VT *src_indices) const { +void GraphCompressedSparseBase::get_source_indices(VT *src_indices) const +{ CUGRAPH_EXPECTS(offsets != nullptr, "No graph specified"); cugraph::detail::offsets_to_indices( - offsets, GraphBase::number_of_vertices, src_indices); + offsets, GraphBase::number_of_vertices, src_indices); } template -void GraphCOO::degree(ET *degree, DegreeDirection direction) const { +void GraphCOO::degree(ET *degree, DegreeDirection direction) const +{ // // NOTE: We assume offsets/indices are a CSR. If a CSC is passed // in then x should be modified to reflect the expected direction. @@ -80,24 +83,30 @@ void GraphCOO::degree(ET *degree, DegreeDirection direction) const { cudaStream_t stream{nullptr}; if (direction != DegreeDirection::IN) { - if (GraphBase::comm.get_p()) // FIXME retrieve global source - // indexing for the allreduce work + if (GraphBase::comm.get_p()) // FIXME retrieve global source + // indexing for the allreduce work CUGRAPH_FAIL("OPG degree not implemented for OUT degree"); - degree_from_vertex_ids( - GraphBase::comm, GraphBase::number_of_vertices, - GraphBase::number_of_edges, src_indices, degree, stream); + degree_from_vertex_ids(GraphBase::comm, + GraphBase::number_of_vertices, + GraphBase::number_of_edges, + src_indices, + degree, + stream); } if (direction != DegreeDirection::OUT) { - degree_from_vertex_ids( - GraphBase::comm, GraphBase::number_of_vertices, - GraphBase::number_of_edges, dst_indices, degree, stream); + degree_from_vertex_ids(GraphBase::comm, + GraphBase::number_of_vertices, + GraphBase::number_of_edges, + dst_indices, + degree, + stream); } } template -void GraphCompressedSparseBase::degree( - ET *degree, DegreeDirection direction) const { +void GraphCompressedSparseBase::degree(ET *degree, DegreeDirection direction) const +{ // // NOTE: We assume offsets/indices are a CSR. If a CSC is passed // in then x should be modified to reflect the expected direction. @@ -108,18 +117,19 @@ void GraphCompressedSparseBase::degree( if (direction != DegreeDirection::IN) { if (GraphBase::comm.get_p()) - CUGRAPH_FAIL( - "OPG degree not implemented for OUT degree"); // FIXME retrieve global - // source indexing for - // the allreduce to work - degree_from_offsets(GraphBase::number_of_vertices, offsets, - degree, stream); + CUGRAPH_FAIL("OPG degree not implemented for OUT degree"); // FIXME retrieve global + // source indexing for + // the allreduce to work + degree_from_offsets(GraphBase::number_of_vertices, offsets, degree, stream); } if (direction != DegreeDirection::OUT) { - degree_from_vertex_ids( - GraphBase::comm, GraphBase::number_of_vertices, - GraphBase::number_of_edges, indices, degree, stream); + degree_from_vertex_ids(GraphBase::comm, + GraphBase::number_of_vertices, + GraphBase::number_of_edges, + indices, + degree, + stream); } } @@ -130,5 +140,5 @@ template class GraphCOO; template class GraphCOO; template class GraphCompressedSparseBase; template class GraphCompressedSparseBase; -} // namespace experimental -} // namespace cugraph +} // namespace experimental +} // namespace cugraph diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu index 619fcad7c62..7683874939c 100644 --- a/cpp/tests/nccl/degree_test.cu +++ b/cpp/tests/nccl/degree_test.cu @@ -14,37 +14,36 @@ * limitations under the License. */ -#include "test_utils.h" -#include "gtest/gtest.h" -#include #include -#include #include #include #include +#include +#include +#include "gtest/gtest.h" +#include "test_utils.h" // ref Degree on the host template -void ref_degree_h(std::vector &ind_h, std::vector °ree) { - for (size_t i = 0; i < degree.size(); i++) - - degree[i] = 0; - for (size_t i = 0; i < ind_h.size(); i++) - degree[ind_h[i]] += 1; +void ref_degree_h(std::vector &ind_h, std::vector °ree) +{ + for (size_t i = 0; i < degree.size(); i++) degree[i] = 0; + for (size_t i = 0; i < ind_h.size(); i++) degree[ind_h[i]] += 1; } // global to local offsets by shifting all offsets by the first offset value -template void shift_by_front(std::vector &v) { +template +void shift_by_front(std::vector &v) +{ auto start = v.front(); - for (auto i = size_t{0}; i < v.size(); ++i) - v[i] -= start; + for (auto i = size_t{0}; i < v.size(); ++i) v[i] -= start; } // 1D partitioning such as each GPU has about the same number of edges template -void opg_edge_partioning(int r, int p, std::vector &ind_h, - std::vector &part_offset, size_t &e_loc) { - +void opg_edge_partioning( + int r, int p, std::vector &ind_h, std::vector &part_offset, size_t &e_loc) +{ // set first and last partition offsets part_offset[0] = 0; part_offset[p] = ind_h.size(); @@ -53,7 +52,7 @@ void opg_edge_partioning(int r, int p, std::vector &ind_h, for (int i = 1; i < p; i++) { // get the first vertex ID of each partition auto start_nnz = i * loc_nnz; - auto start_v = 0; + auto start_v = 0; for (auto j = size_t{0}; j < ind_h.size(); ++j) { if (j >= start_nnz) { start_v = j; @@ -64,11 +63,12 @@ void opg_edge_partioning(int r, int p, std::vector &ind_h, } e_loc = part_offset[r + 1] - part_offset[r]; } -TEST(degree, success) { +TEST(degree, success) +{ int v = 6; // host - std::vector src_h = {0, 0, 2, 2, 2, 3, 3, 4, 4, 5, 5}, + std::vector src_h = {0, 0, 2, 2, 2, 3, 3, 4, 4, 5, 5}, dest_h = {1, 2, 0, 1, 4, 4, 5, 3, 5, 3, 1}; std::vector degree_h(v, 0.0), degree_ref(v, 0.0); @@ -83,20 +83,17 @@ TEST(degree, success) { opg_edge_partioning(i, p, src_h, part_offset, e_loc); #ifdef OPG_VERBOSE sleep(i); - for (auto j = part_offset.begin(); j != part_offset.end(); ++j) - std::cout << *j << ' '; + for (auto j = part_offset.begin(); j != part_offset.end(); ++j) std::cout << *j << ' '; std::cout << std::endl; std::cout << "eloc: " << e_loc << std::endl; #endif std::vector src_loc_h(src_h.begin() + part_offset[i], src_h.begin() + part_offset[i] + e_loc), - dest_loc_h(dest_h.begin() + part_offset[i], - dest_h.begin() + part_offset[i] + e_loc); + dest_loc_h(dest_h.begin() + part_offset[i], dest_h.begin() + part_offset[i] + e_loc); shift_by_front(src_loc_h); // print mg info - printf("# Rank %2d - Pid %6d - device %2d\n", comm.get_rank(), getpid(), - comm.get_dev()); + printf("# Rank %2d - Pid %6d - device %2d\n", comm.get_rank(), getpid(), comm.get_dev()); // local device thrust::device_vector src_d(src_loc_h.begin(), src_loc_h.end()); @@ -104,23 +101,24 @@ TEST(degree, success) { thrust::device_vector degree_d(v); // load local chunck to cugraph - cugraph::experimental::GraphCOO G( - thrust::raw_pointer_cast(src_d.data()), - thrust::raw_pointer_cast(dest_d.data()), nullptr, degree_h.size(), e_loc); + cugraph::experimental::GraphCOO G(thrust::raw_pointer_cast(src_d.data()), + thrust::raw_pointer_cast(dest_d.data()), + nullptr, + degree_h.size(), + e_loc); G.set_communicator(comm); // OUT degree - G.degree(thrust::raw_pointer_cast(degree_d.data()), - cugraph::experimental::DegreeDirection::IN); + G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::IN); thrust::copy(degree_d.begin(), degree_d.end(), degree_h.begin()); ref_degree_h(dest_h, degree_ref); // sleep(i); - for (size_t j = 0; j < degree_h.size(); ++j) - EXPECT_EQ(degree_ref[j], degree_h[j]); + for (size_t j = 0; j < degree_h.size(); ++j) EXPECT_EQ(degree_ref[j], degree_h[j]); std::cout << "Rank " << i << " done checking." << std::endl; } -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ testing::InitGoogleTest(&argc, argv); MPI_Init(&argc, &argv); rmmInitialize(nullptr); From 43096887507ba16d9143207007579ee2b1b04339 Mon Sep 17 00:00:00 2001 From: afender Date: Tue, 5 May 2020 16:29:28 -0500 Subject: [PATCH 19/19] fix for header issue showing up on CI --- cpp/include/comms_mpi.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/comms_mpi.hpp b/cpp/include/comms_mpi.hpp index c414a043efa..7a17bdfea4c 100644 --- a/cpp/include/comms_mpi.hpp +++ b/cpp/include/comms_mpi.hpp @@ -19,7 +19,7 @@ #include #include #endif - +#include namespace cugraph { namespace experimental {