From fd38dbbba56120099bc063634b5d89e7932dee4e Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet"
Date: Wed, 6 May 2020 20:30:50 -0400
Subject: [PATCH 001/189] Initial commit of raft comms

---
 cpp/include/raft/comms.hpp                |   0
 cpp/include/raft/comms/comms.hpp          |  87 ++++
 cpp/include/raft/comms/std/std_comms.hpp  | 583 ++++++++++++++++++++++
 cpp/include/raft/comms/std/ucp_helper.hpp | 240 +++++++++
 4 files changed, 910 insertions(+)
 create mode 100644 cpp/include/raft/comms.hpp
 create mode 100644 cpp/include/raft/comms/comms.hpp
 create mode 100644 cpp/include/raft/comms/std/std_comms.hpp
 create mode 100644 cpp/include/raft/comms/std/ucp_helper.hpp

diff --git a/cpp/include/raft/comms.hpp b/cpp/include/raft/comms.hpp
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp
new file mode 100644
index 0000000000..0ddba4e7b8
--- /dev/null
+++ b/cpp/include/raft/comms/comms.hpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <memory>
+
+namespace raft {
+
+class comms {
+ public:
+  typedef unsigned int request_t;
+  enum datatype_t { CHAR, UINT8, INT, UINT, INT64, UINT64, FLOAT, DOUBLE };
+  enum op_t { SUM, PROD, MIN, MAX };
+
+  /**
+   * The resulting status of distributed stream synchronization
+   */
+  enum status_t {
+    commStatusSuccess,  // Synchronization successful
+    commStatusError,    // An error occurred querying sync status
+    commStatusAbort     // A failure occurred in sync, queued operations aborted
+  };
+
+  virtual size_t getDatatypeSize(const comms::datatype_t datatype);
+
+  template <typename T>
+  virtual datatype_t getDataType() const;
+
+  virtual ~comms();
+
+  virtual int getSize() const = 0;
+  virtual int getRank() const = 0;
+
+  virtual std::unique_ptr<comms> commSplit(int color, int key) const = 0;
+
+  virtual void barrier() const = 0;
+
+  virtual status_t syncStream(cudaStream_t stream) const = 0;
+
+  virtual void isend(const void* buf, int size, int dest, int tag,
+                     request_t* request) const = 0;
+
+  virtual void irecv(void* buf, int size, int source, int tag,
+                     request_t* request) const = 0;
+
+  virtual void waitall(int count, request_t array_of_requests[]) const = 0;
+
+  virtual void allreduce(const void* sendbuff, void* recvbuff, int count,
+                         datatype_t datatype, op_t op,
+                         cudaStream_t stream) const = 0;
+
+  virtual void bcast(void* buff, int count, datatype_t datatype, int root,
+                     cudaStream_t stream) const = 0;
+
+  virtual void reduce(const void* sendbuff, void* recvbuff, int count,
+                      datatype_t datatype, op_t op, int root,
+                      cudaStream_t stream) const = 0;
+
+  virtual void allgather(const void* sendbuff, void* recvbuff, int sendcount,
+                         datatype_t datatype, cudaStream_t stream) const = 0;
+
+  virtual void allgatherv(const void* sendbuf, void* recvbuf,
+                          const int recvcounts[], const int displs[],
+                          datatype_t datatype, cudaStream_t stream) const = 0;
+
+  virtual void reducescatter(const void* sendbuff, void* recvbuff,
+                             int recvcount,
datatype_t datatype, op_t op, + cudaStream_t stream) const = 0; +}; + +} // namespace raft diff --git a/cpp/include/raft/comms/std/std_comms.hpp b/cpp/include/raft/comms/std/std_comms.hpp new file mode 100644 index 0000000000..3ef08a8ce4 --- /dev/null +++ b/cpp/include/raft/comms/std/std_comms.hpp @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include + +#include +#include +#include "ucp_helper.hpp" + +#include + +constexpr bool UCX_ENABLED = true; + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + + +#define NCCL_CHECK(call) \ + do { \ + ncclResult_t status = call; \ + ASSERT(ncclSuccess == status, "ERROR: NCCL call='%s'. Reason:%s\n", #call, \ + ncclGetErrorString(status)); \ + } while (0) + +#define NCCL_CHECK_NO_THROW(call) \ + do { \ + ncclResult_t status = call; \ + if (status != ncclSuccess) { \ + CUML_LOG_ERROR("NCCL call='%s' failed. Reason:%s\n", #call, \ + ncclGetErrorString(status)); \ + } \ + } while (0) + + + +namespace raft { + +namespace { + +size_t getDatatypeSize(const std_comms::datatype_t datatype) { + switch (datatype) { + case MLCommon::cumlCommunicator::CHAR: + return sizeof(char); + case MLCommon::cumlCommunicator::UINT8: + return sizeof(uint8_t); + case MLCommon::cumlCommunicator::INT: + return sizeof(int); + case MLCommon::cumlCommunicator::UINT: + return sizeof(unsigned int); + case MLCommon::cumlCommunicator::INT64: + return sizeof(int64_t); + case MLCommon::cumlCommunicator::UINT64: + return sizeof(uint64_t); + case MLCommon::cumlCommunicator::FLOAT: + return sizeof(float); + case MLCommon::cumlCommunicator::DOUBLE: + return sizeof(double); + } +} + +ncclDataType_t getNCCLDatatype( + const std_comms::datatype_t datatype) { + switch (datatype) { + case MLCommon::cumlCommunicator::CHAR: + return ncclChar; + case MLCommon::cumlCommunicator::UINT8: + return ncclUint8; + case MLCommon::cumlCommunicator::INT: + return ncclInt; + case MLCommon::cumlCommunicator::UINT: + return ncclUint32; + case MLCommon::cumlCommunicator::INT64: + return ncclInt64; + case MLCommon::cumlCommunicator::UINT64: + return ncclUint64; + case MLCommon::cumlCommunicator::FLOAT: + return ncclFloat; + case MLCommon::cumlCommunicator::DOUBLE: + return ncclDouble; + } +} + +ncclRedOp_t getNCCLOp(const std_comms::op_t op) { + switch (op) { + case MLCommon::cumlCommunicator::SUM: + return ncclSum; + case MLCommon::cumlCommunicator::PROD: + return ncclProd; + case MLCommon::cumlCommunicator::MIN: + return ncclMin; + case MLCommon::cumlCommunicator::MAX: + return ncclMax; + } +} +} // namespace + +bool ucx_enabled() { return UCX_ENABLED; } + +/** + * @brief Underlying comms, like NCCL and UCX, should be initialized and ready for use, + * and maintained, outside of the cuML Comms lifecycle. 
This allows us to decouple the + * ownership of the actual comms from cuml so that they can also be used outside of cuml. + * + * For instance, nccl-py can be used to bootstrap a ncclComm_t before it is + * used to construct a cuml comms instance. UCX endpoints can be bootstrapped + * in Python using ucx-py, before being used to construct a cuML comms instance. + */ +void inject_comms(cumlHandle &handle, ncclComm_t comm, ucp_worker_h ucp_worker, + std::shared_ptr eps, int size, int rank) { + auto communicator = std::make_shared( + std::unique_ptr( + new std_comms(comm, ucp_worker, eps, size, rank))); + handle.getImpl().setCommunicator(communicator); +} + +void inject_comms(cumlHandle &handle, ncclComm_t comm, int size, int rank) { + auto communicator = std::make_shared( + std::unique_ptr( + new std_comms(comm, size, rank))); + handle.getImpl().setCommunicator(communicator); +} + +void inject_comms_py_coll(cumlHandle *handle, ncclComm_t comm, int size, + int rank) { + inject_comms(*handle, comm, size, rank); +} + +void inject_comms_py(ML::cumlHandle *handle, ncclComm_t comm, void *ucp_worker, + void *eps, int size, int rank) { + std::shared_ptr eps_sp = + std::make_shared(new ucp_ep_h[size]); + + size_t *size_t_ep_arr = (size_t *)eps; + + for (int i = 0; i < size; i++) { + size_t ptr = size_t_ep_arr[i]; + ucp_ep_h *ucp_ep_v = (ucp_ep_h *)*eps_sp; + + if (ptr != 0) { + ucp_ep_h eps_ptr = (ucp_ep_h)size_t_ep_arr[i]; + ucp_ep_v[i] = eps_ptr; + } else { + ucp_ep_v[i] = nullptr; + } + } + + inject_comms(*handle, comm, (ucp_worker_h)ucp_worker, eps_sp, size, rank); +} + + +/** + * @brief A cumlCommunicator implementation capable of running collective communications + * with NCCL and point-to-point-communications with UCX. Note that the latter is optional. + * + * Underlying comms, like NCCL and UCX, should be initialized and ready for use, + * and maintained, outside of the cuML Comms lifecycle. This allows us to decouple the + * ownership of the actual comms from cuml so that they can also be used outside of cuml. + * + * For instance, nccl-py can be used to bootstrap a ncclComm_t before it is + * used to construct a cuml comms instance. UCX endpoints can be bootstrapped + * in Python using ucx-py, before being used to construct a cuML comms instance. + */ +class std_comms : public raft::comms { + public: + std_comms() = delete; + + /** + * @brief Constructor for collective + point-to-point operation. 
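+   *
+   * A minimal construction sketch (hypothetical variable names; assumes the
+   * caller has already bootstrapped the NCCL comm, the UCP worker, and the
+   * endpoint array, e.g. through nccl-py and ucx-py):
+   *
+   *   std_comms comms(nccl_comm, ucp_worker, eps, n_ranks, my_rank);
+   *   comms.barrier();  // collectives run over NCCL; p2p uses the UCX eps
+   *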
+   * @param comm initialized nccl comm
+   * @param ucp_worker initialized ucp_worker instance
+   * @param eps shared pointer to array of ucp endpoints
+   * @param size size of the cluster
+   * @param rank rank of the current worker
+   */
+  std_comms(ncclComm_t comm, ucp_worker_h ucp_worker,
+            std::shared_ptr<ucp_ep_h*> eps, int size, int rank)
+    : _nccl_comm(comm),
+      _ucp_worker(ucp_worker),
+      _ucp_eps(eps),
+      _size(size),
+      _rank(rank),
+      _next_request_id(0) {
+    initialize();
+    p2p_enabled = true;
+  };
+
+  /**
+   * @brief constructor for collective-only operation
+   * @param comm initialized nccl communicator
+   * @param size size of the cluster
+   * @param rank rank of the current worker
+   */
+  std_comms(ncclComm_t comm, int size, int rank)
+    : _nccl_comm(comm), _size(size), _rank(rank) {
+    initialize();
+  };
+
+  virtual ~std_comms() {
+    CUDA_CHECK_NO_THROW(cudaStreamDestroy(_stream));
+
+    CUDA_CHECK_NO_THROW(cudaFree(_sendbuff));
+    CUDA_CHECK_NO_THROW(cudaFree(_recvbuff));
+  }
+
+  size_t getDatatypeSize(const std_comms::datatype_t datatype) {
+    switch (datatype) {
+      case MLCommon::cumlCommunicator::CHAR:
+        return sizeof(char);
+      case MLCommon::cumlCommunicator::UINT8:
+        return sizeof(uint8_t);
+      case MLCommon::cumlCommunicator::INT:
+        return sizeof(int);
+      case MLCommon::cumlCommunicator::UINT:
+        return sizeof(unsigned int);
+      case MLCommon::cumlCommunicator::INT64:
+        return sizeof(int64_t);
+      case MLCommon::cumlCommunicator::UINT64:
+        return sizeof(uint64_t);
+      case MLCommon::cumlCommunicator::FLOAT:
+        return sizeof(float);
+      case MLCommon::cumlCommunicator::DOUBLE:
+        return sizeof(double);
+    }
+  }
+
+  template <>
+  cumlCommunicator::datatype_t getDataType<char>() const {
+    return cumlCommunicator::CHAR;
+  }
+
+  template <>
+  cumlCommunicator::datatype_t getDataType<uint8_t>() const {
+    return cumlCommunicator::UINT8;
+  }
+
+  template <>
+  cumlCommunicator::datatype_t getDataType<int>() const {
+    return cumlCommunicator::INT;
+  }
+
+  template <>
+  cumlCommunicator::datatype_t getDataType<unsigned int>() const {
+    return cumlCommunicator::UINT;
+  }
+
+  template <>
+  cumlCommunicator::datatype_t getDataType<int64_t>() const {
+    return cumlCommunicator::INT64;
+  }
+
+  template <>
+  cumlCommunicator::datatype_t getDataType<uint64_t>() const {
+    return cumlCommunicator::UINT64;
+  }
+
+  template <>
+  cumlCommunicator::datatype_t getDataType<float>() const {
+    return cumlCommunicator::FLOAT;
+  }
+
+  template <>
+  cumlCommunicator::datatype_t getDataType<double>() const {
+    return cumlCommunicator::DOUBLE;
+  }
+
+  void initialize() {
+    CUDA_CHECK(cudaStreamCreate(&_stream));
+
+    CUDA_CHECK(cudaMalloc(&_sendbuff, sizeof(int)));
+    CUDA_CHECK(cudaMalloc(&_recvbuff, sizeof(int)));
+  }
+
+  int getSize() const { return _size; }
+
+  int getRank() const { return _rank; }
+
+  std::unique_ptr<comms>
+  commSplit(int color, int key) const {
+    // Not supported by NCCL
+    ASSERT(false,
+           "ERROR: commSplit called but not yet supported in this comms "
+           "implementation.");
+  }
+
+  void barrier() const {
+    CUDA_CHECK(cudaMemsetAsync(_sendbuff, 1, sizeof(int), _stream));
+    CUDA_CHECK(cudaMemsetAsync(_recvbuff, 1, sizeof(int), _stream));
+
+    allreduce(_sendbuff, _recvbuff, 1, MLCommon::cumlCommunicator::INT,
+              MLCommon::cumlCommunicator::SUM, _stream);
+
+    ASSERT(syncStream(_stream) == status_t::commStatusSuccess,
+           "ERROR: syncStream failed.
This can be caused by a failed rank."); + } + + void get_request_id(request_t *req) const { + + request_t req_id; + + if (this->_free_requests.empty()) + req_id = this->_next_request_id++; + else { + auto it = this->_free_requests.begin(); + req_id = *it; + this->_free_requests.erase(it); + } + *req = req_id; + } + + void isend(const void *buf, int size, int dest, + int tag, request_t *request) const { + ASSERT(UCX_ENABLED, "cuML Comms not built with UCX support"); + ASSERT(p2p_enabled, + "cuML Comms instance was not initialized for point-to-point"); + + ASSERT(_ucp_worker != nullptr, + "ERROR: UCX comms not initialized on communicator."); + + get_request_id(request); + ucp_ep_h ep_ptr = (*_ucp_eps)[dest]; + + ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); + + this->_ucp_handler.ucp_isend(ucp_req, ep_ptr, buf, size, tag, + default_tag_mask, getRank()); + + CUML_LOG_DEBUG( + "%d: Created send request [id=%llu], ptr=%llu, to=%llu, ep=%llu", getRank(), + (unsigned long long)*request, (unsigned long long)ucp_req->req, + (unsigned long long)dest, (unsigned long long)ep_ptr); + + _requests_in_flight.insert(std::make_pair(*request, ucp_req)); + } + + void irecv(void *buf, int size, int source, int tag, + request_t *request) const { + ASSERT(UCX_ENABLED, "cuML Comms not built with UCX support"); + ASSERT(p2p_enabled, + "cuML Comms instance was not initialized for point-to-point"); + + ASSERT(_ucp_worker != nullptr, + "ERROR: UCX comms not initialized on communicator."); + + get_request_id(request); + + ucp_ep_h ep_ptr = (*_ucp_eps)[source]; + + ucp_tag_t tag_mask = default_tag_mask; + + if (source == CUML_ANY_SOURCE) { + tag_mask = any_rank_tag_mask; + } + + ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); + _ucp_handler.ucp_irecv(ucp_req, _ucp_worker, ep_ptr, buf, size, tag, tag_mask, + source); + + CUML_LOG_DEBUG( + "%d: Created receive request [id=%llu], ptr=%llu, from=%llu, ep=%llu", + getRank(), (unsigned long long)*request, (unsigned long long)ucp_req->req, + (unsigned long long)source, (unsigned long long)ep_ptr); + + _requests_in_flight.insert(std::make_pair(*request, ucp_req)); + } + + void waitall(int count, + request_t array_of_requests[]) const { + ASSERT(UCX_ENABLED, "cuML Comms not built with UCX support"); + ASSERT(p2p_enabled, + "cuML Comms instance was not initialized for point-to-point"); + + ASSERT(_ucp_worker != nullptr, + "ERROR: UCX comms not initialized on communicator."); + + std::vector requests; + requests.reserve(count); + + time_t start = time(NULL); + + for (int i = 0; i < count; ++i) { + auto req_it = _requests_in_flight.find(array_of_requests[i]); + ASSERT(_requests_in_flight.end() != req_it, + "ERROR: waitall on invalid request: %d", array_of_requests[i]); + requests.push_back(req_it->second); + _free_requests.insert(req_it->first); + _requests_in_flight.erase(req_it); + } + + while (requests.size() > 0) { + time_t now = time(NULL); + + // Timeout if we have not gotten progress or completed any requests + // in 10 or more seconds. 
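+      // The `start` timestamp is reset further down whenever ucp_progress
+      // reports activity or a request completes, so this only trips after
+      // ten seconds of genuine inactivity.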
+ ASSERT(now - start < 10, "Timed out waiting for requests."); + + for (std::vector::iterator it = requests.begin(); + it != requests.end();) { + bool restart = false; // resets the timeout when any progress was made + + // Causes UCP to progress through the send/recv message queue + while (_ucp_handler.ucp_progress(_ucp_worker) != 0) { + restart = true; + } + + auto req = *it; + + // If the message needs release, we know it will be sent/received + // asynchronously, so we will need to track and verify its state + if (req->needs_release) { + ASSERT(UCS_PTR_IS_PTR(req->req), + "UCX Request Error. Request is not valid UCX pointer"); + ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", + UCS_PTR_STATUS(req->req)); + ASSERT(req->req->completed == 1 || req->req->completed == 0, + "request->completed not a valid value: %d\n", + req->req->completed); + } + + // If a message was sent synchronously (eg. completed before + // `isend`/`irecv` completed) or an asynchronous message + // is complete, we can go ahead and clean it up. + if (!req->needs_release || req->req->completed == 1) { + restart = true; + CUML_LOG_DEBUG( + "%d: request completed. [ptr=%llu, num_left=%lu," + " other_rank=%d, is_send=%d, completed_immediately=%d]", + getRank(), (unsigned long long)req->req, requests.size() - 1, + req->other_rank, req->is_send_request, !req->needs_release); + + // perform cleanup + _ucp_handler.free_ucp_request(req); + + // remove from pending requests + it = requests.erase(it); + } else { + ++it; + } + // if any progress was made, reset the timeout start time + if (restart) { + start = time(NULL); + } + } + } + } + + void allreduce(const void *sendbuff, void *recvbuff, + int count, datatype_t datatype, + op_t op, cudaStream_t stream) const { + NCCL_CHECK(ncclAllReduce(sendbuff, recvbuff, count, getNCCLDatatype(datatype), + getNCCLOp(op), _nccl_comm, stream)); + } + + void bcast(void *buff, int count, datatype_t datatype, + int root, cudaStream_t stream) const { + NCCL_CHECK(ncclBroadcast(buff, buff, count, getNCCLDatatype(datatype), root, + _nccl_comm, stream)); + } + + void reduce(const void *sendbuff, void *recvbuff, + int count, datatype_t datatype, op_t op, + int root, cudaStream_t stream) const { + NCCL_CHECK(ncclReduce(sendbuff, recvbuff, count, getNCCLDatatype(datatype), + getNCCLOp(op), root, _nccl_comm, stream)); + } + + void allgather(const void *sendbuff, void *recvbuff, + int sendcount, datatype_t datatype, + cudaStream_t stream) const { + NCCL_CHECK(ncclAllGather(sendbuff, recvbuff, sendcount, + getNCCLDatatype(datatype), _nccl_comm, stream)); + } + + void allgatherv(const void *sendbuf, void *recvbuf, + const int recvcounts[], + const int displs[], + datatype_t datatype, + cudaStream_t stream) const { + //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf + //Listing 1 on page 4. 
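+    // Each rank takes one turn as the broadcast root; receivers place the
+    // root's recvcounts[root] elements at byte offset
+    // displs[root] * getDatatypeSize(datatype) in recvbuf.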
+    for (int root = 0; root < _size; ++root)
+      NCCL_CHECK(ncclBroadcast(
+        sendbuf,
+        static_cast<char*>(recvbuf) + displs[root] * getDatatypeSize(datatype),
+        recvcounts[root], getNCCLDatatype(datatype), root, _nccl_comm, stream));
+  }
+
+  void reducescatter(const void *sendbuff, void *recvbuff, int recvcount,
+                     datatype_t datatype, op_t op,
+                     cudaStream_t stream) const {
+    NCCL_CHECK(ncclReduceScatter(sendbuff, recvbuff, recvcount,
+                                 getNCCLDatatype(datatype), getNCCLOp(op),
+                                 _nccl_comm, stream));
+  }
+
+  status_t syncStream(cudaStream_t stream) const {
+    cudaError_t cudaErr;
+    ncclResult_t ncclErr, ncclAsyncErr;
+    while (1) {
+      cudaErr = cudaStreamQuery(stream);
+      if (cudaErr == cudaSuccess) return status_t::commStatusSuccess;
+
+      if (cudaErr != cudaErrorNotReady) {
+        // An error occurred querying the status of the stream
+        return status_t::commStatusError;
+      }
+
+      ncclErr = ncclCommGetAsyncError(_nccl_comm, &ncclAsyncErr);
+      if (ncclErr != ncclSuccess) {
+        // An error occurred retrieving the asynchronous error
+        return status_t::commStatusError;
+      }
+
+      if (ncclAsyncErr != ncclSuccess) {
+        // An asynchronous error happened. Stop the operation and destroy
+        // the communicator
+        ncclErr = ncclCommAbort(_nccl_comm);
+        if (ncclErr != ncclSuccess)
+          // Caller may abort with an exception or try to re-create a new communicator.
+          return status_t::commStatusAbort;
+      }
+
+      // Let other threads (including NCCL threads) use the CPU.
+      pthread_yield();
+    }
+  }
+
+ private:
+  ncclComm_t _nccl_comm;
+  cudaStream_t _stream;
+
+  int *_sendbuff, *_recvbuff;
+
+  int _size;
+  int _rank;
+
+  bool p2p_enabled = false;
+  comms_ucp_handler _ucp_handler;
+  ucp_worker_h _ucp_worker;
+  std::shared_ptr<ucp_ep_h*> _ucp_eps;
+  mutable request_t _next_request_id;
+  mutable std::unordered_map<request_t, ucp_request*>
+    _requests_in_flight;
+  mutable std::unordered_set<request_t> _free_requests;
+};
+
+}  // end namespace raft
diff --git a/cpp/include/raft/comms/std/ucp_helper.hpp b/cpp/include/raft/comms/std/ucp_helper.hpp
new file mode 100644
index 0000000000..fbb8b3e110
--- /dev/null
+++ b/cpp/include/raft/comms/std/ucp_helper.hpp
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#pragma once
+
+typedef void (*dlsym_print_info)(ucp_ep_h, FILE *);
+typedef void (*dlsym_rec_free)(void *);
+typedef int (*dlsym_worker_progress)(ucp_worker_h);
+
+typedef ucs_status_ptr_t (*dlsym_send)(ucp_ep_h, const void *, size_t,
+                                       ucp_datatype_t, ucp_tag_t,
+                                       ucp_send_callback_t);
+typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, void *, size_t count,
+                                       ucp_datatype_t datatype, ucp_tag_t,
+                                       ucp_tag_t, ucp_tag_recv_callback_t);
+
+/**
+ * Standard UCX request object that will be passed
+ * around asynchronously. This object is really
+ * opaque and the comms layer only cares that it
+ * has been completed.
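+ * (The `completed` field is flipped to 1 by the
+ * send/recv callbacks defined below once UCX
+ * finishes the operation.)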
Because cuml comms do not + * initialize the ucx application context, it doesn't + * own this object and thus it's important not to + * modify this struct. + */ +struct ucx_context { + int completed; +}; + +/** + * Wraps the `ucx_context` request and adds a few + * other fields for trace logging and cleanup. + */ +class ucp_request { + public: + struct ucx_context *req; + bool needs_release = true; + int other_rank = -1; + bool is_send_request = false; +}; + +// by default, match the whole tag +static const ucp_tag_t default_tag_mask = -1; + +// Only match the passed in tag, not the rank. This +// enables simulated multi-cast. +static const ucp_tag_t any_rank_tag_mask = 0xFFFF0000; + +// Per the MPI API, receiving from a rank of -1 denotes receiving +// from any rank that used the expected tag. +static const int UCP_ANY_RANK = -1; + +/** + * @brief Asynchronous send callback sets request to completed + */ +static void send_callback(void *request, ucs_status_t status) { + struct ucx_context *context = (struct ucx_context *)request; + context->completed = 1; +} + +/** + * @brief Asynchronous recv callback sets request to completed + */ +static void recv_callback(void *request, ucs_status_t status, + ucp_tag_recv_info_t *info) { + struct ucx_context *context = (struct ucx_context *)request; + context->completed = 1; +} + +/** + * Helper class for managing `dlopen` state and + * interacting with ucp. + */ +class comms_ucp_handler { + public: + comms_ucp_handler() { + load_ucp_handle(); + load_send_func(); + load_recv_func(); + load_free_req_func(); + load_print_info_func(); + load_worker_progress_func(); + } + + ~comms_ucp_handler() { dlclose(ucp_handle); } + + private: + void *ucp_handle; + + dlsym_print_info print_info_func; + dlsym_rec_free req_free_func; + dlsym_worker_progress worker_progress_func; + dlsym_send send_func; + dlsym_recv recv_func; + + void load_ucp_handle() { + ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NOLOAD | RTLD_NODELETE); + if (!ucp_handle) { + ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NODELETE); + ASSERT(ucp_handle, "Cannot open UCX library: %s\n", dlerror()); + } + // Reset any potential error + dlerror(); + } + + void assert_dlerror() { + char *error = dlerror(); + ASSERT(error == NULL, "Error loading function symbol: %s\n", error); + } + + void load_send_func() { + send_func = (dlsym_send)dlsym(ucp_handle, "ucp_tag_send_nb"); + assert_dlerror(); + } + + void load_free_req_func() { + req_free_func = (dlsym_rec_free)dlsym(ucp_handle, "ucp_request_free"); + assert_dlerror(); + } + + void load_print_info_func() { + print_info_func = (dlsym_print_info)dlsym(ucp_handle, "ucp_ep_print_info"); + assert_dlerror(); + } + + void load_worker_progress_func() { + worker_progress_func = + (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress"); + assert_dlerror(); + } + + void load_recv_func() { + recv_func = (dlsym_recv)dlsym(ucp_handle, "ucp_tag_recv_nb"); + assert_dlerror(); + } + + ucp_tag_t build_message_tag(int rank, int tag) const { + // keeping the rank in the lower bits enables debugging. 
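+    // (The tag lands in the high bits and the rank in the low bits, so a
+    // mask over the high bits matches on the tag alone.)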
+ return ((uint32_t)tag << 31) | (uint32_t)rank; + } + + public: + int ucp_progress(ucp_worker_h worker) const { + return (*(worker_progress_func))(worker); + } + + /** + * @brief Frees any memory underlying the given ucp request object + */ + void free_ucp_request(ucp_request *request) const { + if (request->needs_release) { + request->req->completed = 0; + (*(req_free_func))(request->req); + } + free(request); + } + + /** + * @brief Asynchronously send data to the given endpoint using the given tag + */ + void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf, int size, + int tag, ucp_tag_t tag_mask, int rank) const { + ucp_tag_t ucp_tag = build_message_tag(rank, tag); + + CUML_LOG_DEBUG("Sending tag: %ld", ucp_tag); + + ucs_status_ptr_t send_result = (*(send_func))( + ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); + struct ucx_context *ucp_req = (struct ucx_context *)send_result; + if (UCS_PTR_IS_ERR(send_result)) { + ASSERT(!UCS_PTR_IS_ERR(send_result), + "unable to send UCX data message (%d)\n", + UCS_PTR_STATUS(send_result)); + /** + * If the request didn't fail, but it's not OK, it is in flight. + * Expect the handler to be invoked + */ + } else if (UCS_PTR_STATUS(send_result) != UCS_OK) { + /** + * If the request is OK, it's already been completed and we don't need to wait on it. + * The request will be a nullptr, however, so we need to create a new request + * and set it to completed to make the "waitall()" function work properly. + */ + req->needs_release = true; + } else { + req->needs_release = false; + } + + req->other_rank = rank; + req->is_send_request = true; + req->req = ucp_req; + } + + /** + * @brief Asynchronously receive data from given endpoint with the given tag. + */ + void ucp_irecv(ucp_request *req, ucp_worker_h worker, ucp_ep_h ep_ptr, + void *buf, int size, int tag, ucp_tag_t tag_mask, + int sender_rank) const { + ucp_tag_t ucp_tag = build_message_tag(sender_rank, tag); + + CUML_LOG_DEBUG("%d: Receiving tag: %ld", ucp_tag); + + ucs_status_ptr_t recv_result = + (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, + tag_mask, recv_callback); + + struct ucx_context *ucp_req = (struct ucx_context *)recv_result; + + req->req = ucp_req; + req->needs_release = true; + req->is_send_request = false; + req->other_rank = sender_rank; + + ASSERT(!UCS_PTR_IS_ERR(recv_result), + "unable to receive UCX data message (%d)\n", + UCS_PTR_STATUS(recv_result)); + } +}; From 8a6e05464db3f69d98b25b6f4ed1e866a30f7cbb Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 6 May 2020 20:34:35 -0400 Subject: [PATCH 002/189] Removing cuml verbiage --- cpp/include/raft/comms/std/std_comms.hpp | 112 +++++++++++------------ 1 file changed, 54 insertions(+), 58 deletions(-) diff --git a/cpp/include/raft/comms/std/std_comms.hpp b/cpp/include/raft/comms/std/std_comms.hpp index 3ef08a8ce4..d084274f18 100644 --- a/cpp/include/raft/comms/std/std_comms.hpp +++ b/cpp/include/raft/comms/std/std_comms.hpp @@ -70,58 +70,58 @@ namespace raft { namespace { -size_t getDatatypeSize(const std_comms::datatype_t datatype) { +size_t getDatatypeSize(const comms::datatype_t datatype) { switch (datatype) { - case MLCommon::cumlCommunicator::CHAR: + case comms::CHAR: return sizeof(char); - case MLCommon::cumlCommunicator::UINT8: + case comms::UINT8: return sizeof(uint8_t); - case MLCommon::cumlCommunicator::INT: + case comms::INT: return sizeof(int); - case MLCommon::cumlCommunicator::UINT: + case comms::UINT: return sizeof(unsigned int); - case MLCommon::cumlCommunicator::INT64: + case comms::INT64: return sizeof(int64_t); - case MLCommon::cumlCommunicator::UINT64: + case comms::UINT64: return sizeof(uint64_t); - case MLCommon::cumlCommunicator::FLOAT: + case comms::FLOAT: return sizeof(float); - case MLCommon::cumlCommunicator::DOUBLE: + case comms::DOUBLE: return sizeof(double); } } ncclDataType_t getNCCLDatatype( - const std_comms::datatype_t datatype) { + const comms::datatype_t datatype) { switch (datatype) { - case MLCommon::cumlCommunicator::CHAR: + case comms::CHAR: return ncclChar; - case MLCommon::cumlCommunicator::UINT8: + case comms::UINT8: return ncclUint8; - case MLCommon::cumlCommunicator::INT: + case comms::INT: return ncclInt; - case MLCommon::cumlCommunicator::UINT: + case comms::UINT: return ncclUint32; - case MLCommon::cumlCommunicator::INT64: + case comms::INT64: return ncclInt64; - case MLCommon::cumlCommunicator::UINT64: + case comms::UINT64: return ncclUint64; - case MLCommon::cumlCommunicator::FLOAT: + case comms::FLOAT: return ncclFloat; - case MLCommon::cumlCommunicator::DOUBLE: + case comms::DOUBLE: return ncclDouble; } } -ncclRedOp_t getNCCLOp(const std_comms::op_t op) { +ncclRedOp_t getNCCLOp(const comms::op_t op) { switch (op) { - case MLCommon::cumlCommunicator::SUM: + case comms::SUM: return ncclSum; - case MLCommon::cumlCommunicator::PROD: + case comms::PROD: return ncclProd; - case MLCommon::cumlCommunicator::MIN: + case comms::MIN: return ncclMin; - case MLCommon::cumlCommunicator::MAX: + case comms::MAX: return ncclMax; } } @@ -140,14 +140,14 @@ bool ucx_enabled() { return UCX_ENABLED; } */ void inject_comms(cumlHandle &handle, ncclComm_t comm, ucp_worker_h ucp_worker, std::shared_ptr eps, int size, int rank) { - auto communicator = std::make_shared( + auto communicator = std::make_shared( std::unique_ptr( new std_comms(comm, ucp_worker, eps, size, rank))); handle.getImpl().setCommunicator(communicator); } void inject_comms(cumlHandle &handle, ncclComm_t comm, int size, int rank) { - auto communicator = std::make_shared( + auto communicator = std::make_shared( std::unique_ptr( new std_comms(comm, size, rank))); handle.getImpl().setCommunicator(communicator); @@ -182,7 +182,7 @@ void inject_comms_py(ML::cumlHandle *handle, ncclComm_t comm, void *ucp_worker, /** - * @brief A cumlCommunicator implementation capable of running collective communications + * @brief A comms implementation capable of running collective communications * with NCCL and point-to-point-communications with UCX. Note that the latter is optional. 
* * Underlying comms, like NCCL and UCX, should be initialized and ready for use, @@ -235,66 +235,66 @@ class std_comms : public raft::comms { CUDA_CHECK_NO_THROW(cudaFree(_recvbuff)); } - size_t getDatatypeSize(const std_comms::datatype_t datatype) { + size_t getDatatypeSize(const comms::datatype_t datatype) { switch (datatype) { - case MLCommon::cumlCommunicator::CHAR: + case comms::CHAR: return sizeof(char); - case MLCommon::cumlCommunicator::UINT8: + case comms::UINT8: return sizeof(uint8_t); - case MLCommon::cumlCommunicator::INT: + case comms::INT: return sizeof(int); - case MLCommon::cumlCommunicator::UINT: + case comms::UINT: return sizeof(unsigned int); - case MLCommon::cumlCommunicator::INT64: + case comms::INT64: return sizeof(int64_t); - case MLCommon::cumlCommunicator::UINT64: + case comms::UINT64: return sizeof(uint64_t); - case MLCommon::cumlCommunicator::FLOAT: + case comms::FLOAT: return sizeof(float); - case MLCommon::cumlCommunicator::DOUBLE: + case comms::DOUBLE: return sizeof(double); } } template <> - cumlCommunicator::datatype_t getDataType() const { - return cumlCommunicator::CHAR; + comms::datatype_t getDataType() const { + return comms::CHAR; } template <> - cumlCommunicator::datatype_t getDataType() const { - return cumlCommunicator::UINT8; + comms::datatype_t getDataType() const { + return comms::UINT8; } template <> - cumlCommunicator::datatype_t getDataType() const { - return cumlCommunicator::INT; + comms::datatype_t getDataType() const { + return comms::INT; } template <> - cumlCommunicator::datatype_t getDataType() const { - return cumlCommunicator::UINT; + comms::datatype_t getDataType() const { + return comms::UINT; } template <> - cumlCommunicator::datatype_t getDataType() const { - return cumlCommunicator::INT64; + comms::datatype_t getDataType() const { + return comms::INT64; } template <> - cumlCommunicator::datatype_t getDataType() const { - return cumlCommunicator::UINT64; + comms::datatype_t getDataType() const { + return comms::UINT64; } template <> - cumlCommunicator::datatype_t getDataType() const { - return cumlCommunicator::FLOAT; + comms::datatype_t getDataType() const { + return comms::FLOAT; } template <> - cumlCommunicator::datatype_t getDataType() const { - return cumlCommunicator::DOUBLE; + comms::datatype_t getDataType() const { + return comms::DOUBLE; } void initialize() { @@ -309,7 +309,7 @@ class std_comms : public raft::comms { int getRank() const { return _rank; } - std::unique_ptr + std::unique_ptr commSplit(int color, int key) const { // Not supported by NCCL ASSERT(false, @@ -321,8 +321,8 @@ class std_comms : public raft::comms { CUDA_CHECK(cudaMemsetAsync(_sendbuff, 1, sizeof(int), _stream)); CUDA_CHECK(cudaMemsetAsync(_recvbuff, 1, sizeof(int), _stream)); - allreduce(_sendbuff, _recvbuff, 1, MLCommon::cumlCommunicator::INT, - MLCommon::cumlCommunicator::SUM, _stream); + allreduce(_sendbuff, _recvbuff, 1, comms::INT, + comms::SUM, _stream); ASSERT(syncStream(_stream) == status_t::commStatusSuccess, "ERROR: syncStream failed. This can be caused by a failed rank."); @@ -382,10 +382,6 @@ class std_comms : public raft::comms { ucp_tag_t tag_mask = default_tag_mask; - if (source == CUML_ANY_SOURCE) { - tag_mask = any_rank_tag_mask; - } - ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); _ucp_handler.ucp_irecv(ucp_req, _ucp_worker, ep_ptr, buf, size, tag, tag_mask, source); From 2bfc0dc304e06c4e532fa2b7ee582b3ddfea8537 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 7 May 2020 20:25:17 -0400 Subject: [PATCH 003/189] Renaming to comms_t, removing logs for now, adding to handle --- cpp/include/raft/comms.hpp | 0 cpp/include/raft/comms/comms.hpp | 8 +- cpp/include/raft/comms/std/std_comms.hpp | 162 +++++++++------------- cpp/include/raft/comms/std/ucp_helper.hpp | 21 +-- cpp/include/raft/handle.hpp | 24 ++-- 5 files changed, 88 insertions(+), 127 deletions(-) delete mode 100644 cpp/include/raft/comms.hpp diff --git a/cpp/include/raft/comms.hpp b/cpp/include/raft/comms.hpp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 0ddba4e7b8..b9d9c11d07 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -21,7 +21,7 @@ namespace raft { -class comms { +class comms_t { public: typedef unsigned int request_t; @@ -37,17 +37,17 @@ class comms { commStatusAbort }; // A failure occurred in sync, queued operations aborted - virtual size_t getDatatypeSize(const comms::datatype_t datatype); + virtual size_t getDatatypeSize(const comms_t::datatype_t datatype); template virtual datatype_t getDataType() const; - virtual ~comms(); + virtual ~comms_t(); virtual int getSize() const = 0; virtual int getRank() const = 0; - virtual std::unique_ptr commSplit(int color, int key) const = 0; + virtual std::unique_ptr commSplit(int color, int key) const = 0; virtual void barrier() const = 0; diff --git a/cpp/include/raft/comms/std/std_comms.hpp b/cpp/include/raft/comms/std/std_comms.hpp index d084274f18..13ad6773cb 100644 --- a/cpp/include/raft/comms/std/std_comms.hpp +++ b/cpp/include/raft/comms/std/std_comms.hpp @@ -59,7 +59,7 @@ constexpr bool UCX_ENABLED = true; do { \ ncclResult_t status = call; \ if (status != ncclSuccess) { \ - CUML_LOG_ERROR("NCCL call='%s' failed. Reason:%s\n", #call, \ + printf("NCCL call='%s' failed. 
Reason:%s\n", #call, \ ncclGetErrorString(status)); \ } \ } while (0) @@ -70,58 +70,58 @@ namespace raft { namespace { -size_t getDatatypeSize(const comms::datatype_t datatype) { +size_t getDatatypeSize(const comms_t::datatype_t datatype) { switch (datatype) { - case comms::CHAR: + case comms_t::CHAR: return sizeof(char); - case comms::UINT8: + case comms_t::UINT8: return sizeof(uint8_t); - case comms::INT: + case comms_t::INT: return sizeof(int); - case comms::UINT: + case comms_t::UINT: return sizeof(unsigned int); - case comms::INT64: + case comms_t::INT64: return sizeof(int64_t); - case comms::UINT64: + case comms_t::UINT64: return sizeof(uint64_t); - case comms::FLOAT: + case comms_t::FLOAT: return sizeof(float); - case comms::DOUBLE: + case comms_t::DOUBLE: return sizeof(double); } } ncclDataType_t getNCCLDatatype( - const comms::datatype_t datatype) { + const comms_t::datatype_t datatype) { switch (datatype) { - case comms::CHAR: + case comms_t::CHAR: return ncclChar; - case comms::UINT8: + case comms_t::UINT8: return ncclUint8; - case comms::INT: + case comms_t::INT: return ncclInt; - case comms::UINT: + case comms_t::UINT: return ncclUint32; - case comms::INT64: + case comms_t::INT64: return ncclInt64; - case comms::UINT64: + case comms_t::UINT64: return ncclUint64; - case comms::FLOAT: + case comms_t::FLOAT: return ncclFloat; - case comms::DOUBLE: + case comms_t::DOUBLE: return ncclDouble; } } -ncclRedOp_t getNCCLOp(const comms::op_t op) { +ncclRedOp_t getNCCLOp(const comms_t::op_t op) { switch (op) { - case comms::SUM: + case comms_t::SUM: return ncclSum; - case comms::PROD: + case comms_t::PROD: return ncclProd; - case comms::MIN: + case comms_t::MIN: return ncclMin; - case comms::MAX: + case comms_t::MAX: return ncclMax; } } @@ -129,26 +129,17 @@ ncclRedOp_t getNCCLOp(const comms::op_t op) { bool ucx_enabled() { return UCX_ENABLED; } -/** - * @brief Underlying comms, like NCCL and UCX, should be initialized and ready for use, - * and maintained, outside of the cuML Comms lifecycle. This allows us to decouple the - * ownership of the actual comms from cuml so that they can also be used outside of cuml. - * - * For instance, nccl-py can be used to bootstrap a ncclComm_t before it is - * used to construct a cuml comms instance. UCX endpoints can be bootstrapped - * in Python using ucx-py, before being used to construct a cuML comms instance. - */ void inject_comms(cumlHandle &handle, ncclComm_t comm, ucp_worker_h ucp_worker, std::shared_ptr eps, int size, int rank) { - auto communicator = std::make_shared( - std::unique_ptr( + auto communicator = std::make_shared( + std::unique_ptr( new std_comms(comm, ucp_worker, eps, size, rank))); handle.getImpl().setCommunicator(communicator); } void inject_comms(cumlHandle &handle, ncclComm_t comm, int size, int rank) { - auto communicator = std::make_shared( - std::unique_ptr( + auto communicator = std::make_shared( + std::unique_ptr( new std_comms(comm, size, rank))); handle.getImpl().setCommunicator(communicator); } @@ -181,19 +172,7 @@ void inject_comms_py(ML::cumlHandle *handle, ncclComm_t comm, void *ucp_worker, } -/** - * @brief A comms implementation capable of running collective communications - * with NCCL and point-to-point-communications with UCX. Note that the latter is optional. - * - * Underlying comms, like NCCL and UCX, should be initialized and ready for use, - * and maintained, outside of the cuML Comms lifecycle. 
This allows us to decouple the
- * ownership of the actual comms from cuml so that they can also be used outside of cuml.
- *
- * For instance, nccl-py can be used to bootstrap a ncclComm_t before it is
- * used to construct a cuml comms instance. UCX endpoints can be bootstrapped
- * in Python using ucx-py, before being used to construct a cuML comms instance.
- */
-class std_comms : public raft::comms {
+class std_comms : public raft::comms_t {
  public:
   std_comms() = delete;
 
@@ -235,66 +214,66 @@ class std_comms : public raft::comms {
     CUDA_CHECK_NO_THROW(cudaFree(_recvbuff));
   }
 
-  size_t getDatatypeSize(const comms::datatype_t datatype) {
+  size_t getDatatypeSize(const comms_t::datatype_t datatype) {
     switch (datatype) {
-      case comms::CHAR:
+      case comms_t::CHAR:
         return sizeof(char);
-      case comms::UINT8:
+      case comms_t::UINT8:
         return sizeof(uint8_t);
-      case comms::INT:
+      case comms_t::INT:
         return sizeof(int);
-      case comms::UINT:
+      case comms_t::UINT:
         return sizeof(unsigned int);
-      case comms::INT64:
+      case comms_t::INT64:
         return sizeof(int64_t);
-      case comms::UINT64:
+      case comms_t::UINT64:
         return sizeof(uint64_t);
-      case comms::FLOAT:
+      case comms_t::FLOAT:
         return sizeof(float);
-      case comms::DOUBLE:
+      case comms_t::DOUBLE:
         return sizeof(double);
     }
   }
 
   template <>
-  comms::datatype_t getDataType<char>() const {
-    return comms::CHAR;
+  comms_t::datatype_t getDataType<char>() const {
+    return comms_t::CHAR;
   }
 
   template <>
-  comms::datatype_t getDataType<uint8_t>() const {
-    return comms::UINT8;
+  comms_t::datatype_t getDataType<uint8_t>() const {
+    return comms_t::UINT8;
   }
 
   template <>
-  comms::datatype_t getDataType<int>() const {
-    return comms::INT;
+  comms_t::datatype_t getDataType<int>() const {
+    return comms_t::INT;
   }
 
   template <>
-  comms::datatype_t getDataType<unsigned int>() const {
-    return comms::UINT;
+  comms_t::datatype_t getDataType<unsigned int>() const {
+    return comms_t::UINT;
  }
 
   template <>
-  comms::datatype_t getDataType<int64_t>() const {
-    return comms::INT64;
+  comms_t::datatype_t getDataType<int64_t>() const {
+    return comms_t::INT64;
   }
 
   template <>
-  comms::datatype_t getDataType<uint64_t>() const {
-    return comms::UINT64;
+  comms_t::datatype_t getDataType<uint64_t>() const {
+    return comms_t::UINT64;
   }
 
   template <>
-  comms::datatype_t getDataType<float>() const {
-    return comms::FLOAT;
+  comms_t::datatype_t getDataType<float>() const {
+    return comms_t::FLOAT;
   }
 
   template <>
-  comms::datatype_t getDataType<double>() const {
-    return comms::DOUBLE;
+  comms_t::datatype_t getDataType<double>() const {
+    return comms_t::DOUBLE;
   }
 
   void initialize() {
@@ -309,7 +288,7 @@ class std_comms : public raft::comms {
 
   int getRank() const { return _rank; }
 
-  std::unique_ptr<comms>
+  std::unique_ptr<comms_t>
   commSplit(int color, int key) const {
     // Not supported by NCCL
     ASSERT(false,
@@ -321,8 +300,8 @@ class std_comms : public raft::comms {
     CUDA_CHECK(cudaMemsetAsync(_sendbuff, 1, sizeof(int), _stream));
     CUDA_CHECK(cudaMemsetAsync(_recvbuff, 1, sizeof(int), _stream));
 
-    allreduce(_sendbuff, _recvbuff, 1, comms::INT,
-              comms::SUM, _stream);
+    allreduce(_sendbuff, _recvbuff, 1, comms_t::INT,
+              comms_t::SUM, _stream);
 
     ASSERT(syncStream(_stream) == status_t::commStatusSuccess,
            "ERROR: syncStream failed.
This can be caused by a failed rank."); @@ -344,9 +323,9 @@ class std_comms : public raft::comms { void isend(const void *buf, int size, int dest, int tag, request_t *request) const { - ASSERT(UCX_ENABLED, "cuML Comms not built with UCX support"); + ASSERT(UCX_ENABLED, "Comms not built with UCX support"); ASSERT(p2p_enabled, - "cuML Comms instance was not initialized for point-to-point"); + "Comms instance was not initialized for point-to-point"); ASSERT(_ucp_worker != nullptr, "ERROR: UCX comms not initialized on communicator."); @@ -359,19 +338,14 @@ class std_comms : public raft::comms { this->_ucp_handler.ucp_isend(ucp_req, ep_ptr, buf, size, tag, default_tag_mask, getRank()); - CUML_LOG_DEBUG( - "%d: Created send request [id=%llu], ptr=%llu, to=%llu, ep=%llu", getRank(), - (unsigned long long)*request, (unsigned long long)ucp_req->req, - (unsigned long long)dest, (unsigned long long)ep_ptr); - _requests_in_flight.insert(std::make_pair(*request, ucp_req)); } void irecv(void *buf, int size, int source, int tag, request_t *request) const { - ASSERT(UCX_ENABLED, "cuML Comms not built with UCX support"); + ASSERT(UCX_ENABLED, "Comms not built with UCX support"); ASSERT(p2p_enabled, - "cuML Comms instance was not initialized for point-to-point"); + "Comms instance was not initialized for point-to-point"); ASSERT(_ucp_worker != nullptr, "ERROR: UCX comms not initialized on communicator."); @@ -386,19 +360,14 @@ class std_comms : public raft::comms { _ucp_handler.ucp_irecv(ucp_req, _ucp_worker, ep_ptr, buf, size, tag, tag_mask, source); - CUML_LOG_DEBUG( - "%d: Created receive request [id=%llu], ptr=%llu, from=%llu, ep=%llu", - getRank(), (unsigned long long)*request, (unsigned long long)ucp_req->req, - (unsigned long long)source, (unsigned long long)ep_ptr); - _requests_in_flight.insert(std::make_pair(*request, ucp_req)); } void waitall(int count, request_t array_of_requests[]) const { - ASSERT(UCX_ENABLED, "cuML Comms not built with UCX support"); + ASSERT(UCX_ENABLED, "Comms not built with UCX support"); ASSERT(p2p_enabled, - "cuML Comms instance was not initialized for point-to-point"); + "Comms instance was not initialized for point-to-point"); ASSERT(_ucp_worker != nullptr, "ERROR: UCX comms not initialized on communicator."); @@ -452,11 +421,6 @@ class std_comms : public raft::comms { // is complete, we can go ahead and clean it up. if (!req->needs_release || req->req->completed == 1) { restart = true; - CUML_LOG_DEBUG( - "%d: request completed. [ptr=%llu, num_left=%lu," - " other_rank=%d, is_send=%d, completed_immediately=%d]", - getRank(), (unsigned long long)req->req, requests.size() - 1, - req->other_rank, req->is_send_request, !req->needs_release); // perform cleanup _ucp_handler.free_ucp_request(req); diff --git a/cpp/include/raft/comms/std/ucp_helper.hpp b/cpp/include/raft/comms/std/ucp_helper.hpp index fbb8b3e110..584bcc8a53 100644 --- a/cpp/include/raft/comms/std/ucp_helper.hpp +++ b/cpp/include/raft/comms/std/ucp_helper.hpp @@ -14,14 +14,11 @@ * limitations under the License. */ -#include +#include #include #include #include #include -#include -#include - #pragma once typedef void (*dlsym_print_info)(ucp_ep_h, FILE *); @@ -63,14 +60,6 @@ class ucp_request { // by default, match the whole tag static const ucp_tag_t default_tag_mask = -1; -// Only match the passed in tag, not the rank. This -// enables simulated multi-cast. 
-static const ucp_tag_t any_rank_tag_mask = 0xFFFF0000; - -// Per the MPI API, receiving from a rank of -1 denotes receiving -// from any rank that used the expected tag. -static const int UCP_ANY_RANK = -1; - /** * @brief Asynchronous send callback sets request to completed */ @@ -182,8 +171,8 @@ class comms_ucp_handler { void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf, int size, int tag, ucp_tag_t tag_mask, int rank) const { ucp_tag_t ucp_tag = build_message_tag(rank, tag); - - CUML_LOG_DEBUG("Sending tag: %ld", ucp_tag); +// +// CUML_LOG_DEBUG("Sending tag: %ld", ucp_tag); ucs_status_ptr_t send_result = (*(send_func))( ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); @@ -219,8 +208,8 @@ class comms_ucp_handler { void *buf, int size, int tag, ucp_tag_t tag_mask, int sender_rank) const { ucp_tag_t ucp_tag = build_message_tag(sender_rank, tag); - - CUML_LOG_DEBUG("%d: Receiving tag: %ld", ucp_tag); +// +// CUML_LOG_DEBUG("%d: Receiving tag: %ld", ucp_tag); ucs_status_ptr_t recv_result = (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 7c8898fd96..ed4481364e 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -149,11 +149,21 @@ class handle_t { } } - ///@todo: enable this once we have cuml-comms migrated - // void setCommunicator( - // std::shared_ptr communicator); - // const MLCommon::cumlCommunicator& getCommunicator() const; - // bool commsInitialized() const; + void setCommunicator( + std::shared_ptr communicator) { + _communicator = communicator; + } + + const comms_t& getCommunicator() const { + ASSERT(nullptr != _communicator.get(), + "ERROR: Communicator was not initialized\n"); + return *_communicator; + } + + bool commsInitialized() const { + return (nullptr != _communicator.get()); + } + const cudaDeviceProp& getDeviceProperties() const { if (!_devicePropInitialized) { @@ -181,9 +191,7 @@ class handle_t { cudaEvent_t _event; mutable cudaDeviceProp _prop; mutable bool _devicePropInitialized; - - ///@todo: enable this once we have migrated cuml-comms - //std::shared_ptr _communicator; + std::shared_ptr _communicator; void createResources() { cudaStream_t stream; From f129710c8537572fd6d454e7433565acc1392fb7 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 7 May 2020 20:27:33 -0400 Subject: [PATCH 004/189] Looking like it's building! --- cpp/include/raft/comms/comms.hpp | 2 +- cpp/include/raft/handle.hpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index b9d9c11d07..c27ed430fd 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -40,7 +40,7 @@ class comms_t { virtual size_t getDatatypeSize(const comms_t::datatype_t datatype); template - virtual datatype_t getDataType() const; + datatype_t getDataType() const; virtual ~comms_t(); diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index ed4481364e..fab79372ff 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -34,6 +34,7 @@ #include #include #include +#include #include "allocator.hpp" #include "cudart_utils.h" From a0c13afdeb46c5cfd15c3555204b00b5bee1bb05 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 7 May 2020 20:38:56 -0400 Subject: [PATCH 005/189] Adding NCCL and UCX build for tests --- cpp/CMakeLists.txt | 1 + cpp/include/raft/comms/CMakeLists.txt | 41 +++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 cpp/include/raft/comms/CMakeLists.txt diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f5696cf121..b0783964a2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -161,6 +161,7 @@ set(CMAKE_CUDA_FLAGS # - dependencies ------------------------------------------------------------- include(cmake/Dependencies.cmake) +add_subdirectory(include/raft/comms) ############################################################################## # - include paths ------------------------------------------------------------ diff --git a/cpp/include/raft/comms/CMakeLists.txt b/cpp/include/raft/comms/CMakeLists.txt new file mode 100644 index 0000000000..41b18926ae --- /dev/null +++ b/cpp/include/raft/comms/CMakeLists.txt @@ -0,0 +1,41 @@ +# +# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) +project(comms LANGUAGES CXX CUDA) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") + +option(WITH_UCX "Uses UCX for P2P comms" ON) + +if(NOT NCCL_PATH) + find_package(NCCL REQUIRED) +else() + message("-- Manually set NCCL PATH to ${NCCL_PATH}") + set(NCCL_INCLUDE_DIRS ${NCCL_PATH}/include) + set(NCCL_LIBRARIES ${NCCL_PATH}/lib/libnccl.so) +endif(NOT NCCL_PATH) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +find_package(UCX) +include_directories(${UCX_INCLUDE_DIRS}) +add_compile_definitions(WITH_UCX=1) + +add_definitions(-DHAVE_NCCL) +include_directories( ${NCCL_INCLUDE_DIRS} ) +list(APPEND RAFT_LINK_LIBRARIES ${NCCL_LIBRARIES}) \ No newline at end of file From cb194829f3573b80f712591f6b51b10c12df1a3d Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 7 May 2020 21:22:57 -0400 Subject: [PATCH 006/189] Consolidating injection functions --- cpp/include/raft/comms/std/std_comms.hpp | 32 +++++++++-------------- cpp/include/raft/comms/std/ucp_helper.hpp | 4 --- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/cpp/include/raft/comms/std/std_comms.hpp b/cpp/include/raft/comms/std/std_comms.hpp index 13ad6773cb..7047914fde 100644 --- a/cpp/include/raft/comms/std/std_comms.hpp +++ b/cpp/include/raft/comms/std/std_comms.hpp @@ -129,28 +129,17 @@ ncclRedOp_t getNCCLOp(const comms_t::op_t op) { bool ucx_enabled() { return UCX_ENABLED; } -void inject_comms(cumlHandle &handle, ncclComm_t comm, ucp_worker_h ucp_worker, - std::shared_ptr eps, int size, int rank) { - auto communicator = std::make_shared( - std::unique_ptr( - new std_comms(comm, ucp_worker, eps, size, rank))); - handle.getImpl().setCommunicator(communicator); -} - -void inject_comms(cumlHandle &handle, ncclComm_t comm, int size, int rank) { - auto communicator = std::make_shared( - std::unique_ptr( - new std_comms(comm, size, rank))); - handle.getImpl().setCommunicator(communicator); -} - -void inject_comms_py_coll(cumlHandle *handle, ncclComm_t comm, int size, +void inject_comms(handle_t *handle, ncclComm_t comm, int size, int rank) { - inject_comms(*handle, comm, size, rank); + auto communicator = std::make_shared( + std::unique_ptr( + new std_comms(comm, size, rank))); + handle->setCommunicator(communicator); } -void inject_comms_py(ML::cumlHandle *handle, ncclComm_t comm, void *ucp_worker, +void inject_comms(handle_t *handle, ncclComm_t comm, void *ucp_worker, void *eps, int size, int rank) { + std::shared_ptr eps_sp = std::make_shared(new ucp_ep_h[size]); @@ -168,11 +157,14 @@ void inject_comms_py(ML::cumlHandle *handle, ncclComm_t comm, void *ucp_worker, } } - inject_comms(*handle, comm, (ucp_worker_h)ucp_worker, eps_sp, size, rank); + auto communicator = std::make_shared( + std::unique_ptr( + new std_comms(comm, ucp_worker, eps, size, rank))); + handle->setCommunicator(communicator); } -class std_comms : public raft::comms_t { +class std_comms : public comms_t { public: std_comms() = delete; diff --git a/cpp/include/raft/comms/std/ucp_helper.hpp b/cpp/include/raft/comms/std/ucp_helper.hpp index 584bcc8a53..2cf7bed6f8 100644 --- a/cpp/include/raft/comms/std/ucp_helper.hpp +++ b/cpp/include/raft/comms/std/ucp_helper.hpp @@ -171,8 +171,6 @@ class comms_ucp_handler { void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf, int size, int tag, ucp_tag_t tag_mask, int rank) const { ucp_tag_t ucp_tag = build_message_tag(rank, tag); -// -// CUML_LOG_DEBUG("Sending tag: %ld", ucp_tag); ucs_status_ptr_t send_result = (*(send_func))( ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); @@ -208,8 +206,6 @@ class comms_ucp_handler { void *buf, int size, int tag, ucp_tag_t tag_mask, int sender_rank) const { ucp_tag_t ucp_tag = build_message_tag(sender_rank, tag); -// -// CUML_LOG_DEBUG("%d: Receiving tag: %ld", ucp_tag); ucs_status_ptr_t recv_result = (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, From 15481774f75250a7de8c8a627433f146533b6240 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 12 May 2020 16:12:43 -0400 Subject: [PATCH 007/189] Making progress on Python. 
Need to refactor the comms interface a little bit --- cpp/include/raft/comms/CMakeLists.txt | 4 - cpp/include/raft/comms/comms.hpp | 50 ++- cpp/include/raft/comms/comms_helper.hpp | 146 ++++++++ cpp/include/raft/comms/nccl_helper.hpp | 18 + .../raft/comms/{std => }/std_comms.hpp | 134 +------ .../raft/comms/{std => }/ucp_helper.hpp | 2 +- cpp/include/raft/cudart_utils.h | 16 +- cpp/include/raft/handle.hpp | 10 +- python/raft/common/__init__.py | 0 python/raft/common/cuda.pxd | 36 ++ python/raft/common/cuda.pyx | 88 +++++ python/raft/common/handle.pxd | 38 ++ python/raft/common/handle.pyx | 108 ++++++ python/raft/dask/common/comms.py | 329 ++++++++++++++++++ python/raft/dask/common/comms_utils.pyx | 155 +++++++++ python/raft/dask/common/nccl.pyx | 233 +++++++++++++ python/raft/dask/common/ucx.py | 77 ++++ python/setup.py | 1 + 18 files changed, 1306 insertions(+), 139 deletions(-) create mode 100644 cpp/include/raft/comms/comms_helper.hpp create mode 100644 cpp/include/raft/comms/nccl_helper.hpp rename cpp/include/raft/comms/{std => }/std_comms.hpp (80%) rename cpp/include/raft/comms/{std => }/ucp_helper.hpp (99%) create mode 100644 python/raft/common/__init__.py create mode 100644 python/raft/common/cuda.pxd create mode 100644 python/raft/common/cuda.pyx create mode 100644 python/raft/common/handle.pxd create mode 100644 python/raft/common/handle.pyx create mode 100644 python/raft/dask/common/comms.py create mode 100644 python/raft/dask/common/comms_utils.pyx create mode 100644 python/raft/dask/common/nccl.pyx create mode 100644 python/raft/dask/common/ucx.py diff --git a/cpp/include/raft/comms/CMakeLists.txt b/cpp/include/raft/comms/CMakeLists.txt index 41b18926ae..734ce11812 100644 --- a/cpp/include/raft/comms/CMakeLists.txt +++ b/cpp/include/raft/comms/CMakeLists.txt @@ -19,8 +19,6 @@ project(comms LANGUAGES CXX CUDA) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -option(WITH_UCX "Uses UCX for P2P comms" ON) - if(NOT NCCL_PATH) find_package(NCCL REQUIRED) else() @@ -34,8 +32,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) find_package(UCX) include_directories(${UCX_INCLUDE_DIRS}) -add_compile_definitions(WITH_UCX=1) -add_definitions(-DHAVE_NCCL) include_directories( ${NCCL_INCLUDE_DIRS} ) list(APPEND RAFT_LINK_LIBRARIES ${NCCL_LIBRARIES}) \ No newline at end of file diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index c27ed430fd..a5152be620 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -20,6 +20,7 @@ namespace raft { +namespace comms { class comms_t { public: @@ -37,11 +38,6 @@ class comms_t { commStatusAbort }; // A failure occurred in sync, queued operations aborted - virtual size_t getDatatypeSize(const comms_t::datatype_t datatype); - - template - datatype_t getDataType() const; - virtual ~comms_t(); virtual int getSize() const = 0; @@ -84,4 +80,48 @@ class comms_t { cudaStream_t stream) const = 0; }; + +template +comms_t::datatype_t getDataType(T a); + +template <> +comms_t::datatype_t getDataType(char a) { + return comms_t::CHAR; +} + +template <> +comms_t::datatype_t getDataType(uint8_t a) { + return comms_t::UINT8; +} + +template <> +comms_t::datatype_t getDataType(int a) { + return comms_t::INT; +} + +template <> +comms_t::datatype_t getDataType(uint32_t a) { + return comms_t::UINT; +} + +template <> +comms_t::datatype_t getDataType(int64_t a) { + return comms_t::INT64; +} + +template <> +comms_t::datatype_t getDataType(uint64_t a) { + return comms_t::UINT64; +} + +template <> 
+comms_t::datatype_t getDataType(float a) {
+  return comms_t::FLOAT;
+}
+
+template <>
+comms_t::datatype_t getDataType(double a) {
+  return comms_t::DOUBLE;
+}
+}  // namespace comms
+}  // namespace raft
diff --git a/cpp/include/raft/comms/comms_helper.hpp b/cpp/include/raft/comms/comms_helper.hpp
new file mode 100644
index 0000000000..f1a5482334
--- /dev/null
+++ b/cpp/include/raft/comms/comms_helper.hpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/comms/std_comms.hpp>
+#include <raft/handle.hpp>
+#include <raft/mr/device/buffer.hpp>
+#include <iostream>
+
+namespace raft {
+namespace comms {
+
+void build_comms_nccl_only(handle_t *handle, ncclComm_t comm, int size,
+                           int rank) {
+
+  auto *raft_comm = new raft::comms::std_comms(comm, size, rank);
+  auto communicator = std::make_shared(
+    std::unique_ptr(raft_comm));
+  handle->set_comms(communicator);
+}
+
+void build_comms_nccl_ucx(handle_t *handle, ncclComm_t comm, void *ucp_worker,
+                          void *eps, int size, int rank) {
+
+  std::shared_ptr<ucp_ep_h *> eps_sp =
+    std::make_shared<ucp_ep_h *>(new ucp_ep_h[size]);
+
+  size_t *size_t_ep_arr = (size_t *)eps;
+
+  for (int i = 0; i < size; i++) {
+    size_t ptr = size_t_ep_arr[i];
+    ucp_ep_h *ucp_ep_v = (ucp_ep_h *)*eps_sp;
+
+    if (ptr != 0) {
+      ucp_ep_h eps_ptr = (ucp_ep_h)size_t_ep_arr[i];
+      ucp_ep_v[i] = eps_ptr;
+    } else {
+      ucp_ep_v[i] = nullptr;
+    }
+  }
+
+  auto communicator = std::make_shared(
+    std::unique_ptr(
+      new raft::comms::std_comms(comm, (ucp_worker_h)ucp_worker, eps_sp, size, rank)));
+  handle->set_comms(communicator);
+}
+
+
+bool test_collective_allreduce(const handle_t& handle) {
+  const comms_t& communicator = handle.get_comms();
+
+  const int send = 1;
+
+  cudaStream_t stream = handle.get_stream();
+
+  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
+  temp_d.resize(1, stream);
+  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
+                             cudaMemcpyHostToDevice, stream));
+  communicator.allreduce(temp_d.data(), temp_d.data(), 1, getDataType(temp_d.data()),
+                         comms_t::SUM, stream);
+  int temp_h = 0;
+  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int),
+                             cudaMemcpyDeviceToHost, stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  communicator.barrier();
+
+  std::cout << "Clique size: " << communicator.getSize() << std::endl;
+  std::cout << "final_size: " << temp_h << std::endl;
+
+  return temp_h == communicator.getSize();
+}
+
+bool test_pointToPoint_simple_send_recv(const handle_t& h,
+                                        int numTrials) {
+  const comms_t& communicator = h.get_comms();
+  const int rank = communicator.getRank();
+
+  bool ret = true;
+  for (int i = 0; i < numTrials; i++) {
+    std::vector received_data((communicator.getSize() - 1), -1);
+
+    std::vector requests;
+    requests.resize(2 * (communicator.getSize() - 1));
+    int request_idx = 0;
+    // post receives
+    for (int r = 0; r < communicator.getSize(); ++r) {
+      if (r != rank) {
+        communicator.irecv(received_data.data() + request_idx, 1, r, 0,
+                           requests.data() + request_idx);
+        ++request_idx;
+      }
+    }
+
+    for (int r = 0;
r < communicator.getSize(); ++r) { + if (r != rank) { + communicator.isend(&rank, 1, r, 0, requests.data() + request_idx); + ++request_idx; + } + } + + communicator.waitall(requests.size(), requests.data()); + communicator.barrier(); + + if (communicator.getRank() == 0) { + std::cout << "=========================" << std::endl; + std::cout << "Trial " << i << std::endl; + } + + for (int printrank = 0; printrank < communicator.getSize(); ++printrank) { + if (communicator.getRank() == printrank) { + std::cout << "Rank " << communicator.getRank() << " received: ["; + for (int i = 0; i < received_data.size(); i++) { + auto rec = received_data[i]; + std::cout << rec; + if (rec == -1) ret = false; + communicator.barrier(); + if (i < received_data.size() - 1) std::cout << ", "; + } + std::cout << "]" << std::endl; + } + + communicator.barrier(); + } + + if (communicator.getRank() == 0) + std::cout << "=========================" << std::endl; + } + + return ret; +} + +}; // namespace comms +}; // end namespace raft diff --git a/cpp/include/raft/comms/nccl_helper.hpp b/cpp/include/raft/comms/nccl_helper.hpp new file mode 100644 index 0000000000..5f367bafae --- /dev/null +++ b/cpp/include/raft/comms/nccl_helper.hpp @@ -0,0 +1,18 @@ +#include + + +namespace raft { +namespace comms { +inline void ncclUniqueIdFromChar(ncclUniqueId *id, char *uniqueId, int size) { + memcpy(id->internal, uniqueId, size); +} + +inline void get_unique_id(char *uid, int size) { + ncclUniqueId id; + ncclGetUniqueId(&id); + + memcpy(uid, id.internal, size); +} +} +} + diff --git a/cpp/include/raft/comms/std/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp similarity index 80% rename from cpp/include/raft/comms/std/std_comms.hpp rename to cpp/include/raft/comms/std_comms.hpp index 7047914fde..7906cf2027 100644 --- a/cpp/include/raft/comms/std/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -22,7 +22,7 @@ #include -#include +#include #include #include @@ -30,8 +30,6 @@ #include -constexpr bool UCX_ENABLED = true; - #include #include #include @@ -68,7 +66,7 @@ constexpr bool UCX_ENABLED = true; namespace raft { -namespace { +namespace comms { size_t getDatatypeSize(const comms_t::datatype_t datatype) { switch (datatype) { @@ -91,6 +89,9 @@ size_t getDatatypeSize(const comms_t::datatype_t datatype) { } } + + + ncclDataType_t getNCCLDatatype( const comms_t::datatype_t datatype) { switch (datatype) { @@ -125,43 +126,6 @@ ncclRedOp_t getNCCLOp(const comms_t::op_t op) { return ncclMax; } } -} // namespace - -bool ucx_enabled() { return UCX_ENABLED; } - -void inject_comms(handle_t *handle, ncclComm_t comm, int size, - int rank) { - auto communicator = std::make_shared( - std::unique_ptr( - new std_comms(comm, size, rank))); - handle->setCommunicator(communicator); -} - -void inject_comms(handle_t *handle, ncclComm_t comm, void *ucp_worker, - void *eps, int size, int rank) { - - std::shared_ptr eps_sp = - std::make_shared(new ucp_ep_h[size]); - - size_t *size_t_ep_arr = (size_t *)eps; - - for (int i = 0; i < size; i++) { - size_t ptr = size_t_ep_arr[i]; - ucp_ep_h *ucp_ep_v = (ucp_ep_h *)*eps_sp; - - if (ptr != 0) { - ucp_ep_h eps_ptr = (ucp_ep_h)size_t_ep_arr[i]; - ucp_ep_v[i] = eps_ptr; - } else { - ucp_ep_v[i] = nullptr; - } - } - - auto communicator = std::make_shared( - std::unique_ptr( - new std_comms(comm, ucp_worker, eps, size, rank))); - handle->setCommunicator(communicator); -} class std_comms : public comms_t { @@ -206,67 +170,7 @@ class std_comms : public comms_t { CUDA_CHECK_NO_THROW(cudaFree(_recvbuff)); } 
- size_t getDatatypeSize(const c::datatype_t datatype) { - switch (datatype) { - case comms_t::CHAR: - return sizeof(char); - case comms_t::UINT8: - return sizeof(uint8_t); - case comms_t::INT: - return sizeof(int); - case comms_t::UINT: - return sizeof(unsigned int); - case comms_t::INT64: - return sizeof(int64_t); - case comms_t::UINT64: - return sizeof(uint64_t); - case comms_t::FLOAT: - return sizeof(float); - case comms_t::DOUBLE: - return sizeof(double); - } - } - - template <> - comms_t::datatype_t getDataType() const { - return comms_t::CHAR; - } - - template <> - comms_t::datatype_t getDataType() const { - return comms_t::UINT8; - } - - template <> - comms_t::datatype_t getDataType() const { - return comms_t::INT; - } - - template <> - comms_t::datatype_t getDataType() const { - return comms_t::UINT; - } - - template <> - comms_t::datatype_t getDataType() const { - return comms_t::INT64; - } - - template <> - comms_t::datatype_t getDataType() const { - return comms_t::UINT64; - } - - template <> - comms_t::datatype_t getDataType() const { - return comms_t::FLOAT; - } - - template <> - comms_t::datatype_t getDataType() const { - return comms_t::DOUBLE; - } void initialize() { CUDA_CHECK(cudaStreamCreate(&_stream)); @@ -315,9 +219,6 @@ class std_comms : public comms_t { void isend(const void *buf, int size, int dest, int tag, request_t *request) const { - ASSERT(UCX_ENABLED, "Comms not built with UCX support"); - ASSERT(p2p_enabled, - "Comms instance was not initialized for point-to-point"); ASSERT(_ucp_worker != nullptr, "ERROR: UCX comms not initialized on communicator."); @@ -335,10 +236,6 @@ class std_comms : public comms_t { void irecv(void *buf, int size, int source, int tag, request_t *request) const { - ASSERT(UCX_ENABLED, "Comms not built with UCX support"); - ASSERT(p2p_enabled, - "Comms instance was not initialized for point-to-point"); - ASSERT(_ucp_worker != nullptr, "ERROR: UCX comms not initialized on communicator."); @@ -357,9 +254,6 @@ class std_comms : public comms_t { void waitall(int count, request_t array_of_requests[]) const { - ASSERT(UCX_ENABLED, "Comms not built with UCX support"); - ASSERT(p2p_enabled, - "Comms instance was not initialized for point-to-point"); ASSERT(_ucp_worker != nullptr, "ERROR: UCX comms not initialized on communicator."); @@ -457,18 +351,25 @@ class std_comms : public comms_t { getNCCLDatatype(datatype), _nccl_comm, stream)); } +// const void* sendbuf, void* recvbuf, +// const int recvcounts[], const int displs[], +// datatype_t datatype, cudaStream_t stream +// + void allgatherv(const void *sendbuf, void *recvbuf, const int recvcounts[], const int displs[], - datatype_t datatype, + comms_t::datatype_t datatype, cudaStream_t stream) const { //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf //Listing 1 on page 4. 
- for (int root = 0; root < _size; ++root) + for (int root = 0; root < _size; ++root) { + size_t dtype_size = getDatatypeSize(datatype); NCCL_CHECK(ncclBroadcast( sendbuf, - static_cast(recvbuf) + displs[root] * getDatatypeSize(datatype), + static_cast(recvbuf) + displs[root] * dtype_size, recvcounts[root], getNCCLDatatype(datatype), root, _nccl_comm, stream)); + } } void reducescatter(const void *sendbuff, @@ -480,7 +381,7 @@ class std_comms : public comms_t { _nccl_comm, stream)); } - status_t std_comms::syncStream( + comms_t::status_t syncStream( cudaStream_t stream) const { cudaError_t cudaErr; ncclResult_t ncclErr, ncclAsyncErr; @@ -531,5 +432,6 @@ class std_comms : public comms_t { _requests_in_flight; mutable std::unordered_set _free_requests; }; +} -} // end namespace ML +} // end namespace raft diff --git a/cpp/include/raft/comms/std/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp similarity index 99% rename from cpp/include/raft/comms/std/ucp_helper.hpp rename to cpp/include/raft/comms/ucp_helper.hpp index 2cf7bed6f8..ee22b59101 100644 --- a/cpp/include/raft/comms/std/ucp_helper.hpp +++ b/cpp/include/raft/comms/ucp_helper.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include #include diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 8bd4caf121..93543d09de 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -112,14 +112,14 @@ class exception : public std::exception { // * @brief check for cuda runtime API errors but log error instead of raising // * exception. // */ -// #define CUDA_CHECK_NO_THROW(call) \ -// do { \ -// cudaError_t status = call; \ -// if (status != cudaSuccess) { \ -// RAFT_LOG_ERROR("CUDA call='%s' at file=%s line=%d failed with %s ", \ -// #call, __FILE__, __LINE__, cudaGetErrorString(status)); \ -// } \ -// } while (0) + #define CUDA_CHECK_NO_THROW(call) \ + do { \ + cudaError_t status = call; \ + if (status != cudaSuccess) { \ + printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ + #call, __FILE__, __LINE__, cudaGetErrorString(status)); \ + } \ + } while (0) /** helper method to get max usable shared mem per block parameter */ inline int get_shared_memory_per_block() { diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 297760c56b..055fdf9e3f 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -149,18 +149,18 @@ class handle_t { } } - void setCommunicator( - std::shared_ptr communicator) { + void set_comms( + std::shared_ptr communicator) { _communicator = communicator; } - const comms_t& getCommunicator() const { + const comms::comms_t& get_comms() const { ASSERT(nullptr != _communicator.get(), "ERROR: Communicator was not initialized\n"); return *_communicator; } - bool commsInitialized() const { + bool comms_initialized() const { return (nullptr != _communicator.get()); } @@ -175,7 +175,7 @@ class handle_t { } private: - std::shared_ptr _communicator; + std::shared_ptr _communicator; const int dev_id_; const int num_streams_; diff --git a/python/raft/common/__init__.py b/python/raft/common/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/raft/common/cuda.pxd b/python/raft/common/cuda.pxd new file mode 100644 index 0000000000..e407213f44 --- /dev/null +++ b/python/raft/common/cuda.pxd @@ -0,0 +1,36 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + + +# Populate this with more typedef's (eg: events) as and when needed +cdef extern from * nogil: + ctypedef void* _Stream "cudaStream_t" + ctypedef int _Error "cudaError_t" + + +# Populate this with more runtime api method declarations as and when needed +cdef extern from "cuda_runtime_api.h" nogil: + _Error cudaStreamCreate(_Stream* s) + _Error cudaStreamDestroy(_Stream s) + _Error cudaStreamSynchronize(_Stream s) + _Error cudaGetLastError() + const char* cudaGetErrorString(_Error e) + const char* cudaGetErrorName(_Error e) diff --git a/python/raft/common/cuda.pyx b/python/raft/common/cuda.pyx new file mode 100644 index 0000000000..09f347058f --- /dev/null +++ b/python/raft/common/cuda.pyx @@ -0,0 +1,88 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +import functools +from libcpp.string cimport string + + +class CudaRuntimeError(RuntimeError): + def __init__(self, extraMsg=None): + cdef _Error e = cudaGetLastError() + cdef bytes errMsg = cudaGetErrorString(e) + cdef bytes errName = cudaGetErrorName(e) + msg = "Error! %s reason='%s'" % (errName.decode(), errMsg.decode()) + if extraMsg is not None: + msg += " extraMsg='%s'" % extraMsg + super(CudaRuntimeError, self).__init__(msg) + + +cdef class Stream: + """ + Stream represents a thin-wrapper around cudaStream_t and its operations. + + Examples + -------- + + .. code-block:: python + + import cuml + stream = cuml.cuda.Stream() + stream.sync() + del stream # optional! + """ + + # NOTE: + # If we store _Stream directly, this always leads to the following error: + # "Cannot convert Python object to '_Stream'" + # I was unable to find a good solution to this in reasonable time. Also, + # since cudaStream_t is a pointer anyways, storing it as an integer should + # be just fine (although, that certainly is ugly and hacky!). 
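+    # (Each Stream therefore owns exactly one cudaStream_t for its
+    # lifetime: __cinit__ creates it, __dealloc__ synchronizes and then
+    # destroys it, and getStream() below hands the raw value back as an
+    # integer.)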
+ cdef size_t s + + def __cinit__(self): + if self.s != 0: + return + cdef _Stream stream + cdef _Error e = cudaStreamCreate(&stream) + if e != 0: + raise CudaRuntimeError("Stream create") + self.s = stream + + def __dealloc__(self): + self.sync() + cdef _Stream stream = <_Stream>self.s + cdef _Error e = cudaStreamDestroy(stream) + if e != 0: + raise CudaRuntimeError("Stream destroy") + + def sync(self): + """ + Synchronize on the cudastream owned by this object. Note that this + could raise exception due to issues with previous asynchronous + launches + """ + cdef _Stream stream = <_Stream>self.s + cdef _Error e = cudaStreamSynchronize(stream) + if e != 0: + raise CudaRuntimeError("Stream sync") + + def getStream(self): + return self.s diff --git a/python/raft/common/handle.pxd b/python/raft/common/handle.pxd new file mode 100644 index 0000000000..ea40495a2a --- /dev/null +++ b/python/raft/common/handle.pxd @@ -0,0 +1,38 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + + +from libcpp.memory cimport shared_ptr +cimport raft.common.cuda + + +cdef extern from "raft/mr/device/allocator.hpp" namespace "raft::mr::device" nogil: + cdef cppclass allocator: + pass + +cdef extern from "raft/handle.hpp" namespace "raft" nogil: + cdef cppclass handle_t: + handle_t() except + + handle_t(int ns) except + + void set_stream(raft.common.cuda._Stream s) except + + void set_device_allocator(shared_ptr[allocator] a) except + + raft.common.cuda._Stream get_stream() except + + int get_num_internal_streams() except + \ No newline at end of file diff --git a/python/raft/common/handle.pyx b/python/raft/common/handle.pyx new file mode 100644 index 0000000000..75c10876ea --- /dev/null +++ b/python/raft/common/handle.pyx @@ -0,0 +1,108 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+import raft
+from libcpp.memory cimport shared_ptr
+from raft.common.cuda cimport _Stream, _Error, cudaStreamSynchronize
+
+#
+# cdef extern from "cuml/common/rmmAllocatorAdapter.hpp" namespace "ML" nogil:
+#     cdef cppclass rmmAllocatorAdapter(deviceAllocator):
+#         pass
+
+cdef class Handle:
+    """
+    Handle is a lightweight python wrapper around the corresponding C++ class
+    of cumlHandle exposed by cuML's C++ interface. Refer to the header file
+    cuml/cuml.hpp for interface level details of this struct.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        import cuml
+        stream = cuml.cuda.Stream()
+        handle = cuml.Handle()
+        handle.setStream(stream)
+        handle.enableRMM()   # Enable RMM as the device-side allocator
+
+        # call ML algos here
+
+        # final sync of all work launched in the stream of this handle
+        # this is same as `cuml.cuda.Stream.sync()` call, but safer in case
+        # the default stream inside the `cumlHandle` is being used
+        handle.sync()
+        del handle  # optional!
+    """
+
+    # ML::cumlHandle doesn't have copy operator. So, use pointer for the object
+    # python world cannot access to this raw object directly, hence use
+    # 'size_t'!
+    cdef size_t h
+
+    # not using __dict__ unless we need it to keep this Extension as lean as
+    # possible
+    cdef int n_streams
+
+    def __cinit__(self, n_streams=0):
+        self.n_streams = n_streams
+        self.h = <size_t>(new handle_t(n_streams))
+        # cdef shared_ptr[deviceAllocator] rmmAlloc = (
+        #     shared_ptr[deviceAllocator](new rmmAllocatorAdapter()))
+        # cdef cumlHandle* h_ = <cumlHandle*>self.h
+        # h_.setDeviceAllocator(rmmAlloc)
+
+    def __dealloc__(self):
+        h_ = <handle_t*>self.h
+        del h_
+
+    def setStream(self, stream):
+        cdef size_t s = <size_t>stream.getStream()
+        cdef handle_t* h_ = <handle_t*>self.h
+        h_.set_stream(<_Stream>s)
+
+    def sync(self):
+        """
+        Issues a sync on the stream set for this handle.
+
+        Once we make `cuml.cuda.Stream` a mandatory option for creating
+        `cuml.Handle`, this should go away
+        """
+        cdef handle_t* h_ = <handle_t*>self.h
+        cdef _Stream stream = h_.get_stream()
+        cdef _Error e = cudaStreamSynchronize(stream)
+        if e != 0:
+            raise raft.cuda.CudaRuntimeError("Stream sync")
+
+    def getHandle(self):
+        return self.h
+
+    def getNumInternalStreams(self):
+        cdef handle_t* h_ = <handle_t*>self.h
+        return h_.get_num_internal_streams()
+
+    def __getstate__(self):
+        return self.n_streams
+
+    def __setstate__(self, state):
+        self.n_streams = state
+        self.h = <size_t>(new handle_t(self.n_streams))
diff --git a/python/raft/dask/common/comms.py b/python/raft/dask/common/comms.py
new file mode 100644
index 0000000000..7273df3f16
--- /dev/null
+++ b/python/raft/dask/common/comms.py
@@ -0,0 +1,329 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +from cuml.nccl import nccl +from cuml.dask.common.ucx import UCX + +from .comms_utils import inject_comms_on_handle +from .comms_utils import inject_comms_on_handle_coll_only + +from .utils import parse_host_port +from cuml.common.handle import Handle + +from dask.distributed import get_worker, default_client + +import warnings + +import time +import uuid + + +def worker_state(sessionId=None): + """ + Retrieves cuML comms state on local worker for the given + sessionId, creating a new session if it does not exist. + If no session id is given, returns the state dict for all + sessions. + :param sessionId: + :return: + """ + worker = get_worker() + if not hasattr(worker, "_cuml_comm_state"): + worker._cuml_comm_state = {} + if sessionId is not None and sessionId not in worker._cuml_comm_state: + # Build state for new session and mark session creation time + worker._cuml_comm_state[sessionId] = {"ts": time.time()} + + if sessionId is not None: + return worker._cuml_comm_state[sessionId] + return worker._cuml_comm_state + + +def get_ucx(): + """ + A simple convenience wrapper to make sure UCP listener and + endpoints are only ever assigned once per worker. + """ + if "ucx" not in worker_state("ucp"): + worker_state("ucp")["ucx"] = UCX.get() + return worker_state("ucp")["ucx"] + + +def _func_ucp_listener_port(): + return get_ucx().listener_port() + + +async def _func_init_all(sessionId, uniqueId, comms_p2p, + worker_info, verbose, streams_per_handle): + + session_state = worker_state(sessionId) + session_state["nccl_uid"] = uniqueId + session_state["wid"] = worker_info[get_worker().address]["rank"] + session_state["nworkers"] = len(worker_info) + + if verbose: + print("Initializing NCCL") + start = time.time() + + _func_init_nccl(sessionId, uniqueId) + + if verbose: + elapsed = time.time() - start + print("NCCL Initialization took: %f seconds." % elapsed) + + if comms_p2p: + if verbose: + print("Initializing UCX Endpoints") + + if verbose: + start = time.time() + await _func_ucp_create_endpoints(sessionId, worker_info) + + if verbose: + elapsed = time.time() - start + print("Done initializing UCX endpoints. Took: %f seconds." % + elapsed) + print("Building handle") + + _func_build_handle_p2p(sessionId, streams_per_handle, verbose) + + if verbose: + print("Done building handle.") + + else: + _func_build_handle(sessionId, streams_per_handle, verbose) + + +def _func_init_nccl(sessionId, uniqueId): + """ + Initialize ncclComm_t on worker + :param workerId: int ID of the current worker running the function + :param nWorkers: int Number of workers in the cluster + :param uniqueId: array[byte] The NCCL unique Id generated from the + client. 
+ """ + + wid = worker_state(sessionId)["wid"] + nWorkers = worker_state(sessionId)["nworkers"] + + try: + n = nccl() + n.init(nWorkers, uniqueId, wid) + worker_state(sessionId)["nccl"] = n + except Exception: + print("An error occurred initializing NCCL!") + + +def _func_build_handle_p2p(sessionId, streams_per_handle, verbose): + """ + Builds a cumlHandle on the current worker given the initialized comms + :param nccl_comm: ncclComm_t Initialized NCCL comm + :param eps: size_t initialized endpoints + :param nWorkers: int number of workers in cluster + :param workerId: int Rank of current worker + :return: + """ + ucp_worker = get_ucx().get_worker() + session_state = worker_state(sessionId) + + handle = Handle(streams_per_handle) + nccl_comm = session_state["nccl"] + eps = session_state["ucp_eps"] + nWorkers = session_state["nworkers"] + workerId = session_state["wid"] + + inject_comms_on_handle(handle, nccl_comm, ucp_worker, eps, + nWorkers, workerId, verbose) + + worker_state(sessionId)["handle"] = handle + + +def _func_build_handle(sessionId, streams_per_handle, verbose): + """ + Builds a cumlHandle on the current worker given the initialized comms + :param nccl_comm: ncclComm_t Initialized NCCL comm + :param nWorkers: int number of workers in cluster + :param workerId: int Rank of current worker + :return: + """ + handle = Handle(streams_per_handle) + + session_state = worker_state(sessionId) + + workerId = session_state["wid"] + nWorkers = session_state["nworkers"] + + nccl_comm = session_state["nccl"] + inject_comms_on_handle_coll_only(handle, nccl_comm, nWorkers, + workerId, verbose) + session_state["handle"] = handle + + +def _func_store_initial_state(nworkers, sessionId, uniqueId, wid): + session_state = worker_state(sessionId) + session_state["nccl_uid"] = uniqueId + session_state["wid"] = wid + session_state["nworkers"] = nworkers + + +async def _func_ucp_create_endpoints(sessionId, worker_info): + """ + Runs on each worker to create ucp endpoints to all other workers + :param sessionId: uuid unique id for this instance + :param worker_info: dict Maps worker address to rank & UCX port + :param r: float a random number to stop the function from being cached + """ + dask_worker = get_worker() + local_address = dask_worker.address + + eps = [None] * len(worker_info) + count = 1 + + for k in worker_info: + if str(k) != str(local_address): + + ip, port = parse_host_port(k) + + ep = await get_ucx().get_endpoint(ip, worker_info[k]["port"]) + + eps[worker_info[k]["rank"]] = ep + count += 1 + + worker_state(sessionId)["ucp_eps"] = eps + + +async def _func_destroy_all(sessionId, comms_p2p, verbose=False): + worker_state(sessionId)["nccl"].destroy() + del worker_state(sessionId)["nccl"] + del worker_state(sessionId)["handle"] + + +def _func_ucp_ports(client, workers): + return client.run(_func_ucp_listener_port, + workers=workers) + + +def _func_worker_ranks(workers): + """ + Builds a dictionary of { (worker_address, worker_port) : worker_rank } + """ + return dict(list(zip(workers, range(len(workers))))) + + +class CommsContext: + + """ + A base class to initialize and manage underlying NCCL and UCX + comms handles across a Dask cluster. Classes extending CommsContext + are responsible for calling `self.init()` to initialize the comms. + Classes that extend or use the CommsContext are also responsible for + calling `destroy()` to clean up the underlying comms. + + This class is not meant to be thread-safe. 
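+
+    Examples
+    --------
+    A minimal sketch of the intended workflow (assumes a running
+    ``dask.distributed`` cluster with one GPU worker per rank;
+    illustrative only):
+
+    .. code-block:: python
+
+        cb = CommsContext(comms_p2p=False, verbose=True)
+        cb.init()      # bootstraps NCCL across the cluster's workers
+
+        # ... run multi-GPU algorithms against the per-worker handles ...
+
+        cb.destroy()   # explicitly clean up the underlying comms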
+ """ + + def __init__(self, comms_p2p=False, client=None, verbose=False, + streams_per_handle=0): + """ + Construct a new CommsContext instance + :param comms_p2p: bool Should p2p comms be initialized? + """ + self.client = client if client is not None else default_client() + self.comms_p2p = comms_p2p + + self.streams_per_handle = streams_per_handle + + self.sessionId = uuid.uuid4().bytes + + self.nccl_initialized = False + self.ucx_initialized = False + + self.verbose = verbose + + if verbose: + print("Initializing comms!") + + def __del__(self): + if self.nccl_initialized or self.ucx_initialized: + self.destroy() + + def worker_info(self, workers): + """ + Builds a dictionary of { (worker_address, worker_port) : + (worker_rank, worker_port ) } + """ + ranks = _func_worker_ranks(workers) + ports = _func_ucp_ports(self.client, workers) \ + if self.comms_p2p else None + + output = {} + for k in ranks.keys(): + output[k] = {"rank": ranks[k]} + if self.comms_p2p: + output[k]["port"] = ports[k] + return output + + def init(self, workers=None): + """ + Initializes the underlying comms. NCCL is required but + UCX is only initialized if `comms_p2p == True` + """ + + self.worker_addresses = list(set((self.client.has_what().keys() + if workers is None else workers))) + + if self.nccl_initialized: + warnings.warn("CommsContext has already been initialized.") + return + + worker_info = self.worker_info(self.worker_addresses) + worker_info = {w: worker_info[w] for w in self.worker_addresses} + + self.uniqueId = nccl.get_unique_id() + + self.client.run(_func_init_all, + self.sessionId, + self.uniqueId, + self.comms_p2p, + worker_info, + self.verbose, + self.streams_per_handle, + workers=self.worker_addresses, + wait=True) + + self.nccl_initialized = True + + if self.comms_p2p: + self.ucx_initialized = True + + if self.verbose: + print("Initialization complete.") + + def destroy(self): + """ + Shuts down initialized comms and cleans up resources. + """ + self.client.run(_func_destroy_all, + self.sessionId, + self.comms_p2p, + self.verbose, + wait=True, + workers=self.worker_addresses) + + if self.verbose: + print("Destroying comms.") + + self.nccl_initialized = False + self.ucx_initialized = False diff --git a/python/raft/dask/common/comms_utils.pyx b/python/raft/dask/common/comms_utils.pyx new file mode 100644 index 0000000000..099e38cf14 --- /dev/null +++ b/python/raft/dask/common/comms_utils.pyx @@ -0,0 +1,155 @@ +# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+from libc.stdlib cimport malloc, free
+from cython.operator cimport dereference as deref
+
+from cpython.long cimport PyLong_AsVoidPtr
+
+from libcpp cimport bool
+
+
+from libc.stdint cimport uintptr_t
+
+cdef extern from "nccl.h":
+
+    cdef struct ncclComm
+    ctypedef ncclComm *ncclComm_t
+
+
+cdef extern from "raft/handle.hpp" namespace "raft":
+    cdef cppclass handle_t:
+        handle_t() except +
+
+cdef extern from "raft/comms/std_comms.hpp" namespace "raft::comms":
+
+    cdef cppclass std_comms:
+        pass
+
+    void build_comms_nccl_ucx(handle_t *handle,
+                              ncclComm_t comm,
+                              void *ucp_worker,
+                              void *eps,
+                              int size,
+                              int rank) except +
+
+    void build_comms_nccl_only(handle_t *handle,
+                               ncclComm_t comm,
+                               int size,
+                               int rank) except +
+
+
+cdef extern from "raft/comms/comms_helper.hpp" namespace "raft::comms":
+
+    void build_comms_nccl_ucx(handle_t *handle,
+                              ncclComm_t comm,
+                              void *ucp_worker,
+                              void *eps,
+                              int size,
+                              int rank) except +
+
+    void build_comms_nccl_only(handle_t *handle,
+                               ncclComm_t comm,
+                               int size,
+                               int rank) except +
+
+
+    bool test_collective_allreduce(const handle_t &h) except +
+    bool test_pointToPoint_simple_send_recv(const handle_t &h,
+                                            int numTrials) except +
+
+
+def perform_test_comms_allreduce(handle):
+    """
+    Performs an allreduce on the current worker
+    :param handle: Handle handle containing cumlCommunicator to use
+    """
+    cdef const handle_t* h = <handle_t*><size_t>handle.getHandle()
+    return test_collective_allreduce(deref(h))
+
+
+def perform_test_comms_send_recv(handle, n_trials):
+    """
+    Performs a p2p send/recv on the current worker
+    :param handle: Handle handle containing cumlCommunicator to use
+    :param n_trials: int number of send/recv trials to run
+    """
+    cdef const handle_t *h = <handle_t*><size_t>handle.getHandle()
+    return test_pointToPoint_simple_send_recv(deref(h), n_trials)
+
+
+
+def inject_comms_on_handle_coll_only(handle, nccl_inst, size, rank, verbose):
+    """
+    Given a handle and initialized nccl comm, creates a cumlCommunicator
+    instance and injects it into the handle.
+    :param handle: Handle cumlHandle to inject comms into
+    :param nccl_inst: ncclComm_t initialized nccl comm
+    :param size: int number of workers in cluster
+    :param rank: int rank of current worker
+    """
+
+    cdef size_t handle_size_t = <size_t>handle.getHandle()
+    handle_ = <handle_t*>handle_size_t
+
+    cdef size_t nccl_comm_size_t = <size_t>nccl_inst.get_comm()
+    nccl_comm_ = <ncclComm_t*>nccl_comm_size_t
+
+    build_comms_nccl_only(handle_,
+                          deref(nccl_comm_),
+                          size,
+                          rank)
+
+
+def inject_comms_on_handle(handle, nccl_inst, ucp_worker, eps, size,
+                           rank, verbose):
+    """
+    Given a handle and initialized comms, creates a cumlCommunicator instance
+    and injects it into the handle.
+    :param handle: Handle cumlHandle to inject comms into
+    :param nccl_inst: ncclComm_t initialized nccl comm
+    :param ucp_worker: size_t initialized ucp_worker_h instance
+    :param eps: size_t array of initialized ucp_ep_h instances
+    :param size: int number of workers in cluster
+    :param rank: int rank of current worker
+    """
+    cdef size_t *ucp_eps = <size_t*> malloc(len(eps)*sizeof(size_t))
+
+    for i in range(len(eps)):
+        if eps[i] is not None:
+            ep_st = eps[i].get_ucp_endpoint()
+            ucp_eps[i] = <size_t>ep_st
+        else:
+            ucp_eps[i] = 0
+
+    cdef void* ucp_worker_st = <void*><size_t>ucp_worker
+
+    cdef size_t handle_size_t = <size_t>handle.getHandle()
+    handle_ = <handle_t*>handle_size_t
+
+    cdef size_t nccl_comm_size_t = <size_t>nccl_inst.get_comm()
+    nccl_comm_ = <ncclComm_t*>nccl_comm_size_t
+
+    build_comms_nccl_ucx(handle_,
+                         deref(nccl_comm_),
+                         ucp_worker_st,
+                         <void*>ucp_eps,
+                         size,
+                         rank)
+
+    free(ucp_eps)
diff --git a/python/raft/dask/common/nccl.pyx b/python/raft/dask/common/nccl.pyx
new file mode 100644
index 0000000000..e6e2f8fcb1
--- /dev/null
+++ b/python/raft/dask/common/nccl.pyx
@@ -0,0 +1,233 @@
+#
+# Copyright (c) 2019, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+from cython.operator cimport dereference as deref
+
+from libcpp cimport bool
+from libc.stdlib cimport malloc, free
+
+cdef extern from "raft/include/comms/nccl_helper.hpp" namespace "raft::coms":
+    void get_unique_id(char *uid, int size) except +
+    void ncclUniqueIdFromChar(ncclUniqueId *id,
+                              char *uniqueId,
+                              int size) except +
+
+cdef extern from "nccl.h":
+
+    cdef struct ncclComm
+
+    ctypedef struct ncclUniqueId:
+        char *internal[128]
+
+    ctypedef ncclComm *ncclComm_t
+
+    ctypedef enum ncclResult_t:
+        ncclSuccess
+        ncclUnhandledCudaError
+        ncclSystemError
+        ncclInternalError
+        ncclInvalidArgument
+        ncclInvalidUsage
+        ncclNumResults
+
+    ncclResult_t ncclCommInitRank(ncclComm_t *comm,
+                                  int nranks,
+                                  ncclUniqueId commId,
+                                  int rank) nogil
+
+    ncclResult_t ncclGetUniqueId(ncclUniqueId *uniqueId) nogil
+
+    ncclResult_t ncclCommUserRank(const ncclComm_t comm, int *rank) nogil
+
+    ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int *count) nogil
+
+    const char *ncclGetErrorString(ncclResult_t result) nogil
+
+    ncclResult_t ncclCommAbort(ncclComm_t comm) nogil
+
+    ncclResult_t ncclCommDestroy(ncclComm_t comm) nogil
+
+NCCL_UNIQUE_ID_BYTES = 128
+
+
+def unique_id():
+    """
+    Returns a new ncclUniqueId converted to a
+    character array that can be safely serialized
+    and shared to a remote worker.
+    :return: string a 128-byte unique id string
+    """
+    cdef char *uid = <char *> malloc(NCCL_UNIQUE_ID_BYTES * sizeof(char))
+    get_unique_id(uid, NCCL_UNIQUE_ID_BYTES)
+    c_str = uid[:NCCL_UNIQUE_ID_BYTES-1]
+    free(uid)
+    return c_str
+
+
+cdef class nccl:
+    """
+    A NCCL wrapper for initializing and closing NCCL comms
+    in Python.
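+
+    Examples
+    --------
+    A minimal single-rank sketch (illustrative; in a real cluster the
+    unique id is generated once and shared with every worker first):
+
+    .. code-block:: python
+
+        uid = nccl.get_unique_id()
+        comm = nccl()
+        comm.init(nranks=1, commId=uid, rank=0)
+        print(comm.user_rank(), comm.cu_device())
+        comm.destroy()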
+    """
+    cdef ncclComm_t *comm
+
+    cdef int size
+    cdef int rank
+
+    def __cinit__(self):
+        self.comm = <ncclComm_t*>malloc(sizeof(ncclComm_t))
+
+    def __dealloc__(self):
+
+        comm_ = self.comm
+
+        if comm_ != NULL:
+            free(self.comm)
+            self.comm = NULL
+
+    @staticmethod
+    def get_unique_id():
+        """
+        Returns a new nccl unique id
+        :return: string nccl unique id
+        """
+        return unique_id()
+
+    def init(self, nranks, commId, rank):
+        """
+        Construct a nccl-py object
+        :param nranks: int size of clique
+        :param commId: string unique id from client
+        :param rank: int rank of current worker
+        """
+        self.size = nranks
+        self.rank = rank
+
+        cdef ncclUniqueId *ident = <ncclUniqueId*>malloc(sizeof(ncclUniqueId))
+        ncclUniqueIdFromChar(ident, commId, NCCL_UNIQUE_ID_BYTES)
+
+        comm_ = self.comm
+
+        cdef int nr = nranks
+        cdef int r = rank
+        cdef ncclResult_t result
+
+        import time
+
+        start = time.time()
+        with nogil:
+            result = ncclCommInitRank(comm_, nr,
+                                      deref(ident), r)
+
+        end = time.time()
+        if result != ncclSuccess:
+            with nogil:
+                err_str = ncclGetErrorString(result)
+            print("NCCL_ERROR: %s" % err_str)
+
+    def destroy(self):
+        """
+        Call destroy on the underlying NCCL comm
+        """
+        comm_ = self.comm
+
+        cdef ncclResult_t result
+        if comm_ != NULL:
+            with nogil:
+                result = ncclCommDestroy(deref(comm_))
+
+            if result != ncclSuccess:
+                with nogil:
+                    err_str = ncclGetErrorString(result)
+                print("NCCL_ERROR: %s" % err_str)
+
+            free(self.comm)
+            self.comm = NULL
+
+    def abort(self):
+        """
+        Call abort on the underlying nccl comm
+        """
+        comm_ = self.comm
+        cdef ncclResult_t result
+        if comm_ != NULL:
+            with nogil:
+                result = ncclCommAbort(deref(comm_))
+
+            if result != ncclSuccess:
+                with nogil:
+                    err_str = ncclGetErrorString(result)
+                print("NCCL_ERROR: %s" % err_str)
+            free(comm_)
+            self.comm = NULL
+
+    def cu_device(self):
+        """
+        Get the device backing the underlying comm
+        :returns int device id
+        """
+        cdef int *dev = <int*>malloc(sizeof(int))
+
+        comm_ = self.comm
+        cdef ncclResult_t result
+        with nogil:
+            result = ncclCommCuDevice(deref(comm_), dev)
+
+        if result != ncclSuccess:
+            with nogil:
+                err_str = ncclGetErrorString(result)
+            print("NCCL_ERROR: %s" % err_str)
+
+        ret = dev[0]
+        free(dev)
+        return ret
+
+    def user_rank(self):
+        """
+        Get the rank id of the current comm
+        :return: int rank
+        """
+
+        cdef int *rank = <int*>malloc(sizeof(int))
+
+        comm_ = self.comm
+
+        cdef ncclResult_t result
+        with nogil:
+            result = ncclCommUserRank(deref(comm_), rank)
+
+        if result != ncclSuccess:
+            with nogil:
+                err_str = ncclGetErrorString(result)
+            print("NCCL_ERROR: %s" % err_str)
+
+        ret = rank[0]
+        free(rank)
+        return ret
+
+    def get_comm(self):
+        """
+        Returns the underlying nccl comm in a size_t (similar to void*).
+        This can be safely typecasted from size_t into ncclComm_t*
+        :return: size_t ncclComm_t instance
+        """
+        return <size_t>self.comm
diff --git a/python/raft/dask/common/ucx.py b/python/raft/dask/common/ucx.py
new file mode 100644
index 0000000000..948e1433ee
--- /dev/null
+++ b/python/raft/dask/common/ucx.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import ucp + + +async def _connection_func(ep): + return 0 + + +class UCX: + """ + Singleton UCX context to encapsulate all interactions with the + UCX-py API and guarantee only a single listener & endpoints are + created by cuML on a single process. + """ + + __instance = None + + def __init__(self, listener_callback): + + self.listener_callback = listener_callback + + self._create_listener() + self._endpoints = {} + + assert UCX.__instance is None + + UCX.__instance = self + + @staticmethod + def get(listener_callback=_connection_func): + if UCX.__instance is None: + UCX(listener_callback) + return UCX.__instance + + def get_worker(self): + return ucp.get_ucp_worker() + + def _create_listener(self): + self._listener = ucp.create_listener(self.listener_callback) + + def listener_port(self): + return self._listener.port + + async def _create_endpoint(self, ip, port): + ep = await ucp.create_endpoint(ip, port) + self._endpoints[(ip, port)] = ep + return ep + + async def get_endpoint(self, ip, port): + if (ip, port) not in self._endpoints: + ep = await self._create_endpoint(ip, port) + else: + ep = self._endpoints[(ip, port)] + + return ep + + def __del__(self): + for ip_port, ep in self._endpoints.items(): + if not ep.closed(): + ep.abort() + del ep + + self._listener.close() diff --git a/python/setup.py b/python/setup.py index 4f47f41e0e..ccef769f42 100644 --- a/python/setup.py +++ b/python/setup.py @@ -96,6 +96,7 @@ include_dirs = [cuda_include_dir, numpy.get_include(), + "../cpp/include/", os.path.dirname(sysconfig.get_path("include"))] cmdclass = dict() From 5d37e9ef7b3b1f035e4de7e1dceea90b5d27a1b9 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 13 May 2020 13:23:10 -0400 Subject: [PATCH 008/189] Cython is building! 
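
This splits the public comms type into an abstract comms_iface and a
concrete comms_t wrapper that owns the implementation, so the Cython
layer only ever sees a concrete type. A quick smoke test I would expect
to pass at this point (illustrative, assuming an in-place build of the
extensions):

    from raft.common.handle import Handle

    h = Handle()  # round-trips a raft::handle_t through the Cython layer
    h.sync()      # synchronizes the handle's default stream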
--- cpp/include/raft/comms/comms.hpp | 111 +++++++++++++++++++++--- cpp/include/raft/comms/comms_helper.hpp | 7 +- cpp/include/raft/comms/std_comms.hpp | 6 +- python/raft/dask/common/nccl.pyx | 2 +- 4 files changed, 104 insertions(+), 22 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index a5152be620..e158af14e3 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -22,28 +22,32 @@ namespace raft { namespace comms { -class comms_t { + +class comms_iface { public: - typedef unsigned int request_t; - enum datatype_t { CHAR, UINT8, INT, UINT, INT64, UINT64, FLOAT, DOUBLE }; - enum op_t { SUM, PROD, MIN, MAX }; - /** - * The resulting status of distributed stream synchronization - */ - enum status_t { - commStatusSuccess, // Synchronization successful - commStatusError, // An error occured querying sync status - commStatusAbort - }; // A failure occurred in sync, queued operations aborted + typedef unsigned int request_t; + enum datatype_t { CHAR, UINT8, INT, UINT, INT64, UINT64, FLOAT, DOUBLE }; + enum op_t { SUM, PROD, MIN, MAX }; + + /** + * The resulting status of distributed stream synchronization + */ + enum status_t { + commStatusSuccess, // Synchronization successful + commStatusError, // An error occured querying sync status + commStatusAbort + }; // A failure occurred in sync, queued operations aborted + + - virtual ~comms_t(); + virtual ~comms_iface(); virtual int getSize() const = 0; virtual int getRank() const = 0; - virtual std::unique_ptr commSplit(int color, int key) const = 0; + virtual std::unique_ptr commSplit(int color, int key) const = 0; virtual void barrier() const = 0; @@ -80,6 +84,85 @@ class comms_t { cudaStream_t stream) const = 0; }; +class comms_t: public comms_iface { + public: + + + comms_t(std::unique_ptr impl) + : _impl(impl.release()) { + ASSERT(nullptr != _impl.get(), "ERROR: Invalid comms_iface used!"); + } + + int getSize() const { return _impl->getSize(); } + + int getRank() const { return _impl->getRank(); } + + std::unique_ptr commSplit(int color, int key) const { + return _impl->commSplit(color, key); + } + + void barrier() const { _impl->barrier(); } + + status_t syncStream( + cudaStream_t stream) const { + return _impl->syncStream(stream); + } + + void isend(const void* buf, int size, int dest, int tag, + request_t* request) const { + _impl->isend(buf, size, dest, tag, request); + } + + void irecv(void* buf, int size, int source, int tag, + request_t* request) const { + _impl->irecv(buf, size, source, tag, request); + } + + void waitall(int count, request_t array_of_requests[]) const { + _impl->waitall(count, array_of_requests); + } + + void allreduce(const void* sendbuff, void* recvbuff, + int count, datatype_t datatype, op_t op, + cudaStream_t stream) const { + _impl->allreduce(sendbuff, recvbuff, count, datatype, op, stream); + } + + void bcast(void* buff, int count, datatype_t datatype, + int root, cudaStream_t stream) const { + _impl->bcast(buff, count, datatype, root, stream); + } + + void reduce(const void* sendbuff, void* recvbuff, int count, + datatype_t datatype, op_t op, int root, + cudaStream_t stream) const { + _impl->reduce(sendbuff, recvbuff, count, datatype, op, root, stream); + } + + void allgather(const void* sendbuff, void* recvbuff, + int sendcount, datatype_t datatype, + cudaStream_t stream) const { + _impl->allgather(sendbuff, recvbuff, sendcount, datatype, stream); + } + + void allgatherv(const void* sendbuf, void* recvbuf, + const int recvcounts[], 
const int displs[], + datatype_t datatype, + cudaStream_t stream) const { + _impl->allgatherv(sendbuf, recvbuf, recvcounts, displs, datatype, stream); + } + + void reducescatter(const void* sendbuff, void* recvbuff, + int recvcount, datatype_t datatype, + op_t op, cudaStream_t stream) const { + _impl->reducescatter(sendbuff, recvbuff, recvcount, datatype, op, stream); + } + + private: + std::unique_ptr _impl; + +}; + template comms_t::datatype_t getDataType(T a); diff --git a/cpp/include/raft/comms/comms_helper.hpp b/cpp/include/raft/comms/comms_helper.hpp index f1a5482334..fed0710fbe 100644 --- a/cpp/include/raft/comms/comms_helper.hpp +++ b/cpp/include/raft/comms/comms_helper.hpp @@ -27,7 +27,7 @@ void build_comms_nccl_only(handle_t *handle, ncclComm_t comm, int size, auto *raft_comm = new raft::comms::std_comms(comm, size, rank); auto communicator = std::make_shared( - std::unique_ptr(raft_comm)); + std::unique_ptr(raft_comm)); handle->set_comms(communicator); } @@ -51,9 +51,8 @@ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t comm, void *ucp_worker, } } - auto communicator = std::make_shared( - std::unique_ptr( - new raft::comms::std_comms(comm, (ucp_worker_h)ucp_worker, eps_sp, size, rank))); + auto *raft_comm = new raft::comms::std_comms(comm, (ucp_worker_h)ucp_worker, eps_sp, size, rank); + auto communicator = std::make_shared(std::unique_ptr(raft_comm)); handle->set_comms(communicator); } diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 7906cf2027..6890e8d9f0 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -128,7 +128,7 @@ ncclRedOp_t getNCCLOp(const comms_t::op_t op) { } -class std_comms : public comms_t { +class std_comms : public comms_iface { public: std_comms() = delete; @@ -184,7 +184,7 @@ class std_comms : public comms_t { int getRank() const { return _rank; } - std::unique_ptr + std::unique_ptr commSplit(int color, int key) const { // Not supported by NCCL ASSERT(false, @@ -199,7 +199,7 @@ class std_comms : public comms_t { allreduce(_sendbuff, _recvbuff, 1, comms_t::INT, comms_t::SUM, _stream); - ASSERT(syncStream(_stream) == status_t::commStatusSuccess, + ASSERT(syncStream(_stream) == comms_t::status_t::commStatusSuccess, "ERROR: syncStream failed. This can be caused by a failed rank."); } diff --git a/python/raft/dask/common/nccl.pyx b/python/raft/dask/common/nccl.pyx index e6e2f8fcb1..c9d9fe0426 100644 --- a/python/raft/dask/common/nccl.pyx +++ b/python/raft/dask/common/nccl.pyx @@ -25,7 +25,7 @@ from cython.operator cimport dereference as deref from libcpp cimport bool from libc.stdlib cimport malloc, free -cdef extern from "raft/include/comms/nccl_helper.hpp" namespace "raft::coms": +cdef extern from "raft/comms/nccl_helper.hpp" namespace "raft::comms": void get_unique_id(char *uid, int size) except + void ncclUniqueIdFromChar(ncclUniqueId *id, char *uniqueId, From bd1b87632295987aafcbc6b48b5b68d8a7b64a58 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 13 May 2020 14:17:43 -0400 Subject: [PATCH 009/189] The comms tests pass!!! 
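
The request_t, datatype_t, op_t and status_t types move from comms_t to
raft::comms namespace scope, and the python-side test helpers are wired
up. A sketch of how the collective test is meant to be driven from a
Dask client (illustrative; the real versions live in
python/raft/test/test_comms.py):

    from raft.dask.common import CommsContext, worker_state
    from raft.dask.common import perform_test_comms_allreduce

    cb = CommsContext(comms_p2p=False)
    cb.init()

    def _run(sessionId):
        handle = worker_state(sessionId)["handle"]
        return perform_test_comms_allreduce(handle)

    results = cb.client.run(_run, cb.sessionId)
    assert all(results.values())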
--- cpp/include/raft/comms/comms.hpp | 117 +++++------ cpp/include/raft/comms/comms_helper.hpp | 6 +- cpp/include/raft/comms/std_comms.hpp | 56 +++--- python/raft/dask/common/__init__.py | 8 + python/raft/dask/common/comms.py | 6 +- python/raft/dask/common/comms_utils.pyx | 19 +- python/raft/dask/common/utils.py | 250 ++++++++++++++++++++++++ python/raft/test/__init__.py | 0 python/raft/test/conftest.py | 50 +++++ python/raft/test/test_comms.py | 104 ++++++++++ python/setup.py | 4 +- 11 files changed, 516 insertions(+), 104 deletions(-) create mode 100644 python/raft/dask/common/utils.py create mode 100644 python/raft/test/__init__.py create mode 100644 python/raft/test/conftest.py create mode 100644 python/raft/test/test_comms.py diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index e158af14e3..f0cad3a380 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -22,25 +22,23 @@ namespace raft { namespace comms { +typedef unsigned int request_t; +enum datatype_t { CHAR, UINT8, INT, UINT, INT64, UINT64, FLOAT, DOUBLE }; +enum op_t { SUM, PROD, MIN, MAX }; -class comms_iface { - public: - - - typedef unsigned int request_t; - enum datatype_t { CHAR, UINT8, INT, UINT, INT64, UINT64, FLOAT, DOUBLE }; - enum op_t { SUM, PROD, MIN, MAX }; +/** + * The resulting status of distributed stream synchronization + */ +enum status_t { + commStatusSuccess, // Synchronization successful + commStatusError, // An error occured querying sync status + commStatusAbort +}; // A failure occurred in sync, queued operations aborted - /** - * The resulting status of distributed stream synchronization - */ - enum status_t { - commStatusSuccess, // Synchronization successful - commStatusError, // An error occured querying sync status - commStatusAbort - }; // A failure occurred in sync, queued operations aborted +class comms_iface { + public: virtual ~comms_iface(); @@ -88,6 +86,7 @@ class comms_t: public comms_iface { public: + comms_t(std::unique_ptr impl) : _impl(impl.release()) { ASSERT(nullptr != _impl.get(), "ERROR: Invalid comms_iface used!"); @@ -163,48 +162,50 @@ class comms_t: public comms_iface { }; - -template -comms_t::datatype_t getDataType(T a); - -template <> -comms_t::datatype_t getDataType(char a) { - return comms_t::CHAR; -} - -template <> -comms_t::datatype_t getDataType(uint8_t a) { - return comms_t::UINT8; -} - -template <> -comms_t::datatype_t getDataType(int a) { - return comms_t::INT; -} - -template <> -comms_t::datatype_t getDataType(uint32_t a) { - return comms_t::UINT; -} - -template <> -comms_t::datatype_t getDataType(int64_t a) { - return comms_t::INT64; -} - -template <> -comms_t::datatype_t getDataType(uint64_t a) { - return comms_t::UINT64; -} - -template <> -comms_t::datatype_t getDataType(float a) { - return comms_t::FLOAT; -} - -template <> -comms_t::datatype_t getDataType(double a) { - return comms_t::DOUBLE; -} +comms_iface::~comms_iface() {} + + +//template +//inline datatype_t getDataType(T a); +// +//template <> +//inline datatype_t getDataType(char a) { +// return CHAR; +//} +// +//template <> +//inline datatype_t getDataType(uint8_t a) { +// return UINT8; +//} +// +//template <> +//inline datatype_t getDataType(int a) { +// return INT; +//} +// +//template <> +//inline datatype_t getDataType(uint32_t a) { +// return UINT; +//} +// +//template <> +//inline datatype_t getDataType(int64_t a) { +// return INT64; +//} +// +//template <> +//inline datatype_t getDataType(uint64_t a) { +// return UINT64; +//} +// 
+//template <> +//inline datatype_t getDataType(float a) { +// return FLOAT; +//} +// +//template <> +//inline datatype_t getDataType(double a) { +// return DOUBLE; +//} } /// namespace comms } // namespace raft diff --git a/cpp/include/raft/comms/comms_helper.hpp b/cpp/include/raft/comms/comms_helper.hpp index fed0710fbe..7a033d4205 100644 --- a/cpp/include/raft/comms/comms_helper.hpp +++ b/cpp/include/raft/comms/comms_helper.hpp @@ -68,8 +68,8 @@ bool test_collective_allreduce(const handle_t& handle) { temp_d.resize(1, stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); - communicator.allreduce(temp_d.data(), temp_d.data(), 1, getDataType(temp_d.data()), - comms_t::SUM, stream); + communicator.allreduce(temp_d.data(), temp_d.data(), 1, datatype_t::INT, + SUM, stream); int temp_h = 0; CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); @@ -91,7 +91,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t& h, for (int i = 0; i < numTrials; i++) { std::vector received_data((communicator.getSize() - 1), -1); - std::vector requests; + std::vector requests; requests.resize(2 * (communicator.getSize() - 1)); int request_idx = 0; //post receives diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 6890e8d9f0..b185934cc6 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -68,23 +68,23 @@ namespace raft { namespace comms { -size_t getDatatypeSize(const comms_t::datatype_t datatype) { +size_t getDatatypeSize(const datatype_t datatype) { switch (datatype) { - case comms_t::CHAR: + case CHAR: return sizeof(char); - case comms_t::UINT8: + case UINT8: return sizeof(uint8_t); - case comms_t::INT: + case INT: return sizeof(int); - case comms_t::UINT: + case UINT: return sizeof(unsigned int); - case comms_t::INT64: + case INT64: return sizeof(int64_t); - case comms_t::UINT64: + case UINT64: return sizeof(uint64_t); - case comms_t::FLOAT: + case FLOAT: return sizeof(float); - case comms_t::DOUBLE: + case DOUBLE: return sizeof(double); } } @@ -93,36 +93,36 @@ size_t getDatatypeSize(const comms_t::datatype_t datatype) { ncclDataType_t getNCCLDatatype( - const comms_t::datatype_t datatype) { + const datatype_t datatype) { switch (datatype) { - case comms_t::CHAR: + case CHAR: return ncclChar; - case comms_t::UINT8: + case UINT8: return ncclUint8; - case comms_t::INT: + case INT: return ncclInt; - case comms_t::UINT: + case UINT: return ncclUint32; - case comms_t::INT64: + case INT64: return ncclInt64; - case comms_t::UINT64: + case UINT64: return ncclUint64; - case comms_t::FLOAT: + case FLOAT: return ncclFloat; - case comms_t::DOUBLE: + case DOUBLE: return ncclDouble; } } -ncclRedOp_t getNCCLOp(const comms_t::op_t op) { +ncclRedOp_t getNCCLOp(const op_t op) { switch (op) { - case comms_t::SUM: + case SUM: return ncclSum; - case comms_t::PROD: + case PROD: return ncclProd; - case comms_t::MIN: + case MIN: return ncclMin; - case comms_t::MAX: + case MAX: return ncclMax; } } @@ -196,10 +196,10 @@ class std_comms : public comms_iface { CUDA_CHECK(cudaMemsetAsync(_sendbuff, 1, sizeof(int), _stream)); CUDA_CHECK(cudaMemsetAsync(_recvbuff, 1, sizeof(int), _stream)); - allreduce(_sendbuff, _recvbuff, 1, comms_t::INT, - comms_t::SUM, _stream); + allreduce(_sendbuff, _recvbuff, 1, INT, + SUM, _stream); - ASSERT(syncStream(_stream) == comms_t::status_t::commStatusSuccess, + ASSERT(syncStream(_stream) == status_t::commStatusSuccess, 
"ERROR: syncStream failed. This can be caused by a failed rank."); } @@ -359,7 +359,7 @@ class std_comms : public comms_iface { void allgatherv(const void *sendbuf, void *recvbuf, const int recvcounts[], const int displs[], - comms_t::datatype_t datatype, + datatype_t datatype, cudaStream_t stream) const { //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf //Listing 1 on page 4. @@ -381,7 +381,7 @@ class std_comms : public comms_iface { _nccl_comm, stream)); } - comms_t::status_t syncStream( + status_t syncStream( cudaStream_t stream) const { cudaError_t cudaErr; ncclResult_t ncclErr, ncclAsyncErr; diff --git a/python/raft/dask/common/__init__.py b/python/raft/dask/common/__init__.py index e69de29bb2..b8d7b28313 100644 --- a/python/raft/dask/common/__init__.py +++ b/python/raft/dask/common/__init__.py @@ -0,0 +1,8 @@ + + +from raft.dask.common.comms import CommsContext, worker_state + +from raft.dask.common.comms_utils import inject_comms_on_handle, \ + perform_test_comms_allreduce, perform_test_comms_send_recv, \ + inject_comms_on_handle_coll_only + diff --git a/python/raft/dask/common/comms.py b/python/raft/dask/common/comms.py index 7273df3f16..54e15c54c2 100644 --- a/python/raft/dask/common/comms.py +++ b/python/raft/dask/common/comms.py @@ -13,14 +13,14 @@ # limitations under the License. # -from cuml.nccl import nccl -from cuml.dask.common.ucx import UCX +from raft.dask.common.nccl import nccl +from raft.dask.common.ucx import UCX from .comms_utils import inject_comms_on_handle from .comms_utils import inject_comms_on_handle_coll_only from .utils import parse_host_port -from cuml.common.handle import Handle +from raft.common.handle import Handle from dask.distributed import get_worker, default_client diff --git a/python/raft/dask/common/comms_utils.pyx b/python/raft/dask/common/comms_utils.pyx index 099e38cf14..672c53aa28 100644 --- a/python/raft/dask/common/comms_utils.pyx +++ b/python/raft/dask/common/comms_utils.pyx @@ -42,17 +42,16 @@ cdef extern from "raft/comms/std_comms.hpp" namespace "raft::comms": cdef cppclass std_comms: pass - void build_comms_nccl_ucx(handle_t *handle, - ncclComm_t comm, - void *ucp_worker, - void *eps, - int size, - int rank) except + - void build_comms_nccl_only(handle_t *handle, - ncclComm_t comm, - int size, - int rank) except + +cdef extern from "raft/comms/comms.hpp" namespace "raft::comms": + + cdef cppclass comms_t: + pass + + cdef cppclass comms_iface: + pass + + cdef extern from "raft/comms/comms_helper.hpp" namespace "raft::comms": diff --git a/python/raft/dask/common/utils.py b/python/raft/dask/common/utils.py new file mode 100644 index 0000000000..10b048d28d --- /dev/null +++ b/python/raft/dask/common/utils.py @@ -0,0 +1,250 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+import logging
+import os
+import numba.cuda
+import random
+import time
+
+from dask.distributed import default_client, wait
+
+from asyncio import InvalidStateError
+
+from threading import Lock
+
+
+def get_visible_devices():
+    """
+    Return a list of the CUDA_VISIBLE_DEVICES
+    :return: list[int] visible devices
+    """
+    # TODO: Shouldn't have to split on every call
+    return os.environ["CUDA_VISIBLE_DEVICES"].split(",")
+
+
+def device_of_devicendarray(devicendarray):
+    """
+    Returns the device that backs memory allocated on the given
+    deviceNDArray
+    :param devicendarray: devicendarray array to check
+    :return: int device id
+    """
+    dev = device_of_gpu_matrix(devicendarray)
+    return get_visible_devices()[dev]
+
+
+def get_device_id(canonical_name):
+    """
+    Given a local device id, find the actual "global" id
+    :param canonical_name: the local device name in CUDA_VISIBLE_DEVICES
+    :return: the global device id for the system
+    """
+    dev_order = get_visible_devices()
+    idx = 0
+    for dev in dev_order:
+        if dev == canonical_name:
+            return idx
+        idx += 1
+
+    return -1
+
+
+def select_device(dev, close=True):
+    """
+    Use numba to select the given device, optionally
+    closing and opening up a new cuda context if it fails.
+    :param dev: int device to select
+    :param close: bool close the cuda context and create new one?
+    """
+    if numba.cuda.get_current_device().id != dev:
+        logging.warn("Selecting device " + str(dev))
+        if close:
+            numba.cuda.close()
+        numba.cuda.select_device(dev)
+        if dev != numba.cuda.get_current_device().id:
+            logging.warn("Current device " +
+                         str(numba.cuda.get_current_device()) +
+                         " does not match expected " + str(dev))
+
+
+def get_client(client=None):
+    return default_client() if client is None else client
+
+
+def parse_host_port(address):
+    """
+    Given a string address with host/port, build a tuple(host, port)
+    :param address: string address to parse
+    :return: tuple(host, port)
+    """
+    if '://' in address:
+        address = address.rsplit('://', 1)[1]
+    host, port = address.split(':')
+    port = int(port)
+    return host, port
+
+
+def build_host_dict(workers):
+    """
+    Builds a dict mapping each hostname to the set of ports its
+    workers are running on.
+    :param workers: list(tuple(host, port)) list of worker addresses
+    :return: dict(host, set(port))
+    """
+    hosts = set(map(lambda x: parse_host_port(x), workers))
+    hosts_dict = {}
+    for host, port in hosts:
+        if host not in hosts_dict:
+            hosts_dict[host] = set([port])
+        else:
+            hosts_dict[host].add(port)
+
+    return hosts_dict
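+
+# A quick sketch of how the two address helpers above compose
+# (hypothetical addresses; illustrative only):
+#
+#   parse_host_port("tcp://10.0.0.1:8786")
+#   # -> ('10.0.0.1', 8786)
+#
+#   build_host_dict(["tcp://10.0.0.1:8786", "tcp://10.0.0.1:8787"])
+#   # -> {'10.0.0.1': {8786, 8787}}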
+
+
+def persist_across_workers(client, objects, workers=None):
+    """
+    Calls persist on the 'objects' ensuring they are spread
+    across the workers on 'workers'.
+
+    Parameters
+    ----------
+    client : dask.distributed.Client
+    objects : list
+              Dask distributed objects to be persisted
+    workers : list or None
+              List of workers across which to persist objects
+              If None, then all workers attached to 'client' will be used
+    """
+    if workers is None:
+        workers = client.has_what().keys()  # Default to all workers
+    return client.persist(objects, workers={o: workers for o in objects})
+
+
+def raise_exception_from_futures(futures):
+    """Raises a RuntimeError if any of the futures indicates an exception"""
+    errs = [f.exception() for f in futures if f.exception()]
+    if errs:
+        raise RuntimeError("%d of %d worker jobs failed: %s" % (
+            len(errs), len(futures), ", ".join(map(str, errs))
+        ))
+
+
+def wait_and_raise_from_futures(futures):
+    """
+    Returns the collected futures after all the futures
+    have finished and do not indicate any exceptions.
+    """
+    wait(futures)
+    raise_exception_from_futures(futures)
+    return futures
+
+
+def raise_mg_import_exception():
+    raise Exception("cuML has not been built with multiGPU support "
+                    "enabled. Build with the --multigpu flag to"
+                    " enable multiGPU support.")
+
+
+class MultiHolderLock:
+    """
+    A per-process synchronization lock allowing multiple concurrent holders
+    at any one time. This is used in situations where resources might be
+    limited and it's important that the number of concurrent users of
+    the resources is constrained.
+
+    This lock is serializable, but relies on a Python threading.Lock
+    underneath to properly synchronize internal state across threads.
+    Note that this lock is only intended to be used per-process and
+    the underlying threading.Lock will not be serialized.
+    """
+
+    def __init__(self, n):
+        """
+        Initialize the lock
+        :param n : integer the maximum number of concurrent holders
+        """
+        self.n = n
+        self.current_tasks = 0
+        self.lock = Lock()
+
+    def _acquire(self, blocking=True, timeout=10):
+        lock_acquired = False
+
+        inner_lock_acquired = self.lock.acquire(blocking, timeout)
+
+        if inner_lock_acquired:
+            # Allow up to n concurrent holders; only release the inner
+            # lock if it was actually acquired.
+            if self.current_tasks < self.n:
+                self.current_tasks += 1
+                lock_acquired = True
+            self.lock.release()
+
+        return lock_acquired
+
+    def acquire(self, blocking=True, timeout=10):
+        """
+        Acquire the lock.
+        :param blocking : bool will block if True
+        :param timeout : a timeout (in seconds) to wait for the lock
+                         before failing.
+        :return : True if lock was acquired successfully, False otherwise
+        """
+
+        t = time.time()
+
+        lock_acquired = self._acquire(blocking, timeout)
+
+        while blocking and not lock_acquired:
+
+            if time.time() - t > timeout:
+                raise TimeoutError()
+
+            time.sleep(random.uniform(0, 0.01))
+            lock_acquired = self._acquire(blocking, timeout)
+
+        return lock_acquired
+
+    def __getstate__(self):
+        d = self.__dict__.copy()
+        if "lock" in d:
+            del d["lock"]
+        return d
+
+    def __setstate__(self, d):
+        d["lock"] = Lock()
+        self.__dict__ = d
+
+    def release(self, blocking=True, timeout=10):
+        """
+        Release a hold on the lock to allow another holder. Note that
+        while Python's threading.Lock does not have options for blocking
+        or timeout in release(), this lock uses a threading.Lock
+        internally and so will need to acquire that lock in order
+        to properly synchronize the underlying state.
+        :param blocking : bool will block if True
+        :param timeout : a timeout (in seconds) to wait for the lock
+                         before failing.
+        :return : True if lock was released successfully, False otherwise.
+        """
+
+        if self.current_tasks == 0:
+            raise InvalidStateError("Cannot release lock when no "
+                                    "concurrent tasks are executing")
+
+        lock_acquired = self.lock.acquire(blocking, timeout)
+        if lock_acquired:
+            self.current_tasks -= 1
+            self.lock.release()
+        return lock_acquired
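A usage sketch for MultiHolderLock (illustrative only; the import path assumes
the package is installed as `raft`). At most `n` callers hold the lock at once,
and instances can be shipped to Dask workers because `__getstate__` drops the
non-serializable `threading.Lock`, which `__setstate__` rebuilds:

    from raft.dask.common.utils import MultiHolderLock

    lock = MultiHolderLock(2)      # allow up to two concurrent holders

    # With blocking=True, acquire() either returns True or raises
    # TimeoutError once `timeout` seconds have elapsed.
    if lock.acquire(blocking=True, timeout=10):
        try:
            ...  # use the rate-limited resource here
        finally:
            lock.release()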
diff --git a/python/raft/test/__init__.py b/python/raft/test/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/python/raft/test/conftest.py b/python/raft/test/conftest.py
new file mode 100644
index 0000000000..83ed6b5d83
--- /dev/null
+++ b/python/raft/test/conftest.py
@@ -0,0 +1,50 @@
+import pytest
+
+from dask.distributed import Client
+
+from dask_cuda import initialize
+from dask_cuda import LocalCUDACluster
+
+enable_tcp_over_ucx = True
+enable_nvlink = False
+enable_infiniband = False
+
+
+@pytest.fixture(scope="module")
+def cluster():
+
+    print("Starting cluster")
+    cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0)
+    yield cluster
+    print("Closing cluster")
+    cluster.close()
+    print("Closed cluster")
+
+
+@pytest.fixture(scope="module")
+def ucx_cluster():
+    initialize.initialize(create_cuda_context=True,
+                          enable_tcp_over_ucx=enable_tcp_over_ucx,
+                          enable_nvlink=enable_nvlink,
+                          enable_infiniband=enable_infiniband)
+    cluster = LocalCUDACluster(protocol="ucx",
+                               enable_tcp_over_ucx=enable_tcp_over_ucx,
+                               enable_nvlink=enable_nvlink,
+                               enable_infiniband=enable_infiniband,
+                               ucx_net_devices="auto")
+    yield cluster
+    cluster.close()
+
+
+@pytest.fixture()
+def client(cluster):
+    client = Client(cluster)
+    yield client
+    client.close()
+
+
+@pytest.fixture()
+def ucx_client(ucx_cluster):
+    # Connect to the UCX cluster, not the TCP `cluster` fixture.
+    client = Client(ucx_cluster)
+    yield client
+    client.close()
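The fixtures above are what the tests in the following file consume. A
standalone equivalent of the TCP path (illustrative only; it simply mirrors
the `cluster` and `client` fixtures) would be:

    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster

    cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0)
    client = Client(cluster)
    try:
        ...  # drive the comms tests through `client`
    finally:
        client.close()
        cluster.close()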
diff --git a/python/raft/test/test_comms.py b/python/raft/test/test_comms.py
new file mode 100644
index 0000000000..b4b6d0e7cc
--- /dev/null
+++ b/python/raft/test/test_comms.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import pytest
+
+import random
+
+import raft
+
+from dask.distributed import Client, wait
+
+from raft.dask.common import CommsContext, worker_state
+from raft.dask.common import perform_test_comms_send_recv
+from raft.dask.common import perform_test_comms_allreduce
+
+pytestmark = pytest.mark.mg
+
+
+def test_comms_init_no_p2p(cluster):
+
+    client = Client(cluster)
+    cb = CommsContext(comms_p2p=False)
+
+    try:
+        cb.init()
+
+        assert cb.nccl_initialized is True
+        assert cb.ucx_initialized is False
+
+    finally:
+
+        cb.destroy()
+        client.close()
+
+
+def func_test_allreduce(sessionId, r):
+    handle = worker_state(sessionId)["handle"]
+    return perform_test_comms_allreduce(handle)
+
+
+def func_test_send_recv(sessionId, n_trials, r):
+    handle = worker_state(sessionId)["handle"]
+    return perform_test_comms_send_recv(handle, n_trials)
+
+
+
+@pytest.mark.nccl
+def test_allreduce(cluster):
+
+    client = Client(cluster)
+    cb = CommsContext()
+
+    try:
+        cb.init()
+
+        dfs = [client.submit(func_test_allreduce, cb.sessionId,
+                             random.random(), workers=[w])
+               for w in cb.worker_addresses]
+        wait(dfs, timeout=5)
+
+        assert all([x.result() for x in dfs])
+
+    finally:
+        cb.destroy()
+        client.close()
+
+
+@pytest.mark.ucx
+@pytest.mark.parametrize("n_trials", [1, 5])
+def test_send_recv(n_trials, ucx_cluster):
+
+    client = Client(ucx_cluster)
+    cb = CommsContext(comms_p2p=True, verbose=True)
+
+    try:
+
+        cb.init()
+
+        dfs = [client.submit(func_test_send_recv,
+                             cb.sessionId,
+                             n_trials,
+                             random.random(),
+                             workers=[w])
+               for w in cb.worker_addresses]
+
+        wait(dfs, timeout=5)
+
+        assert all(x.result() for x in dfs)
+
+    finally:
+        cb.destroy()
+        client.close()
diff --git a/python/setup.py b/python/setup.py
index ccef769f42..107a061bd9 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -40,7 +40,7 @@
 # - Dependencies include and lib folder setup --------------------------------
 
 install_requires = [
-    'cython'
+    'cython',
 ]
 
 cuda_home = get_environment_option("CUDA_HOME")
@@ -92,7 +92,7 @@
 
 # - Cython extensions build and parameters -----------------------------------
 
-libs = []
+libs = ["nccl", "rmm", "cusolver", "cusparse", "cublas"]
 
 include_dirs = [cuda_include_dir,
                 numpy.get_include(),

From 98496d3990ef7807f4a0a09a1688e2b287797ed9 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet"
Date: Wed, 13 May 2020 14:20:39 -0400
Subject: [PATCH 010/189] Fixing flake8 style

---
 python/raft/dask/common/utils.py | 47 --------------------------------
 python/raft/test/test_comms.py   |  3 --
 2 files changed, 50 deletions(-)

diff --git a/python/raft/dask/common/utils.py b/python/raft/dask/common/utils.py
index 10b048d28d..ff5a0b4538 100644
--- a/python/raft/dask/common/utils.py
+++ b/python/raft/dask/common/utils.py
@@ -13,9 +13,7 @@
 # limitations under the License.
 #
 
-import logging
 import os
-import numba.cuda
 import random
 import time
 
 from dask.distributed import default_client, wait
 
@@ -35,51 +33,6 @@ def get_visible_devices():
     return os.environ["CUDA_VISIBLE_DEVICES"].split(",")
 
 
-def device_of_devicendarray(devicendarray):
-    """
-    Returns the device that backs memory allocated on the given
-    deviceNDArray
-    :param devicendarray: devicendarray array to check
-    :return: int device id
-    """
-    dev = device_of_gpu_matrix(devicendarray)
-    return get_visible_devices()[dev]
-
-
-def get_device_id(canonical_name):
-    """
-    Given a local device id, find the actual "global" id
-    :param canonical_name: the local device name in CUDA_VISIBLE_DEVICES
-    :return: the global device id for the system
-    """
-    dev_order = get_visible_devices()
-    idx = 0
-    for dev in dev_order:
-        if dev == canonical_name:
-            return idx
-        idx += 1
-
-    return -1
-
-
-def select_device(dev, close=True):
-    """
-    Use numba to select the given device, optionally
-    closing and opening up a new cuda context if it fails.
-    :param dev: int device to select
-    :param close: bool close the cuda context and create new one?
-    """
-    if numba.cuda.get_current_device().id != dev:
-        logging.warn("Selecting device " + str(dev))
-        if close:
-            numba.cuda.close()
-        numba.cuda.select_device(dev)
-        if dev != numba.cuda.get_current_device().id:
-            logging.warn("Current device " +
-                         str(numba.cuda.get_current_device()) +
-                         " does not match expected " + str(dev))
-
-
 def get_client(client=None):
     return default_client() if client is None else client
 
diff --git a/python/raft/test/test_comms.py b/python/raft/test/test_comms.py
index b4b6d0e7cc..226a471f3b 100644
--- a/python/raft/test/test_comms.py
+++ b/python/raft/test/test_comms.py
@@ -17,8 +17,6 @@
 
 import random
 
-import raft
-
 from dask.distributed import Client, wait
 
 from raft.dask.common import CommsContext, worker_state
@@ -55,7 +53,6 @@ def func_test_send_recv(sessionId, n_trials, r):
     return perform_test_comms_send_recv(handle, n_trials)
 
 
-
 @pytest.mark.nccl
 def test_allreduce(cluster):
 

From fb52f79dd065bca6718ada7881ebd8df53f43371 Mon Sep 17 00:00:00 2001
From: "Corey J.
Nolet" Date: Wed, 13 May 2020 14:32:49 -0400 Subject: [PATCH 011/189] Running clang format and removing getDataType --- cpp/include/raft/comms/comms.hpp | 197 +++++++++--------------- cpp/include/raft/comms/comms_helper.hpp | 34 ++-- cpp/include/raft/comms/nccl_helper.hpp | 6 +- cpp/include/raft/comms/std_comms.hpp | 142 +++++++---------- cpp/include/raft/comms/ucp_helper.hpp | 2 +- 5 files changed, 148 insertions(+), 233 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index f0cad3a380..fafcce03e1 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -18,7 +18,6 @@ #include - namespace raft { namespace comms { @@ -27,19 +26,16 @@ enum datatype_t { CHAR, UINT8, INT, UINT, INT64, UINT64, FLOAT, DOUBLE }; enum op_t { SUM, PROD, MIN, MAX }; /** - * The resulting status of distributed stream synchronization - */ + * The resulting status of distributed stream synchronization + */ enum status_t { - commStatusSuccess, // Synchronization successful - commStatusError, // An error occured querying sync status - commStatusAbort + commStatusSuccess, // Synchronization successful + commStatusError, // An error occured querying sync status + commStatusAbort }; // A failure occurred in sync, queued operations aborted - - class comms_iface { public: - virtual ~comms_iface(); virtual int getSize() const = 0; @@ -82,130 +78,77 @@ class comms_iface { cudaStream_t stream) const = 0; }; -class comms_t: public comms_iface { +class comms_t : public comms_iface { public: - - - - comms_t(std::unique_ptr impl) - : _impl(impl.release()) { - ASSERT(nullptr != _impl.get(), "ERROR: Invalid comms_iface used!"); - } - - int getSize() const { return _impl->getSize(); } - - int getRank() const { return _impl->getRank(); } - - std::unique_ptr commSplit(int color, int key) const { - return _impl->commSplit(color, key); - } - - void barrier() const { _impl->barrier(); } - - status_t syncStream( - cudaStream_t stream) const { - return _impl->syncStream(stream); - } - - void isend(const void* buf, int size, int dest, int tag, - request_t* request) const { - _impl->isend(buf, size, dest, tag, request); - } - - void irecv(void* buf, int size, int source, int tag, - request_t* request) const { - _impl->irecv(buf, size, source, tag, request); - } - - void waitall(int count, request_t array_of_requests[]) const { - _impl->waitall(count, array_of_requests); - } - - void allreduce(const void* sendbuff, void* recvbuff, - int count, datatype_t datatype, op_t op, - cudaStream_t stream) const { - _impl->allreduce(sendbuff, recvbuff, count, datatype, op, stream); - } - - void bcast(void* buff, int count, datatype_t datatype, - int root, cudaStream_t stream) const { - _impl->bcast(buff, count, datatype, root, stream); - } - - void reduce(const void* sendbuff, void* recvbuff, int count, - datatype_t datatype, op_t op, int root, - cudaStream_t stream) const { - _impl->reduce(sendbuff, recvbuff, count, datatype, op, root, stream); - } - - void allgather(const void* sendbuff, void* recvbuff, - int sendcount, datatype_t datatype, - cudaStream_t stream) const { - _impl->allgather(sendbuff, recvbuff, sendcount, datatype, stream); - } - - void allgatherv(const void* sendbuf, void* recvbuf, - const int recvcounts[], const int displs[], - datatype_t datatype, - cudaStream_t stream) const { - _impl->allgatherv(sendbuf, recvbuf, recvcounts, displs, datatype, stream); - } - - void reducescatter(const void* sendbuff, void* recvbuff, - int recvcount, datatype_t datatype, - op_t 
op, cudaStream_t stream) const { - _impl->reducescatter(sendbuff, recvbuff, recvcount, datatype, op, stream); - } + comms_t(std::unique_ptr impl) : _impl(impl.release()) { + ASSERT(nullptr != _impl.get(), "ERROR: Invalid comms_iface used!"); + } + + int getSize() const { return _impl->getSize(); } + + int getRank() const { return _impl->getRank(); } + + std::unique_ptr commSplit(int color, int key) const { + return _impl->commSplit(color, key); + } + + void barrier() const { _impl->barrier(); } + + status_t syncStream(cudaStream_t stream) const { + return _impl->syncStream(stream); + } + + void isend(const void* buf, int size, int dest, int tag, + request_t* request) const { + _impl->isend(buf, size, dest, tag, request); + } + + void irecv(void* buf, int size, int source, int tag, + request_t* request) const { + _impl->irecv(buf, size, source, tag, request); + } + + void waitall(int count, request_t array_of_requests[]) const { + _impl->waitall(count, array_of_requests); + } + + void allreduce(const void* sendbuff, void* recvbuff, int count, + datatype_t datatype, op_t op, cudaStream_t stream) const { + _impl->allreduce(sendbuff, recvbuff, count, datatype, op, stream); + } + + void bcast(void* buff, int count, datatype_t datatype, int root, + cudaStream_t stream) const { + _impl->bcast(buff, count, datatype, root, stream); + } + + void reduce(const void* sendbuff, void* recvbuff, int count, + datatype_t datatype, op_t op, int root, + cudaStream_t stream) const { + _impl->reduce(sendbuff, recvbuff, count, datatype, op, root, stream); + } + + void allgather(const void* sendbuff, void* recvbuff, int sendcount, + datatype_t datatype, cudaStream_t stream) const { + _impl->allgather(sendbuff, recvbuff, sendcount, datatype, stream); + } + + void allgatherv(const void* sendbuf, void* recvbuf, const int recvcounts[], + const int displs[], datatype_t datatype, + cudaStream_t stream) const { + _impl->allgatherv(sendbuf, recvbuf, recvcounts, displs, datatype, stream); + } + + void reducescatter(const void* sendbuff, void* recvbuff, int recvcount, + datatype_t datatype, op_t op, cudaStream_t stream) const { + _impl->reducescatter(sendbuff, recvbuff, recvcount, datatype, op, stream); + } private: std::unique_ptr _impl; - }; comms_iface::~comms_iface() {} - -//template -//inline datatype_t getDataType(T a); -// -//template <> -//inline datatype_t getDataType(char a) { -// return CHAR; -//} -// -//template <> -//inline datatype_t getDataType(uint8_t a) { -// return UINT8; -//} -// -//template <> -//inline datatype_t getDataType(int a) { -// return INT; -//} -// -//template <> -//inline datatype_t getDataType(uint32_t a) { -// return UINT; -//} -// -//template <> -//inline datatype_t getDataType(int64_t a) { -// return INT64; -//} -// -//template <> -//inline datatype_t getDataType(uint64_t a) { -// return UINT64; -//} -// -//template <> -//inline datatype_t getDataType(float a) { -// return FLOAT; -//} -// -//template <> -//inline datatype_t getDataType(double a) { -// return DOUBLE; -//} -} /// namespace comms +} // namespace comms } // namespace raft diff --git a/cpp/include/raft/comms/comms_helper.hpp b/cpp/include/raft/comms/comms_helper.hpp index 7a033d4205..33b45a761c 100644 --- a/cpp/include/raft/comms/comms_helper.hpp +++ b/cpp/include/raft/comms/comms_helper.hpp @@ -14,26 +14,24 @@ * limitations under the License. 
*/ -#include +#include #include +#include #include -#include namespace raft { namespace comms { void build_comms_nccl_only(handle_t *handle, ncclComm_t comm, int size, - int rank) { - + int rank) { auto *raft_comm = new raft::comms::std_comms(comm, size, rank); - auto communicator = std::make_shared( - std::unique_ptr(raft_comm)); + auto communicator = + std::make_shared(std::unique_ptr(raft_comm)); handle->set_comms(communicator); } void build_comms_nccl_ucx(handle_t *handle, ncclComm_t comm, void *ucp_worker, - void *eps, int size, int rank) { - + void *eps, int size, int rank) { std::shared_ptr eps_sp = std::make_shared(new ucp_ep_h[size]); @@ -51,14 +49,15 @@ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t comm, void *ucp_worker, } } - auto *raft_comm = new raft::comms::std_comms(comm, (ucp_worker_h)ucp_worker, eps_sp, size, rank); - auto communicator = std::make_shared(std::unique_ptr(raft_comm)); + auto *raft_comm = new raft::comms::std_comms(comm, (ucp_worker_h)ucp_worker, + eps_sp, size, rank); + auto communicator = + std::make_shared(std::unique_ptr(raft_comm)); handle->set_comms(communicator); } - -bool test_collective_allreduce(const handle_t& handle) { - const comms_t& communicator = handle.get_comms(); +bool test_collective_allreduce(const handle_t &handle) { + const comms_t &communicator = handle.get_comms(); const int send = 1; @@ -68,8 +67,8 @@ bool test_collective_allreduce(const handle_t& handle) { temp_d.resize(1, stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); - communicator.allreduce(temp_d.data(), temp_d.data(), 1, datatype_t::INT, - SUM, stream); + communicator.allreduce(temp_d.data(), temp_d.data(), 1, datatype_t::INT, SUM, + stream); int temp_h = 0; CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); @@ -82,9 +81,8 @@ bool test_collective_allreduce(const handle_t& handle) { return temp_h == communicator.getSize(); } -bool test_pointToPoint_simple_send_recv(const handle_t& h, - int numTrials) { - const comms_t& communicator = h.get_comms(); +bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { + const comms_t &communicator = h.get_comms(); const int rank = communicator.getRank(); bool ret = true; diff --git a/cpp/include/raft/comms/nccl_helper.hpp b/cpp/include/raft/comms/nccl_helper.hpp index 5f367bafae..ea70896e32 100644 --- a/cpp/include/raft/comms/nccl_helper.hpp +++ b/cpp/include/raft/comms/nccl_helper.hpp @@ -1,6 +1,5 @@ #include - namespace raft { namespace comms { inline void ncclUniqueIdFromChar(ncclUniqueId *id, char *uniqueId, int size) { @@ -13,6 +12,5 @@ inline void get_unique_id(char *uid, int size) { memcpy(uid, id.internal, size); } -} -} - +} // namespace comms +} // namespace raft diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index b185934cc6..5e2be6ce77 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -34,10 +34,10 @@ #include #include #include -#include #include #include #include +#include #include @@ -45,7 +45,6 @@ #include - #define NCCL_CHECK(call) \ do { \ ncclResult_t status = call; \ @@ -53,17 +52,15 @@ ncclGetErrorString(status)); \ } while (0) -#define NCCL_CHECK_NO_THROW(call) \ - do { \ - ncclResult_t status = call; \ - if (status != ncclSuccess) { \ +#define NCCL_CHECK_NO_THROW(call) \ + do { \ + ncclResult_t status = call; \ + if (status != ncclSuccess) { \ printf("NCCL call='%s' failed. 
Reason:%s\n", #call, \ - ncclGetErrorString(status)); \ - } \ + ncclGetErrorString(status)); \ + } \ } while (0) - - namespace raft { namespace comms { @@ -89,11 +86,7 @@ size_t getDatatypeSize(const datatype_t datatype) { } } - - - -ncclDataType_t getNCCLDatatype( - const datatype_t datatype) { +ncclDataType_t getNCCLDatatype(const datatype_t datatype) { switch (datatype) { case CHAR: return ncclChar; @@ -127,7 +120,6 @@ ncclRedOp_t getNCCLOp(const op_t op) { } } - class std_comms : public comms_iface { public: std_comms() = delete; @@ -141,16 +133,16 @@ class std_comms : public comms_iface { * @param rank rank of the current worker */ std_comms(ncclComm_t comm, ucp_worker_h ucp_worker, - std::shared_ptr eps, int size, int rank) : _nccl_comm(comm), - _ucp_worker(ucp_worker), - _ucp_eps(eps), - _size(size), - _rank(rank), - _next_request_id(0) { - initialize(); - p2p_enabled = true; - }; - + std::shared_ptr eps, int size, int rank) + : _nccl_comm(comm), + _ucp_worker(ucp_worker), + _ucp_eps(eps), + _size(size), + _rank(rank), + _next_request_id(0) { + initialize(); + p2p_enabled = true; + }; /** * @brief constructor for collective-only operation @@ -159,18 +151,16 @@ class std_comms : public comms_iface { * @param rank rank of the current worker */ std_comms(ncclComm_t comm, int size, int rank) - : _nccl_comm(comm), _size(size), _rank(rank) { - initialize(); + : _nccl_comm(comm), _size(size), _rank(rank) { + initialize(); }; - virtual ~std_comms(){ - CUDA_CHECK_NO_THROW(cudaStreamDestroy(_stream)); - - CUDA_CHECK_NO_THROW(cudaFree(_sendbuff)); - CUDA_CHECK_NO_THROW(cudaFree(_recvbuff)); - } - + virtual ~std_comms() { + CUDA_CHECK_NO_THROW(cudaStreamDestroy(_stream)); + CUDA_CHECK_NO_THROW(cudaFree(_sendbuff)); + CUDA_CHECK_NO_THROW(cudaFree(_recvbuff)); + } void initialize() { CUDA_CHECK(cudaStreamCreate(&_stream)); @@ -179,13 +169,11 @@ class std_comms : public comms_iface { CUDA_CHECK(cudaMalloc(&_recvbuff, sizeof(int))); } - int getSize() const { return _size; } int getRank() const { return _rank; } - std::unique_ptr - commSplit(int color, int key) const { + std::unique_ptr commSplit(int color, int key) const { // Not supported by NCCL ASSERT(false, "ERROR: commSplit called but not yet supported in this comms " @@ -196,15 +184,13 @@ class std_comms : public comms_iface { CUDA_CHECK(cudaMemsetAsync(_sendbuff, 1, sizeof(int), _stream)); CUDA_CHECK(cudaMemsetAsync(_recvbuff, 1, sizeof(int), _stream)); - allreduce(_sendbuff, _recvbuff, 1, INT, - SUM, _stream); + allreduce(_sendbuff, _recvbuff, 1, INT, SUM, _stream); ASSERT(syncStream(_stream) == status_t::commStatusSuccess, "ERROR: syncStream failed. 
This can be caused by a failed rank."); } void get_request_id(request_t *req) const { - request_t req_id; if (this->_free_requests.empty()) @@ -217,9 +203,8 @@ class std_comms : public comms_iface { *req = req_id; } - void isend(const void *buf, int size, int dest, - int tag, request_t *request) const { - + void isend(const void *buf, int size, int dest, int tag, + request_t *request) const { ASSERT(_ucp_worker != nullptr, "ERROR: UCX comms not initialized on communicator."); @@ -235,7 +220,7 @@ class std_comms : public comms_iface { } void irecv(void *buf, int size, int source, int tag, - request_t *request) const { + request_t *request) const { ASSERT(_ucp_worker != nullptr, "ERROR: UCX comms not initialized on communicator."); @@ -246,15 +231,13 @@ class std_comms : public comms_iface { ucp_tag_t tag_mask = default_tag_mask; ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); - _ucp_handler.ucp_irecv(ucp_req, _ucp_worker, ep_ptr, buf, size, tag, tag_mask, - source); + _ucp_handler.ucp_irecv(ucp_req, _ucp_worker, ep_ptr, buf, size, tag, + tag_mask, source); _requests_in_flight.insert(std::make_pair(*request, ucp_req)); } - void waitall(int count, - request_t array_of_requests[]) const { - + void waitall(int count, request_t array_of_requests[]) const { ASSERT(_ucp_worker != nullptr, "ERROR: UCX comms not initialized on communicator."); @@ -324,65 +307,58 @@ class std_comms : public comms_iface { } } - void allreduce(const void *sendbuff, void *recvbuff, - int count, datatype_t datatype, - op_t op, cudaStream_t stream) const { - NCCL_CHECK(ncclAllReduce(sendbuff, recvbuff, count, getNCCLDatatype(datatype), - getNCCLOp(op), _nccl_comm, stream)); + void allreduce(const void *sendbuff, void *recvbuff, int count, + datatype_t datatype, op_t op, cudaStream_t stream) const { + NCCL_CHECK(ncclAllReduce(sendbuff, recvbuff, count, + getNCCLDatatype(datatype), getNCCLOp(op), + _nccl_comm, stream)); } - void bcast(void *buff, int count, datatype_t datatype, - int root, cudaStream_t stream) const { + void bcast(void *buff, int count, datatype_t datatype, int root, + cudaStream_t stream) const { NCCL_CHECK(ncclBroadcast(buff, buff, count, getNCCLDatatype(datatype), root, _nccl_comm, stream)); } - void reduce(const void *sendbuff, void *recvbuff, - int count, datatype_t datatype, op_t op, - int root, cudaStream_t stream) const { + void reduce(const void *sendbuff, void *recvbuff, int count, + datatype_t datatype, op_t op, int root, + cudaStream_t stream) const { NCCL_CHECK(ncclReduce(sendbuff, recvbuff, count, getNCCLDatatype(datatype), getNCCLOp(op), root, _nccl_comm, stream)); } - void allgather(const void *sendbuff, void *recvbuff, - int sendcount, datatype_t datatype, - cudaStream_t stream) const { + void allgather(const void *sendbuff, void *recvbuff, int sendcount, + datatype_t datatype, cudaStream_t stream) const { NCCL_CHECK(ncclAllGather(sendbuff, recvbuff, sendcount, getNCCLDatatype(datatype), _nccl_comm, stream)); } -// const void* sendbuf, void* recvbuf, -// const int recvcounts[], const int displs[], -// datatype_t datatype, cudaStream_t stream -// + // const void* sendbuf, void* recvbuf, + // const int recvcounts[], const int displs[], + // datatype_t datatype, cudaStream_t stream + // - void allgatherv(const void *sendbuf, void *recvbuf, - const int recvcounts[], - const int displs[], - datatype_t datatype, - cudaStream_t stream) const { + void allgatherv(const void *sendbuf, void *recvbuf, const int recvcounts[], + const int displs[], datatype_t datatype, + cudaStream_t 
stream) const { //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf //Listing 1 on page 4. for (int root = 0; root < _size; ++root) { size_t dtype_size = getDatatypeSize(datatype); NCCL_CHECK(ncclBroadcast( - sendbuf, - static_cast(recvbuf) + displs[root] * dtype_size, + sendbuf, static_cast(recvbuf) + displs[root] * dtype_size, recvcounts[root], getNCCLDatatype(datatype), root, _nccl_comm, stream)); } } - void reducescatter(const void *sendbuff, - void *recvbuff, int recvcount, - datatype_t datatype, op_t op, - cudaStream_t stream) const { + void reducescatter(const void *sendbuff, void *recvbuff, int recvcount, + datatype_t datatype, op_t op, cudaStream_t stream) const { NCCL_CHECK(ncclReduceScatter(sendbuff, recvbuff, recvcount, getNCCLDatatype(datatype), getNCCLOp(op), _nccl_comm, stream)); } - status_t syncStream( - cudaStream_t stream) const { + status_t syncStream(cudaStream_t stream) const { cudaError_t cudaErr; ncclResult_t ncclErr, ncclAsyncErr; while (1) { @@ -426,12 +402,12 @@ class std_comms : public comms_iface { bool p2p_enabled = false; comms_ucp_handler _ucp_handler; ucp_worker_h _ucp_worker; - std::shared_ptr _ucp_eps; + std::shared_ptr _ucp_eps; mutable request_t _next_request_id; - mutable std::unordered_map + mutable std::unordered_map _requests_in_flight; mutable std::unordered_set _free_requests; }; -} +} // namespace comms } // end namespace raft diff --git a/cpp/include/raft/comms/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp index ee22b59101..6a52141545 100644 --- a/cpp/include/raft/comms/ucp_helper.hpp +++ b/cpp/include/raft/comms/ucp_helper.hpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include #include +#include #include #include #include From 46a27eb4fb7806d22ba543a73af9c1c93fdcb9f5 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 13 May 2020 15:09:38 -0400 Subject: [PATCH 012/189] Cleaning up --- cpp/include/raft/comms/comms.hpp | 8 ++++---- cpp/include/raft/comms/comms_helper.hpp | 4 ++-- cpp/include/raft/comms/nccl_helper.hpp | 6 ++++-- cpp/include/raft/comms/std_comms.hpp | 6 ------ python/raft/common/handle.pyx | 11 +---------- python/raft/dask/common/comms_utils.pyx | 11 ----------- 6 files changed, 11 insertions(+), 35 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index fafcce03e1..cd47322314 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -26,13 +26,13 @@ enum datatype_t { CHAR, UINT8, INT, UINT, INT64, UINT64, FLOAT, DOUBLE }; enum op_t { SUM, PROD, MIN, MAX }; /** - * The resulting status of distributed stream synchronization - */ + * The resulting status of distributed stream synchronization + */ enum status_t { commStatusSuccess, // Synchronization successful commStatusError, // An error occured querying sync status - commStatusAbort -}; // A failure occurred in sync, queued operations aborted + commStatusAbort // A failure occurred in sync, queued operations aborted +}; class comms_iface { public: diff --git a/cpp/include/raft/comms/comms_helper.hpp b/cpp/include/raft/comms/comms_helper.hpp index 33b45a761c..0d4cca48a4 100644 --- a/cpp/include/raft/comms/comms_helper.hpp +++ b/cpp/include/raft/comms/comms_helper.hpp @@ -95,7 +95,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { //post receives for (int r = 0; r < communicator.getSize(); ++r) { if (r != rank) { - communicator.irecv(received_data.data() + request_idx, 1, r, 0, + communicator.irecv(received_data.data() + request_idx, sizeof(int), r, 0, requests.data() + request_idx); ++request_idx; } @@ -103,7 +103,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { for (int r = 0; r < communicator.getSize(); ++r) { if (r != rank) { - communicator.isend(&rank, 1, r, 0, requests.data() + request_idx); + communicator.isend(&rank, sizeof(int), r, 0, requests.data() + request_idx); ++request_idx; } } diff --git a/cpp/include/raft/comms/nccl_helper.hpp b/cpp/include/raft/comms/nccl_helper.hpp index ea70896e32..5f367bafae 100644 --- a/cpp/include/raft/comms/nccl_helper.hpp +++ b/cpp/include/raft/comms/nccl_helper.hpp @@ -1,5 +1,6 @@ #include + namespace raft { namespace comms { inline void ncclUniqueIdFromChar(ncclUniqueId *id, char *uniqueId, int size) { @@ -12,5 +13,6 @@ inline void get_unique_id(char *uid, int size) { memcpy(uid, id.internal, size); } -} // namespace comms -} // namespace raft +} +} + diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 5e2be6ce77..2644f46c7c 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -62,7 +62,6 @@ } while (0) namespace raft { - namespace comms { size_t getDatatypeSize(const datatype_t datatype) { @@ -333,11 +332,6 @@ class std_comms : public comms_iface { getNCCLDatatype(datatype), _nccl_comm, stream)); } - // const void* sendbuf, void* recvbuf, - // const int recvcounts[], const int displs[], - // datatype_t datatype, cudaStream_t stream - // - void allgatherv(const void *sendbuf, void *recvbuf, const int recvcounts[], const int displs[], datatype_t datatype, cudaStream_t stream) const { diff --git a/python/raft/common/handle.pyx b/python/raft/common/handle.pyx index 75c10876ea..3f01e3e34a 100644 --- a/python/raft/common/handle.pyx +++ b/python/raft/common/handle.pyx 
@@ -23,11 +23,6 @@ import raft from libcpp.memory cimport shared_ptr from raft.common.cuda cimport _Stream, _Error, cudaStreamSynchronize -# -# cdef extern from ".cuml/common/rmmAllocatorAdapterhpp" namespace "ML" nogil: -# cdef cppclass rmmAllocatorAdapter(deviceAllocator): -# pass - cdef class Handle: """ Handle is a lightweight python wrapper around the corresponding C++ class @@ -54,7 +49,7 @@ cdef class Handle: del handle # optional! """ - # ML::cumlHandle doesn't have copy operator. So, use pointer for the object + # handle_t doesn't have copy operator. So, use pointer for the object # python world cannot access to this raw object directly, hence use # 'size_t'! cdef size_t h @@ -66,10 +61,6 @@ cdef class Handle: def __cinit__(self, n_streams=0): self.n_streams = n_streams self.h = (new handle_t(n_streams)) - # cdef shared_ptr[deviceAllocator] rmmAlloc = ( - # shared_ptr[deviceAllocator](new rmmAllocatorAdapter())) - # cdef cumlHandle* h_ = self.h - # h_.setDeviceAllocator(rmmAlloc) def __dealloc__(self): h_ = self.h diff --git a/python/raft/dask/common/comms_utils.pyx b/python/raft/dask/common/comms_utils.pyx index 672c53aa28..be32be915a 100644 --- a/python/raft/dask/common/comms_utils.pyx +++ b/python/raft/dask/common/comms_utils.pyx @@ -43,17 +43,6 @@ cdef extern from "raft/comms/std_comms.hpp" namespace "raft::comms": pass -cdef extern from "raft/comms/comms.hpp" namespace "raft::comms": - - cdef cppclass comms_t: - pass - - cdef cppclass comms_iface: - pass - - - - cdef extern from "raft/comms/comms_helper.hpp" namespace "raft::comms": void build_comms_nccl_ucx(handle_t *handle, From fc6ba2ebd2d6739c3a8ab8ef5e8dbed200ceeb1f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 13 May 2020 15:13:53 -0400 Subject: [PATCH 013/189] Fixing python style --- python/raft/common/handle.pxd | 5 ++-- python/raft/dask/common/comms_utils.pyx | 35 +++++++++++-------------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/python/raft/common/handle.pxd b/python/raft/common/handle.pxd index ea40495a2a..fd3dc787ac 100644 --- a/python/raft/common/handle.pxd +++ b/python/raft/common/handle.pxd @@ -24,7 +24,8 @@ from libcpp.memory cimport shared_ptr cimport raft.common.cuda -cdef extern from "raft/mr/device/allocator.hpp" namespace "raft::mr::device" nogil: +cdef extern from "raft/mr/device/allocator.hpp" \ + namespace "raft::mr::device" nogil: cdef cppclass allocator: pass @@ -35,4 +36,4 @@ cdef extern from "raft/handle.hpp" namespace "raft" nogil: void set_stream(raft.common.cuda._Stream s) except + void set_device_allocator(shared_ptr[allocator] a) except + raft.common.cuda._Stream get_stream() except + - int get_num_internal_streams() except + \ No newline at end of file + int get_num_internal_streams() except + diff --git a/python/raft/dask/common/comms_utils.pyx b/python/raft/dask/common/comms_utils.pyx index be32be915a..f5edf3158c 100644 --- a/python/raft/dask/common/comms_utils.pyx +++ b/python/raft/dask/common/comms_utils.pyx @@ -24,7 +24,6 @@ from cpython.long cimport PyLong_AsVoidPtr from libcpp cimport bool - from libc.stdint cimport uintptr_t cdef extern from "nccl.h": @@ -46,17 +45,16 @@ cdef extern from "raft/comms/std_comms.hpp" namespace "raft::comms": cdef extern from "raft/comms/comms_helper.hpp" namespace "raft::comms": void build_comms_nccl_ucx(handle_t *handle, - ncclComm_t comm, - void *ucp_worker, - void *eps, - int size, - int rank) except + + ncclComm_t comm, + void *ucp_worker, + void *eps, + int size, + int rank) except + void 
build_comms_nccl_only(handle_t *handle, - ncclComm_t comm, - int size, - int rank) except + - + ncclComm_t comm, + int size, + int rank) except + bool test_collective_allreduce(const handle_t &h) except + bool test_pointToPoint_simple_send_recv(const handle_t &h, @@ -81,7 +79,6 @@ def perform_test_comms_send_recv(handle, n_trials): return test_pointToPoint_simple_send_recv(deref(h), n_trials) - def inject_comms_on_handle_coll_only(handle, nccl_inst, size, rank, verbose): """ Given a handle and initialized nccl comm, creates a cumlCommunicator @@ -99,9 +96,9 @@ def inject_comms_on_handle_coll_only(handle, nccl_inst, size, rank, verbose): nccl_comm_ = nccl_comm_size_t build_comms_nccl_only(handle_, - deref(nccl_comm_), - size, - rank) + deref(nccl_comm_), + size, + rank) def inject_comms_on_handle(handle, nccl_inst, ucp_worker, eps, size, @@ -134,10 +131,10 @@ def inject_comms_on_handle(handle, nccl_inst, ucp_worker, eps, size, nccl_comm_ = nccl_comm_size_t build_comms_nccl_ucx(handle_, - deref(nccl_comm_), - ucp_worker_st, - ucp_eps, - size, - rank) + deref(nccl_comm_), + ucp_worker_st, + ucp_eps, + size, + rank) free(ucp_eps) From 842f5335592d78563df62da359f8718c56cda261 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 13 May 2020 15:14:31 -0400 Subject: [PATCH 014/189] Fixing cpp style --- cpp/include/raft/comms/comms.hpp | 2 +- cpp/include/raft/comms/comms_helper.hpp | 7 ++++--- cpp/include/raft/comms/nccl_helper.hpp | 6 ++---- cpp/include/raft/cudart_utils.h | 16 ++++++++-------- cpp/include/raft/handle.hpp | 10 +++------- 5 files changed, 18 insertions(+), 23 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index cd47322314..3ef25a9285 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -31,7 +31,7 @@ enum op_t { SUM, PROD, MIN, MAX }; enum status_t { commStatusSuccess, // Synchronization successful commStatusError, // An error occured querying sync status - commStatusAbort // A failure occurred in sync, queued operations aborted + commStatusAbort // A failure occurred in sync, queued operations aborted }; class comms_iface { diff --git a/cpp/include/raft/comms/comms_helper.hpp b/cpp/include/raft/comms/comms_helper.hpp index 0d4cca48a4..a1236233cc 100644 --- a/cpp/include/raft/comms/comms_helper.hpp +++ b/cpp/include/raft/comms/comms_helper.hpp @@ -95,15 +95,16 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { //post receives for (int r = 0; r < communicator.getSize(); ++r) { if (r != rank) { - communicator.irecv(received_data.data() + request_idx, sizeof(int), r, 0, - requests.data() + request_idx); + communicator.irecv(received_data.data() + request_idx, sizeof(int), r, + 0, requests.data() + request_idx); ++request_idx; } } for (int r = 0; r < communicator.getSize(); ++r) { if (r != rank) { - communicator.isend(&rank, sizeof(int), r, 0, requests.data() + request_idx); + communicator.isend(&rank, sizeof(int), r, 0, + requests.data() + request_idx); ++request_idx; } } diff --git a/cpp/include/raft/comms/nccl_helper.hpp b/cpp/include/raft/comms/nccl_helper.hpp index 5f367bafae..ea70896e32 100644 --- a/cpp/include/raft/comms/nccl_helper.hpp +++ b/cpp/include/raft/comms/nccl_helper.hpp @@ -1,6 +1,5 @@ #include - namespace raft { namespace comms { inline void ncclUniqueIdFromChar(ncclUniqueId *id, char *uniqueId, int size) { @@ -13,6 +12,5 @@ inline void get_unique_id(char *uid, int size) { memcpy(uid, id.internal, size); } -} -} - +} // namespace comms +} // namespace 
raft diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 93543d09de..47e76ab916 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -112,14 +112,14 @@ class exception : public std::exception { // * @brief check for cuda runtime API errors but log error instead of raising // * exception. // */ - #define CUDA_CHECK_NO_THROW(call) \ - do { \ - cudaError_t status = call; \ - if (status != cudaSuccess) { \ - printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ - #call, __FILE__, __LINE__, cudaGetErrorString(status)); \ - } \ - } while (0) +#define CUDA_CHECK_NO_THROW(call) \ + do { \ + cudaError_t status = call; \ + if (status != cudaSuccess) { \ + printf("CUDA call='%s' at file=%s line=%d failed with %s\n", #call, \ + __FILE__, __LINE__, cudaGetErrorString(status)); \ + } \ + } while (0) /** helper method to get max usable shared mem per block parameter */ inline int get_shared_memory_per_block() { diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 055fdf9e3f..f8db324d90 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -149,8 +149,7 @@ class handle_t { } } - void set_comms( - std::shared_ptr communicator) { + void set_comms(std::shared_ptr communicator) { _communicator = communicator; } @@ -160,10 +159,7 @@ class handle_t { return *_communicator; } - bool comms_initialized() const { - return (nullptr != _communicator.get()); - } - + bool comms_initialized() const { return (nullptr != _communicator.get()); } const cudaDeviceProp& get_device_properties() const { std::lock_guard _(mutex_); @@ -176,7 +172,7 @@ class handle_t { private: std::shared_ptr _communicator; - + const int dev_id_; const int num_streams_; std::vector streams_; From 43f0d69f5275dd81d5f1794dcb5917133e1770cc Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 13 May 2020 15:20:56 -0400 Subject: [PATCH 015/189] Adding copyright headers --- python/raft.egg-info/PKG-INFO | 14 ++++++++++++++ python/raft.egg-info/SOURCES.txt | 13 +++++++++++++ python/raft.egg-info/dependency_links.txt | 1 + python/raft.egg-info/not-zip-safe | 1 + python/raft.egg-info/requires.txt | 1 + python/raft.egg-info/top_level.txt | 1 + python/raft/__init__.py | 15 +++++++++++++++ python/raft/common/__init__.py | 14 ++++++++++++++ python/raft/dask/__init__.py | 14 ++++++++++++++ python/raft/dask/common/__init__.py | 22 ++++++++++++++++++---- 10 files changed, 92 insertions(+), 4 deletions(-) create mode 100644 python/raft.egg-info/PKG-INFO create mode 100644 python/raft.egg-info/SOURCES.txt create mode 100644 python/raft.egg-info/dependency_links.txt create mode 100644 python/raft.egg-info/not-zip-safe create mode 100644 python/raft.egg-info/requires.txt create mode 100644 python/raft.egg-info/top_level.txt diff --git a/python/raft.egg-info/PKG-INFO b/python/raft.egg-info/PKG-INFO new file mode 100644 index 0000000000..6b926d9fe9 --- /dev/null +++ b/python/raft.egg-info/PKG-INFO @@ -0,0 +1,14 @@ +Metadata-Version: 1.1 +Name: raft +Version: 0+untagged.85.g5d37e9e.dirty +Summary: RAPIDS Analytics Frameworks Toolset +Home-page: UNKNOWN +Author: NVIDIA Corporation +Author-email: UNKNOWN +License: Apache +Description: UNKNOWN +Platform: UNKNOWN +Classifier: Intended Audience :: Developers +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 diff --git a/python/raft.egg-info/SOURCES.txt b/python/raft.egg-info/SOURCES.txt new file mode 100644 index 0000000000..18f16faa2b --- /dev/null +++ b/python/raft.egg-info/SOURCES.txt @@ -0,0 +1,13 @@ +setup.cfg +setup.py +raft.egg-info/PKG-INFO +raft.egg-info/SOURCES.txt +raft.egg-info/dependency_links.txt +raft.egg-info/not-zip-safe +raft.egg-info/requires.txt +raft.egg-info/top_level.txt +raft/common/cuda.cpp +raft/common/handle.cpp +raft/dask/common/comms_utils.cpp +raft/dask/common/nccl.cpp +raft/include_test/raft_include_test.cpp \ No newline at end of file diff --git a/python/raft.egg-info/dependency_links.txt b/python/raft.egg-info/dependency_links.txt new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/python/raft.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/python/raft.egg-info/not-zip-safe b/python/raft.egg-info/not-zip-safe new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/python/raft.egg-info/not-zip-safe @@ -0,0 +1 @@ + diff --git a/python/raft.egg-info/requires.txt b/python/raft.egg-info/requires.txt new file mode 100644 index 0000000000..f6629e0245 --- /dev/null +++ b/python/raft.egg-info/requires.txt @@ -0,0 +1 @@ +cython diff --git a/python/raft.egg-info/top_level.txt b/python/raft.egg-info/top_level.txt new file mode 100644 index 0000000000..72e8ffc0db --- /dev/null +++ b/python/raft.egg-info/top_level.txt @@ -0,0 +1 @@ +* diff --git a/python/raft/__init__.py b/python/raft/__init__.py index ff4479b422..b2431b4f6c 100644 --- a/python/raft/__init__.py +++ b/python/raft/__init__.py @@ -1 +1,16 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + from .include_test import raft_include_test diff --git a/python/raft/common/__init__.py b/python/raft/common/__init__.py index e69de29bb2..df8a4ae3b9 100644 --- a/python/raft/common/__init__.py +++ b/python/raft/common/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# \ No newline at end of file diff --git a/python/raft/dask/__init__.py b/python/raft/dask/__init__.py index e69de29bb2..df8a4ae3b9 100644 --- a/python/raft/dask/__init__.py +++ b/python/raft/dask/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# \ No newline at end of file diff --git a/python/raft/dask/common/__init__.py b/python/raft/dask/common/__init__.py index b8d7b28313..2ab93d9996 100644 --- a/python/raft/dask/common/__init__.py +++ b/python/raft/dask/common/__init__.py @@ -1,8 +1,22 @@ - +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# from raft.dask.common.comms import CommsContext, worker_state -from raft.dask.common.comms_utils import inject_comms_on_handle, \ - perform_test_comms_allreduce, perform_test_comms_send_recv, \ - inject_comms_on_handle_coll_only +from raft.dask.common.comms_utils import inject_comms_on_handle +from raft.dask.common.comms_utils import inject_comms_on_handle_coll_only +from raft.dask.common.comms_utils import perform_test_comms_allreduce +from raft.dask.common.comms_utils import perform_test_comms_send_recv From ac6e69920149374441811d35a7e115729117b7d8 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 13 May 2020 15:21:31 -0400 Subject: [PATCH 016/189] Adding init py for tests --- python/raft/test/__init__.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/raft/test/__init__.py b/python/raft/test/__init__.py index e69de29bb2..df8a4ae3b9 100644 --- a/python/raft/test/__init__.py +++ b/python/raft/test/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# \ No newline at end of file From c442d877558de56ef0099bf71c791f70a5f212ba Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 13 May 2020 15:24:52 -0400 Subject: [PATCH 017/189] Adding license headers and consistent namespacing --- cpp/include/raft/comms/comms.hpp | 2 +- cpp/include/raft/comms/comms_helper.hpp | 4 +++- cpp/include/raft/comms/nccl_helper.hpp | 18 ++++++++++++++++++ cpp/include/raft/comms/std_comms.hpp | 5 ++--- cpp/include/raft/comms/ucp_helper.hpp | 9 +++++++-- 5 files changed, 31 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 3ef25a9285..3a61207214 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/comms/comms_helper.hpp b/cpp/include/raft/comms/comms_helper.hpp index a1236233cc..2e71009eaf 100644 --- a/cpp/include/raft/comms/comms_helper.hpp +++ b/cpp/include/raft/comms/comms_helper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include #include #include diff --git a/cpp/include/raft/comms/nccl_helper.hpp b/cpp/include/raft/comms/nccl_helper.hpp index ea70896e32..d7a14ba8ba 100644 --- a/cpp/include/raft/comms/nccl_helper.hpp +++ b/cpp/include/raft/comms/nccl_helper.hpp @@ -1,3 +1,21 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + #include namespace raft { diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 2644f46c7c..2af64a262b 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -402,6 +402,5 @@ class std_comms : public comms_iface { _requests_in_flight; mutable std::unordered_set _free_requests; }; -} // namespace comms - +} // end namespace comms } // end namespace raft diff --git a/cpp/include/raft/comms/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp index 6a52141545..e092509fe2 100644 --- a/cpp/include/raft/comms/ucp_helper.hpp +++ b/cpp/include/raft/comms/ucp_helper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,16 @@ * limitations under the License. */ +#pragma once + #include #include #include #include #include -#pragma once +namespace raft { +namespace comms { typedef void (*dlsym_print_info)(ucp_ep_h, FILE *); typedef void (*dlsym_rec_free)(void *); typedef int (*dlsym_worker_progress)(ucp_worker_h); @@ -223,3 +226,5 @@ class comms_ucp_handler { UCS_PTR_STATUS(recv_result)); } }; +} // end namespace comms +} // end namespace raft From 32e63c159b1c7f12560b1c32ec317a6739ac7544 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 13 May 2020 15:26:46 -0400 Subject: [PATCH 018/189] More cleanup --- cpp/include/raft/comms/ucp_helper.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/raft/comms/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp index e092509fe2..06f687752f 100644 --- a/cpp/include/raft/comms/ucp_helper.hpp +++ b/cpp/include/raft/comms/ucp_helper.hpp @@ -24,6 +24,7 @@ namespace raft { namespace comms { + typedef void (*dlsym_print_info)(ucp_ep_h, FILE *); typedef void (*dlsym_rec_free)(void *); typedef int (*dlsym_worker_progress)(ucp_worker_h); From d377347c6a1fa83d8b9794b5b5b62b9f5341574b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 13 May 2020 16:11:20 -0400 Subject: [PATCH 019/189] Cleaning up raft.dask.common.Comms --- python/raft/dask/common/__init__.py | 3 +- python/raft/dask/common/comms.py | 349 +++++++++++++++++----------- python/raft/dask/common/utils.py | 3 +- python/raft/test/test_comms.py | 55 +++-- 4 files changed, 261 insertions(+), 149 deletions(-) diff --git a/python/raft/dask/common/__init__.py b/python/raft/dask/common/__init__.py index 2ab93d9996..9b85a32800 100644 --- a/python/raft/dask/common/__init__.py +++ b/python/raft/dask/common/__init__.py @@ -13,7 +13,8 @@ # limitations under the License. 
# -from raft.dask.common.comms import CommsContext, worker_state +from raft.dask.common.comms import Comms +from raft.dask.common.comms import local_handle from raft.dask.common.comms_utils import inject_comms_on_handle from raft.dask.common.comms_utils import inject_comms_on_handle_coll_only diff --git a/python/raft/dask/common/comms.py b/python/raft/dask/common/comms.py index 54e15c54c2..4ac47e25a6 100644 --- a/python/raft/dask/common/comms.py +++ b/python/raft/dask/common/comms.py @@ -30,25 +30,204 @@ import uuid +class Comms: + + """ + Initializes and manages underlying NCCL and UCX comms handles across + the workers of a Dask cluster. It is expected that `init()` will be + called explicitly. It is recommended to also call `destroy()` when + the comms are no longer needed so the underlying resources can be + cleaned up. This class is not meant to be thread-safe. + + Examples + -------- + .. code-block:: python + + # The following code block assumes we have wrapped a C++ + # function in a Python function called `run_algorithm`, + # which takes a `raft::handle_t` as a single argument. + # Once the `Comms` instance is successfully initialized, + # the underlying `raft::handle_t` will contain an instance + # of `raft::comms::comms_t` + + from dask_cuda import LocalCUDACluster + from dask.distributed import Client + + from raft.dask.common import Comms, local_handle + + cluster = LocalCUDACluster() + client = Client(cluster) + + def _use_comms(sessionId): + return run_algorithm(local_handle(sessionId)) + + comms = Comms(client=client) + comms.init() + + futures = [client.submit(_use_comms, + comms.sessionId, + workers=[w], + pure=False) # Don't memoize + for w in cb.worker_addresses] + wait(dfs, timeout=5) + + comms.destroy() + client.close() + cluster.close() + """ + + def __init__(self, comms_p2p=False, client=None, verbose=False, + streams_per_handle=0): + """ + Construct a new CommsContext instance + + Parameters + ---------- + comms_p2p : bool + Initialize UCX endpoints? + client : dask.distributed.Client [optional] + Dask client to use + verbose : bool + Print verbose logging + """ + self.client = client if client is not None else default_client() + self.comms_p2p = comms_p2p + + self.streams_per_handle = streams_per_handle + + self.sessionId = uuid.uuid4().bytes + + self.nccl_initialized = False + self.ucx_initialized = False + + self.verbose = verbose + + if verbose: + print("Initializing comms!") + + def __del__(self): + if self.nccl_initialized or self.ucx_initialized: + self.destroy() + + def worker_info(self, workers): + """ + Builds a dictionary of { (worker_address, worker_port) : + (worker_rank, worker_port ) } + """ + ranks = _func_worker_ranks(workers) + ports = _func_ucp_ports(self.client, workers) \ + if self.comms_p2p else None + + output = {} + for k in ranks.keys(): + output[k] = {"rank": ranks[k]} + if self.comms_p2p: + output[k]["port"] = ports[k] + return output + + def init(self, workers=None): + """ + Initializes the underlying comms. NCCL is required but + UCX is only initialized if `comms_p2p == True` + + Parameters + ---------- + + workers : Sequence + Unique collection of workers for initializing comms. 
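
For reference, the `worker_info` method above returns a plain dict keyed by worker address. A sketch of its shape for a hypothetical two-worker cluster (addresses, ranks, and ports are invented here, and the "port" key appears only when `comms_p2p=True`):

.. code-block:: python

    {
        "tcp://10.0.0.1:41231": {"rank": 0, "port": 57210},
        "tcp://10.0.0.2:35677": {"rank": 1, "port": 57211},
    }
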
+ """ + + self.worker_addresses = list(set( + self.client.scheduler_info()["workers"].keys() + if workers is None else workers)) + + if self.nccl_initialized or self.ucx_initialized: + warnings.warn("Comms have already been initialized.") + return + + worker_info = self.worker_info(self.worker_addresses) + worker_info = {w: worker_info[w] for w in self.worker_addresses} + + self.uniqueId = nccl.get_unique_id() + + self.client.run(_func_init_all, + self.sessionId, + self.uniqueId, + self.comms_p2p, + worker_info, + self.verbose, + self.streams_per_handle, + workers=self.worker_addresses, + wait=True) + + self.nccl_initialized = True + + if self.comms_p2p: + self.ucx_initialized = True + + if self.verbose: + print("Initialization complete.") + + def destroy(self): + """ + Shuts down initialized comms and cleans up resources. This will + be called automatically by the Comms destructor, but may be called + earlier to save resources. + """ + self.client.run(_func_destroy_all, + self.sessionId, + self.comms_p2p, + self.verbose, + wait=True, + workers=self.worker_addresses) + + if self.verbose: + print("Destroying comms.") + + self.nccl_initialized = False + self.ucx_initialized = False + + +def local_handle(sessionId): + """Simple helper function for retrieving the local handle_t instance + for a comms session on a worker. + + Parameters + ---------- + sessionId : str + session identifier from an initialized comms instance + + Returns + ------- + + handle : raft.Handle or None + """ + state = worker_state(sessionId) + return state["handle"] if "handle" in state else None + + def worker_state(sessionId=None): """ Retrieves cuML comms state on local worker for the given sessionId, creating a new session if it does not exist. If no session id is given, returns the state dict for all sessions. - :param sessionId: - :return: + + Parameters + ---------- + sessionId : str + session identifier from initialized comms instance """ worker = get_worker() - if not hasattr(worker, "_cuml_comm_state"): - worker._cuml_comm_state = {} - if sessionId is not None and sessionId not in worker._cuml_comm_state: + if not hasattr(worker, "_raft_comm_state"): + worker._raft_comm_state = {} + if sessionId is not None and sessionId not in worker._raft_comm_state: # Build state for new session and mark session creation time - worker._cuml_comm_state[sessionId] = {"ts": time.time()} + worker._raft_comm_state[sessionId] = {"ts": time.time()} if sessionId is not None: - return worker._cuml_comm_state[sessionId] - return worker._cuml_comm_state + return worker._raft_comm_state[sessionId] + return worker._raft_comm_state def get_ucx(): @@ -109,10 +288,14 @@ async def _func_init_all(sessionId, uniqueId, comms_p2p, def _func_init_nccl(sessionId, uniqueId): """ Initialize ncclComm_t on worker - :param workerId: int ID of the current worker running the function - :param nWorkers: int Number of workers in the cluster - :param uniqueId: array[byte] The NCCL unique Id generated from the - client. + + Parameters + ---------- + sessionId : str + session identifier from a comms instance + uniqueId : array[byte] + The NCCL unique Id generated from the + client. 
""" wid = worker_state(sessionId)["wid"] @@ -128,12 +311,13 @@ def _func_init_nccl(sessionId, uniqueId): def _func_build_handle_p2p(sessionId, streams_per_handle, verbose): """ - Builds a cumlHandle on the current worker given the initialized comms - :param nccl_comm: ncclComm_t Initialized NCCL comm - :param eps: size_t initialized endpoints - :param nWorkers: int number of workers in cluster - :param workerId: int Rank of current worker - :return: + Builds a handle_t on the current worker given the initialized comms + + Parameters + ---------- + sessionId : str id to reference state for current comms instance. + streams_per_handle : int number of internal streams to create + verbose : bool print verbose logging output """ ucp_worker = get_ucx().get_worker() session_state = worker_state(sessionId) @@ -152,11 +336,13 @@ def _func_build_handle_p2p(sessionId, streams_per_handle, verbose): def _func_build_handle(sessionId, streams_per_handle, verbose): """ - Builds a cumlHandle on the current worker given the initialized comms - :param nccl_comm: ncclComm_t Initialized NCCL comm - :param nWorkers: int number of workers in cluster - :param workerId: int Rank of current worker - :return: + Builds a handle_t on the current worker given the initialized comms + + Parameters + ---------- + sessionId : str id to reference state for current comms instance. + streams_per_handle : int number of internal streams to create + verbose : bool print verbose logging output """ handle = Handle(streams_per_handle) @@ -181,9 +367,13 @@ def _func_store_initial_state(nworkers, sessionId, uniqueId, wid): async def _func_ucp_create_endpoints(sessionId, worker_info): """ Runs on each worker to create ucp endpoints to all other workers - :param sessionId: uuid unique id for this instance - :param worker_info: dict Maps worker address to rank & UCX port - :param r: float a random number to stop the function from being cached + + Parameters + ---------- + sessionId : str + uuid unique id for this instance + worker_info : dict + Maps worker addresses to NCCL ranks & UCX ports """ dask_worker = get_worker() local_address = dask_worker.address @@ -220,110 +410,3 @@ def _func_worker_ranks(workers): Builds a dictionary of { (worker_address, worker_port) : worker_rank } """ return dict(list(zip(workers, range(len(workers))))) - - -class CommsContext: - - """ - A base class to initialize and manage underlying NCCL and UCX - comms handles across a Dask cluster. Classes extending CommsContext - are responsible for calling `self.init()` to initialize the comms. - Classes that extend or use the CommsContext are also responsible for - calling `destroy()` to clean up the underlying comms. - - This class is not meant to be thread-safe. - """ - - def __init__(self, comms_p2p=False, client=None, verbose=False, - streams_per_handle=0): - """ - Construct a new CommsContext instance - :param comms_p2p: bool Should p2p comms be initialized? 
- """ - self.client = client if client is not None else default_client() - self.comms_p2p = comms_p2p - - self.streams_per_handle = streams_per_handle - - self.sessionId = uuid.uuid4().bytes - - self.nccl_initialized = False - self.ucx_initialized = False - - self.verbose = verbose - - if verbose: - print("Initializing comms!") - - def __del__(self): - if self.nccl_initialized or self.ucx_initialized: - self.destroy() - - def worker_info(self, workers): - """ - Builds a dictionary of { (worker_address, worker_port) : - (worker_rank, worker_port ) } - """ - ranks = _func_worker_ranks(workers) - ports = _func_ucp_ports(self.client, workers) \ - if self.comms_p2p else None - - output = {} - for k in ranks.keys(): - output[k] = {"rank": ranks[k]} - if self.comms_p2p: - output[k]["port"] = ports[k] - return output - - def init(self, workers=None): - """ - Initializes the underlying comms. NCCL is required but - UCX is only initialized if `comms_p2p == True` - """ - - self.worker_addresses = list(set((self.client.has_what().keys() - if workers is None else workers))) - - if self.nccl_initialized: - warnings.warn("CommsContext has already been initialized.") - return - - worker_info = self.worker_info(self.worker_addresses) - worker_info = {w: worker_info[w] for w in self.worker_addresses} - - self.uniqueId = nccl.get_unique_id() - - self.client.run(_func_init_all, - self.sessionId, - self.uniqueId, - self.comms_p2p, - worker_info, - self.verbose, - self.streams_per_handle, - workers=self.worker_addresses, - wait=True) - - self.nccl_initialized = True - - if self.comms_p2p: - self.ucx_initialized = True - - if self.verbose: - print("Initialization complete.") - - def destroy(self): - """ - Shuts down initialized comms and cleans up resources. - """ - self.client.run(_func_destroy_all, - self.sessionId, - self.comms_p2p, - self.verbose, - wait=True, - workers=self.worker_addresses) - - if self.verbose: - print("Destroying comms.") - - self.nccl_initialized = False - self.ucx_initialized = False diff --git a/python/raft/dask/common/utils.py b/python/raft/dask/common/utils.py index ff5a0b4538..8bb17c5ed8 100644 --- a/python/raft/dask/common/utils.py +++ b/python/raft/dask/common/utils.py @@ -17,7 +17,8 @@ import random import time -from dask.distributed import default_client, wait +from dask.distributed import default_client +from dask.distributed import wait from asyncio import InvalidStateError diff --git a/python/raft/test/test_comms.py b/python/raft/test/test_comms.py index 226a471f3b..c95bb86b65 100644 --- a/python/raft/test/test_comms.py +++ b/python/raft/test/test_comms.py @@ -15,11 +15,11 @@ import pytest -import random +from dask.distributed import Client +from dask.distributed import wait -from dask.distributed import Client, wait - -from raft.dask.common import CommsContext, worker_state +from raft.dask.common import Comms +from raft.dask.common import local_handle from raft.dask.common import perform_test_comms_send_recv from raft.dask.common import perform_test_comms_allreduce @@ -31,7 +31,7 @@ def test_comms_init_no_p2p(cluster): client = Client(cluster) try: - cb = CommsContext(comms_p2p=False) + cb = Comms() cb.init() assert cb.nccl_initialized is True @@ -43,27 +43,54 @@ def test_comms_init_no_p2p(cluster): client.close() -def func_test_allreduce(sessionId, r): - handle = worker_state(sessionId)["handle"] +def func_test_allreduce(sessionId): + handle = local_handle(sessionId) return perform_test_comms_allreduce(handle) -def func_test_send_recv(sessionId, n_trials, r): - handle = 
worker_state(sessionId)["handle"] +def func_test_send_recv(sessionId, n_trials): + handle = local_handle(sessionId) return perform_test_comms_send_recv(handle, n_trials) +def test_handles(cluster): + + client = Client(cluster) + + def _has_handle(sessionId): + return local_handle(sessionId) is not None + + try: + cb = Comms() + cb.init() + + dfs = [client.submit(_has_handle, + cb.sessionId, + pure=False, + workers=[w]) + for w in cb.worker_addresses] + wait(dfs, timeout=5) + + assert all(client.compute(dfs, sync=True)) + + finally: + cb.destroy() + client.close() + + @pytest.mark.nccl def test_allreduce(cluster): client = Client(cluster) try: - cb = CommsContext() + cb = Comms() cb.init() - dfs = [client.submit(func_test_allreduce, cb.sessionId, - random.random(), workers=[w]) + dfs = [client.submit(func_test_allreduce, + cb.sessionId, + pure=False, + workers=[w]) for w in cb.worker_addresses] wait(dfs, timeout=5) @@ -82,13 +109,13 @@ def test_send_recv(n_trials, ucx_cluster): try: - cb = CommsContext(comms_p2p=True, verbose=True) + cb = Comms(comms_p2p=True, verbose=True) cb.init() dfs = [client.submit(func_test_send_recv, cb.sessionId, n_trials, - random.random(), + pure=False, workers=[w]) for w in cb.worker_addresses] From e1b4ea73b8fde13ca7edb7ed10dc4dff7393ff07 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 13 May 2020 16:13:59 -0400 Subject: [PATCH 020/189] Ignoring raft egg artifacts --- .gitignore | 1 + python/raft.egg-info/PKG-INFO | 14 -------------- python/raft.egg-info/SOURCES.txt | 13 ------------- python/raft.egg-info/dependency_links.txt | 1 - python/raft.egg-info/not-zip-safe | 1 - python/raft.egg-info/requires.txt | 1 - python/raft.egg-info/top_level.txt | 1 - 7 files changed, 1 insertion(+), 31 deletions(-) delete mode 100644 python/raft.egg-info/PKG-INFO delete mode 100644 python/raft.egg-info/SOURCES.txt delete mode 100644 python/raft.egg-info/dependency_links.txt delete mode 100644 python/raft.egg-info/not-zip-safe delete mode 100644 python/raft.egg-info/requires.txt delete mode 100644 python/raft.egg-info/top_level.txt diff --git a/.gitignore b/.gitignore index ecc92d8dce..60a43f6b54 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,7 @@ log .ipynb_checkpoints .DS_Store dask-worker-space/ +*.egg-info/ ## eclipse .project .cproject diff --git a/python/raft.egg-info/PKG-INFO b/python/raft.egg-info/PKG-INFO deleted file mode 100644 index 6b926d9fe9..0000000000 --- a/python/raft.egg-info/PKG-INFO +++ /dev/null @@ -1,14 +0,0 @@ -Metadata-Version: 1.1 -Name: raft -Version: 0+untagged.85.g5d37e9e.dirty -Summary: RAPIDS Analytics Frameworks Toolset -Home-page: UNKNOWN -Author: NVIDIA Corporation -Author-email: UNKNOWN -License: Apache -Description: UNKNOWN -Platform: UNKNOWN -Classifier: Intended Audience :: Developers -Classifier: Programming Language :: Python -Classifier: Programming Language :: Python :: 3.6 -Classifier: Programming Language :: Python :: 3.7 diff --git a/python/raft.egg-info/SOURCES.txt b/python/raft.egg-info/SOURCES.txt deleted file mode 100644 index 18f16faa2b..0000000000 --- a/python/raft.egg-info/SOURCES.txt +++ /dev/null @@ -1,13 +0,0 @@ -setup.cfg -setup.py -raft.egg-info/PKG-INFO -raft.egg-info/SOURCES.txt -raft.egg-info/dependency_links.txt -raft.egg-info/not-zip-safe -raft.egg-info/requires.txt -raft.egg-info/top_level.txt -raft/common/cuda.cpp -raft/common/handle.cpp -raft/dask/common/comms_utils.cpp -raft/dask/common/nccl.cpp -raft/include_test/raft_include_test.cpp \ No newline at end of file diff --git 
a/python/raft.egg-info/dependency_links.txt b/python/raft.egg-info/dependency_links.txt deleted file mode 100644 index 8b13789179..0000000000 --- a/python/raft.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/python/raft.egg-info/not-zip-safe b/python/raft.egg-info/not-zip-safe deleted file mode 100644 index 8b13789179..0000000000 --- a/python/raft.egg-info/not-zip-safe +++ /dev/null @@ -1 +0,0 @@ - diff --git a/python/raft.egg-info/requires.txt b/python/raft.egg-info/requires.txt deleted file mode 100644 index f6629e0245..0000000000 --- a/python/raft.egg-info/requires.txt +++ /dev/null @@ -1 +0,0 @@ -cython diff --git a/python/raft.egg-info/top_level.txt b/python/raft.egg-info/top_level.txt deleted file mode 100644 index 72e8ffc0db..0000000000 --- a/python/raft.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -* From 736696b602e0cf864ae4108de8461c7b0b2bc1d7 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 13 May 2020 16:20:20 -0400 Subject: [PATCH 021/189] Cleaning up raft dask utils --- python/raft/dask/common/utils.py | 181 ++----------------------------- 1 file changed, 8 insertions(+), 173 deletions(-) diff --git a/python/raft/dask/common/utils.py b/python/raft/dask/common/utils.py index 8bb17c5ed8..c92fbf3e47 100644 --- a/python/raft/dask/common/utils.py +++ b/python/raft/dask/common/utils.py @@ -13,25 +13,7 @@ # limitations under the License. # -import os -import random -import time - from dask.distributed import default_client -from dask.distributed import wait - -from asyncio import InvalidStateError - -from threading import Lock - - -def get_visible_devices(): - """ - Return a list of the CUDA_VISIBLE_DEVICES - :return: list[int] visible devices - """ - # TODO: Shouldn't have to split on every call - return os.environ["CUDA_VISIBLE_DEVICES"].split(",") def get_client(client=None): @@ -41,164 +23,17 @@ def get_client(client=None): def parse_host_port(address): """ Given a string address with host/port, build a tuple(host, port) - :param address: string address to parse - :return: tuple(host, port) + + Parameters + ---------- + address: string address to parse + + Returns + ------- + tuple with host and port info : tuple(host, port) """ if '://' in address: address = address.rsplit('://', 1)[1] host, port = address.split(':') port = int(port) return host, port - - -def build_host_dict(workers): - """ - Builds a dict to map the set of ports running on each host to - the hostname. - :param workers: list(tuple(host, port)) list of worker addresses - :return: dict(host, set(port)) - """ - hosts = set(map(lambda x: parse_host_port(x), workers)) - hosts_dict = {} - for host, port in hosts: - if host not in hosts_dict: - hosts_dict[host] = set([port]) - else: - hosts_dict[host].add(port) - - return hosts_dict - - -def persist_across_workers(client, objects, workers=None): - """ - Calls persist on the 'objects' ensuring they are spread - across the workers on 'workers'. 
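
Given the implementation above, `parse_host_port` accepts addresses with or without a protocol prefix; for example:

.. code-block:: python

    from raft.dask.common.utils import parse_host_port

    assert parse_host_port("tcp://10.0.0.1:8786") == ("10.0.0.1", 8786)
    assert parse_host_port("10.0.0.1:8786") == ("10.0.0.1", 8786)
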
- - Parameters - ---------- - client : dask.distributed.Client - objects : list - Dask distributed objects to be persisted - workers : list or None - List of workers across which to persist objects - If None, then all workers attached to 'client' will be used - """ - if workers is None: - workers = client.has_what().keys() # Default to all workers - return client.persist(objects, workers={o: workers for o in objects}) - - -def raise_exception_from_futures(futures): - """Raises a RuntimeError if any of the futures indicates an exception""" - errs = [f.exception() for f in futures if f.exception()] - if errs: - raise RuntimeError("%d of %d worker jobs failed: %s" % ( - len(errs), len(futures), ", ".join(map(str, errs)) - )) - - -def wait_and_raise_from_futures(futures): - """ - Returns the collected futures after all the futures - have finished and do not indicate any exceptions. - """ - wait(futures) - raise_exception_from_futures(futures) - return futures - - -def raise_mg_import_exception(): - raise Exception("cuML has not been built with multiGPU support " - "enabled. Build with the --multigpu flag to" - " enable multiGPU support.") - - -class MultiHolderLock: - """ - A per-process synchronization lock allowing multiple concurrent holders - at any one time. This is used in situations where resources might be - limited and it's important that the number of concurrent users of - the resources are constained. - - This lock is serializable, but relies on a Python threading.Lock - underneath to properly synchronize internal state across threads. - Note that this lock is only intended to be used per-process and - the underlying threading.Lock will not be serialized. - """ - - def __init__(self, n): - """ - Initialize the lock - :param n : integer the maximum number of concurrent holders - """ - self.n = n - self.current_tasks = 0 - self.lock = Lock() - - def _acquire(self, blocking=True, timeout=10): - lock_acquired = False - - inner_lock_acquired = self.lock.acquire(blocking, timeout) - - if inner_lock_acquired and self.current_tasks < self.n - 1: - self.current_tasks += 1 - lock_acquired = True - self.lock.release() - - return lock_acquired - - def acquire(self, blocking=True, timeout=10): - """ - Acquire the lock. - :param blocking : bool will block if True - :param timeout : a timeout (in seconds) to wait for the lock - before failing. - :return : True if lock was acquired successfully, False otherwise - """ - - t = time.time() - - lock_acquired = self._acquire(blocking, timeout) - - while blocking and not lock_acquired: - - if time.time() - t > timeout: - raise TimeoutError() - - lock_acquired = self.acquire(blocking, timeout) - time.sleep(random.uniform(0, 0.01)) - - return lock_acquired - - def __getstate__(self): - d = self.__dict__.copy() - if "lock" in d: - del d["lock"] - return d - - def __setstate__(self, d): - d["lock"] = Lock() - self.__dict__ = d - - def release(self, blocking=True, timeout=10): - """ - Release a hold on the lock to allow another holder. Note that - while Python's threading.Lock does not have options for blocking - or timeout in release(), this lock uses a threading.Lock - internally and so will need to acquire that lock in order - to properly synchronize the underlying state. - :param blocking : bool will bock if True - :param timeout : a timeout (in seconds) to wait for the lock - before failing. - :return : True if lock was released successfully, False otherwise. 
- """ - - if self.current_tasks == 0: - raise InvalidStateError("Cannot release lock when no " - "concurrent tasks are executing") - - lock_acquired = self.lock.acquire(blocking, timeout) - if lock_acquired: - self.current_tasks -= 1 - self.lock.release() - return lock_acquired From 55d9dfdc191bb923cce9c1e4f44aa60dce85c5dc Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 13 May 2020 16:35:24 -0400 Subject: [PATCH 022/189] More cleanup, copyright headers, and docs --- python/raft/common/__init__.py | 5 ++- python/raft/common/cuda.pyx | 6 ++-- python/raft/common/handle.pxd | 2 +- python/raft/common/handle.pyx | 23 ++++++------ python/raft/dask/__init__.py | 4 ++- python/raft/dask/common/comms_utils.pyx | 48 ++++++++++++++++++------- python/raft/dask/common/nccl.pyx | 36 ++++++++++++++----- python/raft/dask/common/ucx.py | 2 +- python/raft/dask/common/utils.py | 2 +- 9 files changed, 86 insertions(+), 42 deletions(-) diff --git a/python/raft/common/__init__.py b/python/raft/common/__init__.py index df8a4ae3b9..ac84a7a93a 100644 --- a/python/raft/common/__init__.py +++ b/python/raft/common/__init__.py @@ -11,4 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# \ No newline at end of file +# + +from raft.common.cuda import Stream +from raft.common.handle import Handle \ No newline at end of file diff --git a/python/raft/common/cuda.pyx b/python/raft/common/cuda.pyx index 09f347058f..baa46bfef8 100644 --- a/python/raft/common/cuda.pyx +++ b/python/raft/common/cuda.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2020, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -43,8 +43,8 @@ cdef class Stream: .. code-block:: python - import cuml - stream = cuml.cuda.Stream() + from raft.common.cuda import Stream + stream = Stream() stream.sync() del stream # optional! """ diff --git a/python/raft/common/handle.pxd b/python/raft/common/handle.pxd index fd3dc787ac..dfdcdb929b 100644 --- a/python/raft/common/handle.pxd +++ b/python/raft/common/handle.pxd @@ -1,5 +1,5 @@ # -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2020, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/raft/common/handle.pyx b/python/raft/common/handle.pyx index 3f01e3e34a..3659cb3669 100644 --- a/python/raft/common/handle.pyx +++ b/python/raft/common/handle.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2020, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -26,25 +26,24 @@ from raft.common.cuda cimport _Stream, _Error, cudaStreamSynchronize cdef class Handle: """ Handle is a lightweight python wrapper around the corresponding C++ class - of cumlHandle exposed by cuML's C++ interface. Refer to the header file - cuml/cuml.hpp for interface level details of this struct + of handle_t exposed by RAFT's C++ interface. Refer to the header file + raft/handle.hpp for interface level details of this struct Examples -------- .. 
code-block:: python - import cuml - stream = cuml.cuda.Stream() - handle = cuml.Handle() + from raft.common import Stream, Handle + stream = Stream() + handle = Handle() handle.setStream(stream) - handle.enableRMM() # Enable RMM as the device-side allocator - # call ML algos here + # call algos here # final sync of all work launched in the stream of this handle - # this is same as `cuml.cuda.Stream.sync()` call, but safer in case - # the default stream inside the `cumlHandle` is being used + # this is same as `raft.cuda.Stream.sync()` call, but safer in case + # the default stream inside the `handle_t` is being used handle.sync() del handle # optional! """ @@ -75,8 +74,8 @@ cdef class Handle: """ Issues a sync on the stream set for this handle. - Once we make `cuml.cuda.Stream` as a mandatory option for creating - `cuml.Handle`, this should go away + Once we make `raft.common.cuda.Stream` as a mandatory option for creating + `raft.common.Handle`, this should go away """ cdef handle_t* h_ = self.h cdef _Stream stream = h_.get_stream() diff --git a/python/raft/dask/__init__.py b/python/raft/dask/__init__.py index df8a4ae3b9..e58ac25f47 100644 --- a/python/raft/dask/__init__.py +++ b/python/raft/dask/__init__.py @@ -11,4 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# \ No newline at end of file +# + +from raft.dask.common.comms import Comms \ No newline at end of file diff --git a/python/raft/dask/common/comms_utils.pyx b/python/raft/dask/common/comms_utils.pyx index f5edf3158c..90fd97671f 100644 --- a/python/raft/dask/common/comms_utils.pyx +++ b/python/raft/dask/common/comms_utils.pyx @@ -41,7 +41,6 @@ cdef extern from "raft/comms/std_comms.hpp" namespace "raft::comms": cdef cppclass std_comms: pass - cdef extern from "raft/comms/comms_helper.hpp" namespace "raft::comms": void build_comms_nccl_ucx(handle_t *handle, @@ -64,7 +63,11 @@ cdef extern from "raft/comms/comms_helper.hpp" namespace "raft::comms": def perform_test_comms_allreduce(handle): """ Performs an allreduce on the current worker - :param handle: Handle handle containing cumlCommunicator to use + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use """ cdef const handle_t* h = handle.getHandle() return test_collective_allreduce(deref(h)) @@ -73,7 +76,11 @@ def perform_test_comms_allreduce(handle): def perform_test_comms_send_recv(handle, n_trials): """ Performs a p2p send/recv on the current worker - :param handle: Handle handle containing cumlCommunicator to use + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use """ cdef const handle_t *h = handle.getHandle() return test_pointToPoint_simple_send_recv(deref(h), n_trials) @@ -83,10 +90,18 @@ def inject_comms_on_handle_coll_only(handle, nccl_inst, size, rank, verbose): """ Given a handle and initialized nccl comm, creates a cumlCommunicator instance and injects it into the handle. 
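
Tying the pieces together, the collective-only injection path looks roughly like this on each worker. This is a sketch only: `nranks`/`rank` are given example values, and the unique id is generated in-process here, whereas the real code generates it once on the client and ships it to every worker:

.. code-block:: python

    from raft.common.handle import Handle
    from raft.dask.common.nccl import nccl
    from raft.dask.common.comms_utils import inject_comms_on_handle_coll_only

    nranks, rank = 2, 0                 # example values; rank differs per worker
    uid = nccl.get_unique_id()          # in practice shipped from the client

    n = nccl()
    n.init(nranks, uid, rank)           # same uid everywhere, distinct ranks

    handle = Handle()                   # default stream count assumed
    inject_comms_on_handle_coll_only(handle, n, nranks, rank, False)
    # `handle` now carries a comms_t and can be passed to C++ algorithms
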
- :param handle: Handle cumlHandle to inject comms into - :param nccl_inst: ncclComm_t initialized nccl comm - :param size: int number of workers in cluster - :param rank: int rank of current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + nccl_inst : raft.dask.common.nccl + Initialized nccl comm to use + size : int + Number of workers in cluster + rank : int + Rank of current worker + """ cdef size_t handle_size_t = handle.getHandle() @@ -106,12 +121,19 @@ def inject_comms_on_handle(handle, nccl_inst, ucp_worker, eps, size, """ Given a handle and initialized comms, creates a cumlCommunicator instance and injects it into the handle. - :param handle: Handle cumlHandle to inject comms into - :param nccl_inst: ncclComm_t initialized nccl comm - :param ucp_worker: size_t initialized ucp_worker_h instance - :param eps: size_t array of initialized ucp_ep_h instances - :param size: int number of workers in cluster - :param rank: int rank of current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + nccl_inst : raft.dask.common.nccl + Initialized nccl comm to use + ucp_worker : size_t pointer to initialized ucp_worker_h instance + eps: size_t pointer to array of initialized ucp_ep_h instances + size : int + Number of workers in cluster + rank : int + Rank of current worker """ cdef size_t *ucp_eps = malloc(len(eps)*sizeof(size_t)) diff --git a/python/raft/dask/common/nccl.pyx b/python/raft/dask/common/nccl.pyx index c9d9fe0426..b72bd3d80b 100644 --- a/python/raft/dask/common/nccl.pyx +++ b/python/raft/dask/common/nccl.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2020, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -74,7 +74,10 @@ def unique_id(): Returns a new ncclUniqueId converted to a character array that can be safely serialized and shared to a remote worker. - :return: string a 128-byte unique id string + + Returns + ------- + 128-byte unique id : str """ cdef char *uid = malloc(NCCL_UNIQUE_ID_BYTES * sizeof(char)) get_unique_id(uid, NCCL_UNIQUE_ID_BYTES) @@ -108,16 +111,22 @@ cdef class nccl: def get_unique_id(): """ Returns a new nccl unique id - :return: string nccl unique id + + Returns + ------- + nccl unique id : str """ return unique_id() def init(self, nranks, commId, rank): """ Construct a nccl-py object - :param nranks: int size of clique - :param commId: string unique id from client - :param rank: int rank of current worker + + Parameters + ---------- + nranks : int size of clique + commId : string unique id from client + rank : int rank of current worker """ self.size = nranks self.rank = rank @@ -183,7 +192,10 @@ cdef class nccl: def cu_device(self): """ Get the device backing the underlying comm - :returns int device id + + Returns + ------- + device id : int """ cdef int *dev = malloc(sizeof(int)) @@ -204,7 +216,10 @@ cdef class nccl: def user_rank(self): """ Get the rank id of the current comm - :return: int rank + + Returns + ------- + rank : int """ cdef int *rank = malloc(sizeof(int)) @@ -228,6 +243,9 @@ cdef class nccl: """ Returns the underlying nccl comm in a size_t (similar to void*). 
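
A short usage note on the id helpers above (the 128-byte length comes from `NCCL_UNIQUE_ID_BYTES`):

.. code-block:: python

    from raft.dask.common.nccl import nccl

    uid = nccl.get_unique_id()   # 128-byte value, safe to serialize
    # ship `uid` to every worker (e.g. via client.run) so that each
    # worker's nccl().init(nranks, uid, rank) joins the same clique
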
This can be safely typecasted from size_t into ncclComm_t* - :return: size_t ncclComm_t instance + + Returns + ------- + ncclComm_t instance pointer : size_t """ return self.comm diff --git a/python/raft/dask/common/ucx.py b/python/raft/dask/common/ucx.py index 948e1433ee..ed45c6ce8f 100644 --- a/python/raft/dask/common/ucx.py +++ b/python/raft/dask/common/ucx.py @@ -24,7 +24,7 @@ class UCX: """ Singleton UCX context to encapsulate all interactions with the UCX-py API and guarantee only a single listener & endpoints are - created by cuML on a single process. + created by RAFT Comms on a single process. """ __instance = None diff --git a/python/raft/dask/common/utils.py b/python/raft/dask/common/utils.py index c92fbf3e47..fdb5acfb5d 100644 --- a/python/raft/dask/common/utils.py +++ b/python/raft/dask/common/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2020, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 6eb0325c8cef11a7e74c5f0ff02e955403fd7fe5 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 13 May 2020 16:37:16 -0400 Subject: [PATCH 023/189] Removing the last of references to cuml --- cpp/include/raft/comms/ucp_helper.hpp | 2 +- python/raft/dask/common/comms_utils.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/comms/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp index 06f687752f..09fc897a79 100644 --- a/cpp/include/raft/comms/ucp_helper.hpp +++ b/cpp/include/raft/comms/ucp_helper.hpp @@ -40,7 +40,7 @@ typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, void *, size_t count, * Standard UCX request object that will be passed * around asynchronously. This object is really * opaque and the comms layer only cares that it - * has been completed. Because cuml comms do not + * has been completed. Because raft comms do not * initialize the ucx application context, it doesn't * own this object and thus it's important not to * modify this struct. diff --git a/python/raft/dask/common/comms_utils.pyx b/python/raft/dask/common/comms_utils.pyx index 90fd97671f..1825d385a7 100644 --- a/python/raft/dask/common/comms_utils.pyx +++ b/python/raft/dask/common/comms_utils.pyx @@ -88,7 +88,7 @@ def perform_test_comms_send_recv(handle, n_trials): def inject_comms_on_handle_coll_only(handle, nccl_inst, size, rank, verbose): """ - Given a handle and initialized nccl comm, creates a cumlCommunicator + Given a handle and initialized nccl comm, creates a comms_t instance and injects it into the handle. Parameters @@ -119,7 +119,7 @@ def inject_comms_on_handle_coll_only(handle, nccl_inst, size, rank, verbose): def inject_comms_on_handle(handle, nccl_inst, ucp_worker, eps, size, rank, verbose): """ - Given a handle and initialized comms, creates a cumlCommunicator instance + Given a handle and initialized comms, creates a comms_t instance and injects it into the handle. Parameters From b65e20df6603a00626e15df74326cf17070130a3 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 13 May 2020 16:38:08 -0400 Subject: [PATCH 024/189] Fixing python style --- python/raft/common/handle.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/raft/common/handle.pyx b/python/raft/common/handle.pyx index 3659cb3669..c19cbc8faa 100644 --- a/python/raft/common/handle.pyx +++ b/python/raft/common/handle.pyx @@ -74,8 +74,8 @@ cdef class Handle: """ Issues a sync on the stream set for this handle. - Once we make `raft.common.cuda.Stream` as a mandatory option for creating - `raft.common.Handle`, this should go away + Once we make `raft.common.cuda.Stream` as a mandatory option + for creating `raft.common.Handle`, this should go away """ cdef handle_t* h_ = self.h cdef _Stream stream = h_.get_stream() From 7a845cb2952ac8d1477f691ac9394f51ea67492d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 13 May 2020 16:38:39 -0400 Subject: [PATCH 025/189] Fixing c++ style --- cpp/include/raft/comms/ucp_helper.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/comms/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp index 09fc897a79..ca9965aed1 100644 --- a/cpp/include/raft/comms/ucp_helper.hpp +++ b/cpp/include/raft/comms/ucp_helper.hpp @@ -227,5 +227,5 @@ class comms_ucp_handler { UCS_PTR_STATUS(recv_result)); } }; -} // end namespace comms -} // end namespace raft +} // end namespace comms +} // end namespace raft From c26930074426e5337c8920da100c47c6035222fe Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 13 May 2020 16:55:40 -0400 Subject: [PATCH 026/189] Updating changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 241d214f49..d72239e926 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ## New Features - Initial RAFT version - PR #3: defining raft::handle_t, device_buffer, host_buffer, allocator classes +- PR #7: Migrating cuml comms -> raft comms_t ## Improvements From d4aa5c5cc58ba8754fa8d61bb95132b572c223cd Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 13 May 2020 17:06:53 -0400 Subject: [PATCH 027/189] Testing non-ucx cluster for pytests --- python/raft/test/test_comms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/raft/test/test_comms.py b/python/raft/test/test_comms.py index c95bb86b65..9fb735c361 100644 --- a/python/raft/test/test_comms.py +++ b/python/raft/test/test_comms.py @@ -103,9 +103,9 @@ def test_allreduce(cluster): @pytest.mark.ucx @pytest.mark.parametrize("n_trials", [1, 5]) -def test_send_recv(n_trials, ucx_cluster): +def test_send_recv(n_trials, cluster): - client = Client(ucx_cluster) + client = Client(cluster) try: From 1ee1363b3740bcba1570812773580f9be575d483 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 18 May 2020 19:54:09 -0400 Subject: [PATCH 028/189] Implementing review feedback --- cpp/include/raft/comms/comms.hpp | 74 ++++----- cpp/include/raft/comms/comms_helper.hpp | 25 +++- cpp/include/raft/comms/std_comms.hpp | 190 ++++++++++++------------ cpp/include/raft/comms/ucp_helper.hpp | 4 +- 4 files changed, 154 insertions(+), 139 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 3a61207214..4d97596170 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -22,13 +22,13 @@ namespace raft { namespace comms { typedef unsigned int request_t; -enum datatype_t { CHAR, UINT8, INT, UINT, INT64, UINT64, FLOAT, DOUBLE }; -enum op_t { SUM, PROD, MIN, MAX }; +enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; +enum class op_t { SUM, PROD, MIN, MAX }; /** * The resulting status of distributed stream synchronization */ -enum status_t { +enum class status_t { commStatusSuccess, // Synchronization successful commStatusError, // An error occured querying sync status commStatusAbort // A failure occurred in sync, queued operations aborted @@ -47,105 +47,105 @@ class comms_iface { virtual status_t syncStream(cudaStream_t stream) const = 0; - virtual void isend(const void* buf, int size, int dest, int tag, + virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; - virtual void irecv(void* buf, int size, int source, int tag, + virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; virtual void waitall(int count, request_t array_of_requests[]) const = 0; - virtual void allreduce(const void* sendbuff, void* recvbuff, int count, + virtual void allreduce(const void* sendbuff, void* recvbuff, size_t count, datatype_t datatype, op_t op, cudaStream_t stream) const = 0; - virtual void bcast(void* buff, int count, datatype_t datatype, int root, + virtual void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; - virtual void reduce(const void* sendbuff, void* recvbuff, int count, + virtual void reduce(const void* sendbuff, void* recvbuff, size_t count, datatype_t datatype, op_t op, int root, cudaStream_t stream) const = 0; - virtual void allgather(const void* sendbuff, void* recvbuff, int sendcount, + virtual void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, datatype_t datatype, cudaStream_t stream) const = 0; virtual void allgatherv(const void* sendbuf, void* recvbuf, - const int recvcounts[], const int displs[], + const size_t recvcounts[], const int displs[], datatype_t datatype, cudaStream_t stream) const = 0; virtual void reducescatter(const void* sendbuff, void* recvbuff, - int recvcount, datatype_t datatype, op_t op, + size_t recvcount, datatype_t datatype, op_t op, cudaStream_t stream) const = 0; }; -class comms_t : public comms_iface { +class comms_t: comms_iface { public: - comms_t(std::unique_ptr impl) : _impl(impl.release()) { - ASSERT(nullptr != _impl.get(), "ERROR: Invalid comms_iface used!"); + comms_t(std::unique_ptr impl) : impl_(impl.release()) { + ASSERT(nullptr != impl.get(), "ERROR: Invalid comms_iface used!"); } - int getSize() const { return _impl->getSize(); } + int getSize() const { return impl_->getSize(); } - int getRank() const { return _impl->getRank(); } + int getRank() const { return impl_->getRank(); } std::unique_ptr commSplit(int color, int key) const { - return _impl->commSplit(color, key); + return 
impl_->commSplit(color, key); } - void barrier() const { _impl->barrier(); } + void barrier() const { impl_->barrier(); } status_t syncStream(cudaStream_t stream) const { - return _impl->syncStream(stream); + return impl_->syncStream(stream); } - void isend(const void* buf, int size, int dest, int tag, + void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const { - _impl->isend(buf, size, dest, tag, request); + impl_->isend(buf, size, dest, tag, request); } - void irecv(void* buf, int size, int source, int tag, + void irecv(void* buf, size_t size, int source, int tag, request_t* request) const { - _impl->irecv(buf, size, source, tag, request); + impl_->irecv(buf, size, source, tag, request); } void waitall(int count, request_t array_of_requests[]) const { - _impl->waitall(count, array_of_requests); + impl_->waitall(count, array_of_requests); } - void allreduce(const void* sendbuff, void* recvbuff, int count, + void allreduce(const void* sendbuff, void* recvbuff, size_t count, datatype_t datatype, op_t op, cudaStream_t stream) const { - _impl->allreduce(sendbuff, recvbuff, count, datatype, op, stream); + impl_->allreduce(sendbuff, recvbuff, count, datatype, op, stream); } - void bcast(void* buff, int count, datatype_t datatype, int root, + void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const { - _impl->bcast(buff, count, datatype, root, stream); + impl_->bcast(buff, count, datatype, root, stream); } - void reduce(const void* sendbuff, void* recvbuff, int count, + void reduce(const void* sendbuff, void* recvbuff, size_t count, datatype_t datatype, op_t op, int root, cudaStream_t stream) const { - _impl->reduce(sendbuff, recvbuff, count, datatype, op, root, stream); + impl_->reduce(sendbuff, recvbuff, count, datatype, op, root, stream); } - void allgather(const void* sendbuff, void* recvbuff, int sendcount, + void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, datatype_t datatype, cudaStream_t stream) const { - _impl->allgather(sendbuff, recvbuff, sendcount, datatype, stream); + impl_->allgather(sendbuff, recvbuff, sendcount, datatype, stream); } - void allgatherv(const void* sendbuf, void* recvbuf, const int recvcounts[], + void allgatherv(const void* sendbuf, void* recvbuf, const size_t recvcounts[], const int displs[], datatype_t datatype, cudaStream_t stream) const { - _impl->allgatherv(sendbuf, recvbuf, recvcounts, displs, datatype, stream); + impl_->allgatherv(sendbuf, recvbuf, recvcounts, displs, datatype, stream); } - void reducescatter(const void* sendbuff, void* recvbuff, int recvcount, + void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount, datatype_t datatype, op_t op, cudaStream_t stream) const { - _impl->reducescatter(sendbuff, recvbuff, recvcount, datatype, op, stream); + impl_->reducescatter(sendbuff, recvbuff, recvcount, datatype, op, stream); } private: - std::unique_ptr _impl; + std::unique_ptr impl_; }; comms_iface::~comms_iface() {} diff --git a/cpp/include/raft/comms/comms_helper.hpp b/cpp/include/raft/comms/comms_helper.hpp index 2e71009eaf..1f0e772a5e 100644 --- a/cpp/include/raft/comms/comms_helper.hpp +++ b/cpp/include/raft/comms/comms_helper.hpp @@ -24,35 +24,46 @@ namespace raft { namespace comms { +/** + * Function to construct comms_t and inject it on a handle_t. This + * is used for convenience in the Python layer. 
+ */ void build_comms_nccl_only(handle_t *handle, ncclComm_t comm, int size, int rank) { - auto *raft_comm = new raft::comms::std_comms(comm, size, rank); + + auto d_alloc = handle->get_device_allocator(); + auto *raft_comm = new raft::comms::std_comms(comm, size, rank, d_alloc); auto communicator = std::make_shared(std::unique_ptr(raft_comm)); handle->set_comms(communicator); } +/** + * Function to construct comms_t and inject it on a handle_t. This + * is used for convenience in the Python layer. + */ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t comm, void *ucp_worker, void *eps, int size, int rank) { - std::shared_ptr eps_sp = + auto eps_sp = std::make_shared(new ucp_ep_h[size]); - size_t *size_t_ep_arr = (size_t *)eps; + auto size_t_ep_arr = reinterpret_cast(eps); for (int i = 0; i < size; i++) { size_t ptr = size_t_ep_arr[i]; - ucp_ep_h *ucp_ep_v = (ucp_ep_h *)*eps_sp; + auto ucp_ep_v = reinterpret_cast(*eps_sp); if (ptr != 0) { - ucp_ep_h eps_ptr = (ucp_ep_h)size_t_ep_arr[i]; + auto eps_ptr = reinterpret_cast(size_t_ep_arr[i]); ucp_ep_v[i] = eps_ptr; } else { ucp_ep_v[i] = nullptr; } } + auto d_alloc = handle->get_device_allocator(); auto *raft_comm = new raft::comms::std_comms(comm, (ucp_worker_h)ucp_worker, - eps_sp, size, rank); + eps_sp, size, rank, d_alloc); auto communicator = std::make_shared(std::unique_ptr(raft_comm)); handle->set_comms(communicator); @@ -69,7 +80,7 @@ bool test_collective_allreduce(const handle_t &handle) { temp_d.resize(1, stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); - communicator.allreduce(temp_d.data(), temp_d.data(), 1, datatype_t::INT, SUM, + communicator.allreduce(temp_d.data(), temp_d.data(), 1, datatype_t::INT32, op_t::SUM, stream); int temp_h = 0; CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 2af64a262b..9b6dd91237 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -66,55 +66,55 @@ namespace comms { size_t getDatatypeSize(const datatype_t datatype) { switch (datatype) { - case CHAR: + case datatype_t::CHAR: return sizeof(char); - case UINT8: + case datatype_t::UINT8: return sizeof(uint8_t); - case INT: + case datatype_t::INT32: return sizeof(int); - case UINT: + case datatype_t::UINT32: return sizeof(unsigned int); - case INT64: + case datatype_t::INT64: return sizeof(int64_t); - case UINT64: + case datatype_t::UINT64: return sizeof(uint64_t); - case FLOAT: + case datatype_t::FLOAT32: return sizeof(float); - case DOUBLE: + case datatype_t::FLOAT64: return sizeof(double); } } ncclDataType_t getNCCLDatatype(const datatype_t datatype) { switch (datatype) { - case CHAR: + case datatype_t::CHAR: return ncclChar; - case UINT8: + case datatype_t::UINT8: return ncclUint8; - case INT: + case datatype_t::INT32: return ncclInt; - case UINT: + case datatype_t::UINT32: return ncclUint32; - case INT64: + case datatype_t::INT64: return ncclInt64; - case UINT64: + case datatype_t::UINT64: return ncclUint64; - case FLOAT: + case datatype_t::FLOAT32: return ncclFloat; - case DOUBLE: + case datatype_t::FLOAT64: return ncclDouble; } } ncclRedOp_t getNCCLOp(const op_t op) { switch (op) { - case SUM: + case op_t::SUM: return ncclSum; - case PROD: + case op_t::PROD: return ncclProd; - case MIN: + case op_t::MIN: return ncclMin; - case MAX: + case op_t::MAX: return ncclMax; } } @@ -131,16 +131,17 @@ class std_comms : public comms_iface { * @param 
size size of the cluster * @param rank rank of the current worker */ - std_comms(ncclComm_t comm, ucp_worker_h ucp_worker, - std::shared_ptr eps, int size, int rank) - : _nccl_comm(comm), - _ucp_worker(ucp_worker), - _ucp_eps(eps), - _size(size), - _rank(rank), - _next_request_id(0) { + std_comms(ncclComm_t nccl_comm, ucp_worker_h ucp_worker, + std::shared_ptr eps, int num_ranks, int rank, + const std::shared_ptr device_allocator) + : nccl_comm_(nccl_comm), + ucp_worker_(ucp_worker), + ucp_eps_(eps), + num_ranks_(num_ranks), + rank_(rank), + device_allocator_(device_allocator), + next_request_id_(0) { initialize(); - p2p_enabled = true; }; /** @@ -149,28 +150,30 @@ class std_comms : public comms_iface { * @param size size of the cluster * @param rank rank of the current worker */ - std_comms(ncclComm_t comm, int size, int rank) - : _nccl_comm(comm), _size(size), _rank(rank) { + std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, + const std::shared_ptr device_allocator) + : nccl_comm_(nccl_comm), num_ranks_(num_ranks), rank_(rank), + device_allocator_(device_allocator){ initialize(); }; virtual ~std_comms() { - CUDA_CHECK_NO_THROW(cudaStreamDestroy(_stream)); + CUDA_CHECK_NO_THROW(cudaStreamDestroy(stream_)); - CUDA_CHECK_NO_THROW(cudaFree(_sendbuff)); - CUDA_CHECK_NO_THROW(cudaFree(_recvbuff)); + device_allocator_->deallocate(sendbuff_, sizeof(int), stream_); + device_allocator_->deallocate(recvbuff_, sizeof(int), stream_); } void initialize() { - CUDA_CHECK(cudaStreamCreate(&_stream)); + CUDA_CHECK(cudaStreamCreate(&stream_)); - CUDA_CHECK(cudaMalloc(&_sendbuff, sizeof(int))); - CUDA_CHECK(cudaMalloc(&_recvbuff, sizeof(int))); + sendbuff_ = reinterpret_cast(device_allocator_->allocate(sizeof(int), stream_)); + recvbuff_ = reinterpret_cast(device_allocator_->allocate(sizeof(int), stream_)); } - int getSize() const { return _size; } + int getSize() const { return num_ranks_; } - int getRank() const { return _rank; } + int getRank() const { return rank_; } std::unique_ptr commSplit(int color, int key) const { // Not supported by NCCL @@ -180,64 +183,64 @@ class std_comms : public comms_iface { } void barrier() const { - CUDA_CHECK(cudaMemsetAsync(_sendbuff, 1, sizeof(int), _stream)); - CUDA_CHECK(cudaMemsetAsync(_recvbuff, 1, sizeof(int), _stream)); + CUDA_CHECK(cudaMemsetAsync(sendbuff_, 1, sizeof(int), stream_)); + CUDA_CHECK(cudaMemsetAsync(recvbuff_, 1, sizeof(int), stream_)); - allreduce(_sendbuff, _recvbuff, 1, INT, SUM, _stream); + allreduce(sendbuff_, recvbuff_, 1, datatype_t::INT32, op_t::SUM, stream_); - ASSERT(syncStream(_stream) == status_t::commStatusSuccess, - "ERROR: syncStream failed. This can be caused by a failed rank."); + ASSERT(syncStream(stream_) == status_t::commStatusSuccess, + "ERROR: syncStream failed. 
This can be caused by a failed rank_."); } void get_request_id(request_t *req) const { request_t req_id; - if (this->_free_requests.empty()) - req_id = this->_next_request_id++; + if (this->free_requests_.empty()) + req_id = this->next_request_id_++; else { - auto it = this->_free_requests.begin(); + auto it = this->free_requests_.begin(); req_id = *it; - this->_free_requests.erase(it); + this->free_requests_.erase(it); } *req = req_id; } - void isend(const void *buf, int size, int dest, int tag, + void isend(const void *buf, size_t size, int dest, int tag, request_t *request) const { - ASSERT(_ucp_worker != nullptr, + ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); get_request_id(request); - ucp_ep_h ep_ptr = (*_ucp_eps)[dest]; + ucp_ep_h ep_ptr = (*ucp_eps_)[dest]; ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); - this->_ucp_handler.ucp_isend(ucp_req, ep_ptr, buf, size, tag, + this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, default_tag_mask, getRank()); - _requests_in_flight.insert(std::make_pair(*request, ucp_req)); + requests_in_flight_.insert(std::make_pair(*request, ucp_req)); } - void irecv(void *buf, int size, int source, int tag, + void irecv(void *buf, size_t size, int source, int tag, request_t *request) const { - ASSERT(_ucp_worker != nullptr, + ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); get_request_id(request); - ucp_ep_h ep_ptr = (*_ucp_eps)[source]; + ucp_ep_h ep_ptr = (*ucp_eps_)[source]; ucp_tag_t tag_mask = default_tag_mask; ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); - _ucp_handler.ucp_irecv(ucp_req, _ucp_worker, ep_ptr, buf, size, tag, + ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, tag_mask, source); - _requests_in_flight.insert(std::make_pair(*request, ucp_req)); + requests_in_flight_.insert(std::make_pair(*request, ucp_req)); } void waitall(int count, request_t array_of_requests[]) const { - ASSERT(_ucp_worker != nullptr, + ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); std::vector requests; @@ -246,12 +249,12 @@ class std_comms : public comms_iface { time_t start = time(NULL); for (int i = 0; i < count; ++i) { - auto req_it = _requests_in_flight.find(array_of_requests[i]); - ASSERT(_requests_in_flight.end() != req_it, + auto req_it = requests_in_flight_.find(array_of_requests[i]); + ASSERT(requests_in_flight_.end() != req_it, "ERROR: waitall on invalid request: %d", array_of_requests[i]); requests.push_back(req_it->second); - _free_requests.insert(req_it->first); - _requests_in_flight.erase(req_it); + free_requests_.insert(req_it->first); + requests_in_flight_.erase(req_it); } while (requests.size() > 0) { @@ -266,7 +269,7 @@ class std_comms : public comms_iface { bool restart = false; // resets the timeout when any progress was made // Causes UCP to progress through the send/recv message queue - while (_ucp_handler.ucp_progress(_ucp_worker) != 0) { + while (ucp_handler_.ucp_progress(ucp_worker_) != 0) { restart = true; } @@ -291,7 +294,7 @@ class std_comms : public comms_iface { restart = true; // perform cleanup - _ucp_handler.free_ucp_request(req); + ucp_handler_.free_ucp_request(req); // remove from pending requests it = requests.erase(it); @@ -306,50 +309,50 @@ class std_comms : public comms_iface { } } - void allreduce(const void *sendbuff, void *recvbuff, int count, + void allreduce(const void *sendbuff, void *recvbuff, size_t count, datatype_t datatype, op_t op, 
cudaStream_t stream) const { NCCL_CHECK(ncclAllReduce(sendbuff, recvbuff, count, getNCCLDatatype(datatype), getNCCLOp(op), - _nccl_comm, stream)); + nccl_comm_, stream)); } - void bcast(void *buff, int count, datatype_t datatype, int root, + void bcast(void *buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const { NCCL_CHECK(ncclBroadcast(buff, buff, count, getNCCLDatatype(datatype), root, - _nccl_comm, stream)); + nccl_comm_, stream)); } - void reduce(const void *sendbuff, void *recvbuff, int count, + void reduce(const void *sendbuff, void *recvbuff, size_t count, datatype_t datatype, op_t op, int root, cudaStream_t stream) const { NCCL_CHECK(ncclReduce(sendbuff, recvbuff, count, getNCCLDatatype(datatype), - getNCCLOp(op), root, _nccl_comm, stream)); + getNCCLOp(op), root, nccl_comm_, stream)); } - void allgather(const void *sendbuff, void *recvbuff, int sendcount, + void allgather(const void *sendbuff, void *recvbuff, size_t sendcount, datatype_t datatype, cudaStream_t stream) const { NCCL_CHECK(ncclAllGather(sendbuff, recvbuff, sendcount, - getNCCLDatatype(datatype), _nccl_comm, stream)); + getNCCLDatatype(datatype), nccl_comm_, stream)); } - void allgatherv(const void *sendbuf, void *recvbuf, const int recvcounts[], + void allgatherv(const void *sendbuf, void *recvbuf, const size_t recvcounts[], const int displs[], datatype_t datatype, cudaStream_t stream) const { //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf //Listing 1 on page 4. - for (int root = 0; root < _size; ++root) { + for (int root = 0; root < num_ranks_; ++root) { size_t dtype_size = getDatatypeSize(datatype); NCCL_CHECK(ncclBroadcast( sendbuf, static_cast(recvbuf) + displs[root] * dtype_size, - recvcounts[root], getNCCLDatatype(datatype), root, _nccl_comm, stream)); + recvcounts[root], getNCCLDatatype(datatype), root, nccl_comm_, stream)); } } - void reducescatter(const void *sendbuff, void *recvbuff, int recvcount, + void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount, datatype_t datatype, op_t op, cudaStream_t stream) const { NCCL_CHECK(ncclReduceScatter(sendbuff, recvbuff, recvcount, getNCCLDatatype(datatype), getNCCLOp(op), - _nccl_comm, stream)); + nccl_comm_, stream)); } status_t syncStream(cudaStream_t stream) const { @@ -360,11 +363,11 @@ class std_comms : public comms_iface { if (cudaErr == cudaSuccess) return status_t::commStatusSuccess; if (cudaErr != cudaErrorNotReady) { // An error occurred querying the status of the stream return status_t::commStatusError; } - ncclErr = ncclCommGetAsyncError(_nccl_comm, &ncclAsyncErr); + ncclErr = ncclCommGetAsyncError(nccl_comm_, &ncclAsyncErr); if (ncclErr != ncclSuccess) { // An error occurred retrieving the asynchronous error return status_t::commStatusError; @@ -373,7 +376,7 @@ class std_comms : public comms_iface { if (ncclAsyncErr != ncclSuccess) { // An asynchronous error happened. Stop the operation and destroy // the communicator - ncclErr = ncclCommAbort(_nccl_comm); + ncclErr = ncclCommAbort(nccl_comm_); if (ncclErr != ncclSuccess) // Caller may abort with an exception or try to re-create a new communicator.
return status_t::commStatusAbort; @@ -385,22 +388,23 @@ class std_comms : public comms_iface { } private: - ncclComm_t _nccl_comm; - cudaStream_t _stream; + ncclComm_t nccl_comm_; + cudaStream_t stream_; - int *_sendbuff, *_recvbuff; + int *sendbuff_, *recvbuff_; - int _size; - int _rank; + int num_ranks_; + int rank_; - bool p2p_enabled = false; - comms_ucp_handler _ucp_handler; - ucp_worker_h _ucp_worker; - std::shared_ptr _ucp_eps; - mutable request_t _next_request_id; + comms_ucp_handler ucp_handler_; + ucp_worker_h ucp_worker_; + std::shared_ptr ucp_eps_; + mutable request_t next_request_id_; mutable std::unordered_map - _requests_in_flight; - mutable std::unordered_set _free_requests; + requests_in_flight_; + mutable std::unordered_set free_requests_; + + std::shared_ptr device_allocator_; }; } // end namespace comms } // end namespace raft diff --git a/cpp/include/raft/comms/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp index ca9965aed1..47f3f4d61e 100644 --- a/cpp/include/raft/comms/ucp_helper.hpp +++ b/cpp/include/raft/comms/ucp_helper.hpp @@ -172,7 +172,7 @@ class comms_ucp_handler { /** * @brief Asynchronously send data to the given endpoint using the given tag */ - void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf, int size, + void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf, size_t size, int tag, ucp_tag_t tag_mask, int rank) const { ucp_tag_t ucp_tag = build_message_tag(rank, tag); @@ -207,7 +207,7 @@ class comms_ucp_handler { * @brief Asynchronously receive data from given endpoint with the given tag. */ void ucp_irecv(ucp_request *req, ucp_worker_h worker, ucp_ep_h ep_ptr, - void *buf, int size, int tag, ucp_tag_t tag_mask, + void *buf, size_t size, int tag, ucp_tag_t tag_mask, int sender_rank) const { ucp_tag_t ucp_tag = build_message_tag(sender_rank, tag); From c165a36bff343c1c9cd035e7663d5abb30eb145c Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 18 May 2020 20:08:45 -0400 Subject: [PATCH 029/189] More review feedback --- cpp/include/raft/comms/comms.hpp | 2 +- cpp/include/raft/comms/std_comms.hpp | 23 ++++++++++++----------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 4d97596170..54d11f38ba 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -78,7 +78,7 @@ class comms_iface { cudaStream_t stream) const = 0; }; -class comms_t: comms_iface { +class comms_t { public: comms_t(std::unique_ptr impl) : impl_(impl.release()) { ASSERT(nullptr != impl.get(), "ERROR: Invalid comms_iface used!"); diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 9b6dd91237..7a49816242 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -64,7 +64,8 @@ namespace raft { namespace comms { -size_t getDatatypeSize(const datatype_t datatype) { + +constexpr size_t get_datatype_size(const datatype_t datatype) { switch (datatype) { case datatype_t::CHAR: return sizeof(char); @@ -85,7 +86,7 @@ size_t getDatatypeSize(const datatype_t datatype) { } } -ncclDataType_t getNCCLDatatype(const datatype_t datatype) { +ncclDataType_t get_nccl_datatype(const datatype_t datatype) { switch (datatype) { case datatype_t::CHAR: return ncclChar; @@ -106,7 +107,7 @@ ncclDataType_t getNCCLDatatype(const datatype_t datatype) { } } -ncclRedOp_t getNCCLOp(const op_t op) { +ncclRedOp_t get_nccl_op(const op_t op) { switch (op) { case op_t::SUM: return ncclSum; @@ -312,27 +313,27 @@ class std_comms : public comms_iface { void allreduce(const void *sendbuff, void *recvbuff, size_t count, datatype_t datatype, op_t op, cudaStream_t stream) const { NCCL_CHECK(ncclAllReduce(sendbuff, recvbuff, count, - getNCCLDatatype(datatype), getNCCLOp(op), + get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); } void bcast(void *buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const { - NCCL_CHECK(ncclBroadcast(buff, buff, count, getNCCLDatatype(datatype), root, + NCCL_CHECK(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } void reduce(const void *sendbuff, void *recvbuff, size_t count, datatype_t datatype, op_t op, int root, cudaStream_t stream) const { - NCCL_CHECK(ncclReduce(sendbuff, recvbuff, count, getNCCLDatatype(datatype), - getNCCLOp(op), root, nccl_comm_, stream)); + NCCL_CHECK(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype), + get_nccl_op(op), root, nccl_comm_, stream)); } void allgather(const void *sendbuff, void *recvbuff, size_t sendcount, datatype_t datatype, cudaStream_t stream) const { NCCL_CHECK(ncclAllGather(sendbuff, recvbuff, sendcount, - getNCCLDatatype(datatype), nccl_comm_, stream)); + get_nccl_datatype(datatype), nccl_comm_, stream)); } void allgatherv(const void *sendbuf, void *recvbuf, const size_t recvcounts[], @@ -341,17 +342,17 @@ class std_comms : public comms_iface { //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf //Listing 1 on page 4. 
for (int root = 0; root < num_ranks_; ++root) { - size_t dtype_size = getDatatypeSize(datatype); + size_t dtype_size = get_datatype_size(datatype); NCCL_CHECK(ncclBroadcast( sendbuf, static_cast(recvbuf) + displs[root] * dtype_size, - recvcounts[root], getNCCLDatatype(datatype), root, nccl_comm_, stream)); + recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_, stream)); } } void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount, datatype_t datatype, op_t op, cudaStream_t stream) const { NCCL_CHECK(ncclReduceScatter(sendbuff, recvbuff, recvcount, - getNCCLDatatype(datatype), getNCCLOp(op), + get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); } From 12f3db7b5a75ef9bff3e9f8c19f864128ef7f59d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 18 May 2020 20:14:43 -0400 Subject: [PATCH 030/189] Fixing style --- cpp/include/raft/comms/comms.hpp | 13 ++++++++++-- cpp/include/raft/comms/comms_helper.hpp | 10 ++++----- cpp/include/raft/comms/std_comms.hpp | 27 +++++++++++++++---------- cpp/include/raft/comms/ucp_helper.hpp | 4 ++-- 4 files changed, 33 insertions(+), 21 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 54d11f38ba..11ad32cfcc 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -22,7 +22,16 @@ namespace raft { namespace comms { typedef unsigned int request_t; -enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; +enum class datatype_t { + CHAR, + UINT8, + INT32, + UINT32, + INT64, + UINT64, + FLOAT32, + FLOAT64 +}; enum class op_t { SUM, PROD, MIN, MAX }; /** @@ -74,7 +83,7 @@ class comms_iface { datatype_t datatype, cudaStream_t stream) const = 0; virtual void reducescatter(const void* sendbuff, void* recvbuff, - size_t recvcount, datatype_t datatype, op_t op, + size_t recvcount, datatype_t datatype, op_t op, cudaStream_t stream) const = 0; }; diff --git a/cpp/include/raft/comms/comms_helper.hpp b/cpp/include/raft/comms/comms_helper.hpp index 1f0e772a5e..2238226383 100644 --- a/cpp/include/raft/comms/comms_helper.hpp +++ b/cpp/include/raft/comms/comms_helper.hpp @@ -30,7 +30,6 @@ namespace comms { */ void build_comms_nccl_only(handle_t *handle, ncclComm_t comm, int size, int rank) { - auto d_alloc = handle->get_device_allocator(); auto *raft_comm = new raft::comms::std_comms(comm, size, rank, d_alloc); auto communicator = @@ -44,10 +43,9 @@ void build_comms_nccl_only(handle_t *handle, ncclComm_t comm, int size, */ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t comm, void *ucp_worker, void *eps, int size, int rank) { - auto eps_sp = - std::make_shared(new ucp_ep_h[size]); + auto eps_sp = std::make_shared(new ucp_ep_h[size]); - auto size_t_ep_arr = reinterpret_cast(eps); + auto size_t_ep_arr = reinterpret_cast(eps); for (int i = 0; i < size; i++) { size_t ptr = size_t_ep_arr[i]; @@ -80,8 +78,8 @@ bool test_collective_allreduce(const handle_t &handle) { temp_d.resize(1, stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); - communicator.allreduce(temp_d.data(), temp_d.data(), 1, datatype_t::INT32, op_t::SUM, - stream); + communicator.allreduce(temp_d.data(), temp_d.data(), 1, datatype_t::INT32, + op_t::SUM, stream); int temp_h = 0; CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 7a49816242..e806740207 100644 --- 
a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -64,7 +64,6 @@ namespace raft { namespace comms { - constexpr size_t get_datatype_size(const datatype_t datatype) { switch (datatype) { case datatype_t::CHAR: @@ -152,9 +151,11 @@ class std_comms : public comms_iface { * @param rank rank of the current worker */ std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, - const std::shared_ptr device_allocator) - : nccl_comm_(nccl_comm), num_ranks_(num_ranks), rank_(rank), - device_allocator_(device_allocator){ + const std::shared_ptr device_allocator) + : nccl_comm_(nccl_comm), + num_ranks_(num_ranks), + rank_(rank), + device_allocator_(device_allocator) { initialize(); }; @@ -168,8 +169,10 @@ class std_comms : public comms_iface { void initialize() { CUDA_CHECK(cudaStreamCreate(&stream_)); - sendbuff_ = reinterpret_cast(device_allocator_->allocate(sizeof(int), stream_)); - recvbuff_ = reinterpret_cast(device_allocator_->allocate(sizeof(int), stream_)); + sendbuff_ = reinterpret_cast( + device_allocator_->allocate(sizeof(int), stream_)); + recvbuff_ = reinterpret_cast( + device_allocator_->allocate(sizeof(int), stream_)); } int getSize() const { return num_ranks_; } @@ -319,15 +322,16 @@ class std_comms : public comms_iface { void bcast(void *buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const { - NCCL_CHECK(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_CHECK(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), + root, nccl_comm_, stream)); } void reduce(const void *sendbuff, void *recvbuff, size_t count, datatype_t datatype, op_t op, int root, cudaStream_t stream) const { - NCCL_CHECK(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype), - get_nccl_op(op), root, nccl_comm_, stream)); + NCCL_CHECK(ncclReduce(sendbuff, recvbuff, count, + get_nccl_datatype(datatype), get_nccl_op(op), root, + nccl_comm_, stream)); } void allgather(const void *sendbuff, void *recvbuff, size_t sendcount, @@ -345,7 +349,8 @@ class std_comms : public comms_iface { size_t dtype_size = get_datatype_size(datatype); NCCL_CHECK(ncclBroadcast( sendbuf, static_cast(recvbuf) + displs[root] * dtype_size, - recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_, stream)); + recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_, + stream)); } } diff --git a/cpp/include/raft/comms/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp index 47f3f4d61e..0cc1df1c36 100644 --- a/cpp/include/raft/comms/ucp_helper.hpp +++ b/cpp/include/raft/comms/ucp_helper.hpp @@ -172,8 +172,8 @@ class comms_ucp_handler { /** * @brief Asynchronously send data to the given endpoint using the given tag */ - void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf, size_t size, - int tag, ucp_tag_t tag_mask, int rank) const { + void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf, + size_t size, int tag, ucp_tag_t tag_mask, int rank) const { ucp_tag_t ucp_tag = build_message_tag(rank, tag); ucs_status_ptr_t send_result = (*(send_func))( From aae4625361664d0ae7ff2a554d72ffafa3825d70 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 19 May 2020 10:23:24 -0500 Subject: [PATCH 031/189] FIX Use relative imports --- python/raft/common/handle.pxd | 6 +++--- python/raft/common/handle.pyx | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/raft/common/handle.pxd b/python/raft/common/handle.pxd index dfdcdb929b..3b60e6708e 100644 --- 
a/python/raft/common/handle.pxd +++ b/python/raft/common/handle.pxd @@ -21,7 +21,7 @@ from libcpp.memory cimport shared_ptr -cimport raft.common.cuda +from .cuda cimport _Stream cdef extern from "raft/mr/device/allocator.hpp" \ @@ -33,7 +33,7 @@ cdef extern from "raft/handle.hpp" namespace "raft" nogil: cdef cppclass handle_t: handle_t() except + handle_t(int ns) except + - void set_stream(raft.common.cuda._Stream s) except + + void set_stream(_Stream s) except + void set_device_allocator(shared_ptr[allocator] a) except + - raft.common.cuda._Stream get_stream() except + + _Stream get_stream() except + int get_num_internal_streams() except + diff --git a/python/raft/common/handle.pyx b/python/raft/common/handle.pyx index c19cbc8faa..6658a825ce 100644 --- a/python/raft/common/handle.pyx +++ b/python/raft/common/handle.pyx @@ -19,9 +19,11 @@ # cython: embedsignature = True # cython: language_level = 3 -import raft +# import raft from libcpp.memory cimport shared_ptr -from raft.common.cuda cimport _Stream, _Error, cudaStreamSynchronize + +from .cuda cimport _Stream, _Error, cudaStreamSynchronize +from .cuda import CudaRuntimeError cdef class Handle: """ @@ -81,7 +83,7 @@ cdef class Handle: cdef _Stream stream = h_.get_stream() cdef _Error e = cudaStreamSynchronize(stream) if e != 0: - raise raft.cuda.CudaRuntimeError("Stream sync") + raise CudaRuntimeError("Stream sync") def getHandle(self): return self.h From bc3932161c6e58bb2054f46b7eacbf682501d246 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 19 May 2020 11:33:36 -0400 Subject: [PATCH 032/189] Adding compile-time templates for comms_t to make interaction more straightforward --- cpp/include/raft/comms/comms.hpp | 149 ++++++++++++++++++++---- cpp/include/raft/comms/comms_helper.hpp | 17 +-- cpp/include/raft/comms/std_comms.hpp | 23 ++-- python/setup.py | 2 +- 4 files changed, 150 insertions(+), 41 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 11ad32cfcc..f76c456647 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -43,6 +43,7 @@ enum class status_t { commStatusAbort // A failure occurred in sync, queued operations aborted }; + class comms_iface { public: virtual ~comms_iface(); @@ -57,30 +58,30 @@ class comms_iface { virtual status_t syncStream(cudaStream_t stream) const = 0; virtual void isend(const void* buf, size_t size, int dest, int tag, - request_t* request) const = 0; + request_t* request) const=0; virtual void irecv(void* buf, size_t size, int source, int tag, - request_t* request) const = 0; + request_t* request) const = 0; virtual void waitall(int count, request_t array_of_requests[]) const = 0; virtual void allreduce(const void* sendbuff, void* recvbuff, size_t count, datatype_t datatype, op_t op, - cudaStream_t stream) const = 0; + cudaStream_t stream) const = 0; virtual void bcast(void* buff, size_t count, datatype_t datatype, int root, - cudaStream_t stream) const = 0; + cudaStream_t stream) const = 0; virtual void reduce(const void* sendbuff, void* recvbuff, size_t count, datatype_t datatype, op_t op, int root, - cudaStream_t stream) const = 0; + cudaStream_t stream) const = 0; virtual void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const = 0; + datatype_t datatype, cudaStream_t stream) const = 0; virtual void allgatherv(const void* sendbuf, void* recvbuf, const size_t recvcounts[], const int displs[], - datatype_t datatype, cudaStream_t stream) const = 0; + 
datatype_t datatype, cudaStream_t stream) const = 0; virtual void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount, datatype_t datatype, op_t op, @@ -90,9 +91,12 @@ class comms_iface { class comms_t { public: comms_t(std::unique_ptr impl) : impl_(impl.release()) { - ASSERT(nullptr != impl.get(), "ERROR: Invalid comms_iface used!"); + ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); } + template + datatype_t get_type() const; + int getSize() const { return impl_->getSize(); } int getRank() const { return impl_->getRank(); } @@ -108,49 +112,111 @@ class comms_t { } void isend(const void* buf, size_t size, int dest, int tag, + request_t* request) const { + impl_->isend(buf, size, dest, tag, request); + } + + template + void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const { - impl_->isend(buf, size, dest, tag, request); + isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); } void irecv(void* buf, size_t size, int source, int tag, + request_t* request) const { + impl_->irecv(buf, size, source, tag, request); + } + + template + void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const { - impl_->irecv(buf, size, source, tag, request); + irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); } void waitall(int count, request_t array_of_requests[]) const { - impl_->waitall(count, array_of_requests); + impl_->waitall(count, array_of_requests); } void allreduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, cudaStream_t stream) const { - impl_->allreduce(sendbuff, recvbuff, count, datatype, op, stream); + datatype_t datatype, op_t op, cudaStream_t stream) const { + + impl_->allreduce(sendbuff, recvbuff, count, datatype, op, stream); + } + + template + void allreduce(const value_t* sendbuff, value_t* recvbuff, size_t count, + op_t op, cudaStream_t stream) const { + allreduce(static_cast(sendbuff), static_cast(recvbuff) + , sizeof(value_t)*count, get_type(), op, stream); } void bcast(void* buff, size_t count, datatype_t datatype, int root, + cudaStream_t stream) const { + + impl_->bcast(buff, count, datatype , root, stream); + } + + template + void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const { - impl_->bcast(buff, count, datatype, root, stream); + bcast(static_cast(buff), count, get_type() , root, stream); } void reduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, int root, + datatype_t datatype, op_t op, int root, + cudaStream_t stream) const { + + impl_->reduce(sendbuff, recvbuff, count, datatype, op, root, stream); + } + + template + void reduce(const value_t* sendbuff, value_t* recvbuff, size_t count, + op_t op, int root, cudaStream_t stream) const { - impl_->reduce(sendbuff, recvbuff, count, datatype, op, root, stream); + reduce(static_cast(sendbuff), static_cast(recvbuff), + count, get_type(), op, root, stream); } void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const { - impl_->allgather(sendbuff, recvbuff, sendcount, datatype, stream); + datatype_t datatype, cudaStream_t stream) const { + + impl_->allgather(sendbuff, recvbuff, + sendcount, datatype, stream); + } + + template + void allgather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount, + cudaStream_t stream) const { + allgather(static_cast(sendbuff), static_cast(recvbuff), + sendcount, get_type(), stream); } void 
allgatherv(const void* sendbuf, void* recvbuf, const size_t recvcounts[], - const int displs[], datatype_t datatype, + const int displs[], datatype_t datatype, + cudaStream_t stream) const { + + impl_->allgatherv(sendbuf, recvbuf, recvcounts, displs, datatype, stream); + } + + template + void allgatherv(const value_t* sendbuf, value_t* recvbuf, const size_t recvcounts[], + const int displs[], cudaStream_t stream) const { - impl_->allgatherv(sendbuf, recvbuf, recvcounts, displs, datatype, stream); + allgatherv(static_cast(sendbuf), static_cast(recvbuf), recvcounts, displs, get_type(), stream); } void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount, - datatype_t datatype, op_t op, cudaStream_t stream) const { - impl_->reducescatter(sendbuff, recvbuff, recvcount, datatype, op, stream); + datatype_t datatype, op_t op, cudaStream_t stream) const { + + impl_->reducescatter(sendbuff, recvbuff, + recvcount, datatype , op, stream); + } + + template + void reducescatter(const value_t* sendbuff, value_t* recvbuff, size_t recvcount, + op_t op, cudaStream_t stream) const { + reducescatter(static_cast(sendbuff), static_cast(recvbuff), + recvcount, get_type() , op, stream); } private: @@ -159,5 +225,44 @@ class comms_t { comms_iface::~comms_iface() {} +template <> +constexpr datatype_t comms_t::get_type() const { + return datatype_t::CHAR; +} + +template <> +constexpr datatype_t comms_t::get_type() const { + return datatype_t::UINT8; +} + +template <> +constexpr datatype_t comms_t::get_type() const { + return datatype_t::INT32; +} + +template <> +constexpr datatype_t comms_t::get_type() const { + return datatype_t::UINT32; +} + +template <> +constexpr datatype_t comms_t::get_type() const { + return datatype_t::INT64; +} + +template <> +constexpr datatype_t comms_t::get_type() const { + return datatype_t::UINT64; +} + +template <> +constexpr datatype_t comms_t::get_type() const { + return datatype_t::FLOAT32; +} + +template <> +constexpr datatype_t comms_t::get_type() const { + return datatype_t::FLOAT64; +} } // namespace comms } // namespace raft diff --git a/cpp/include/raft/comms/comms_helper.hpp b/cpp/include/raft/comms/comms_helper.hpp index 2238226383..4edfc4b6c7 100644 --- a/cpp/include/raft/comms/comms_helper.hpp +++ b/cpp/include/raft/comms/comms_helper.hpp @@ -31,7 +31,9 @@ namespace comms { void build_comms_nccl_only(handle_t *handle, ncclComm_t comm, int size, int rank) { auto d_alloc = handle->get_device_allocator(); - auto *raft_comm = new raft::comms::std_comms(comm, size, rank, d_alloc); + raft::comms::comms_iface *raft_comm = new raft::comms::std_comms(comm, size, rank, d_alloc); + std::cout << "Comms: " << raft_comm->getSize() << std::endl; + auto communicator = std::make_shared(std::unique_ptr(raft_comm)); handle->set_comms(communicator); @@ -62,6 +64,7 @@ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t comm, void *ucp_worker, auto d_alloc = handle->get_device_allocator(); auto *raft_comm = new raft::comms::std_comms(comm, (ucp_worker_h)ucp_worker, eps_sp, size, rank, d_alloc); + std::cout << "Comms: " << raft_comm << std::endl; auto communicator = std::make_shared(std::unique_ptr(raft_comm)); handle->set_comms(communicator); @@ -76,12 +79,12 @@ bool test_collective_allreduce(const handle_t &handle) { raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(1, stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); - 
communicator.allreduce(temp_d.data(), temp_d.data(), 1, datatype_t::INT32, - op_t::SUM, stream); + communicator.allreduce(temp_d.data(), temp_d.data(), 1, + datatype_t::INT32, op_t::SUM, stream); int temp_h = 0; - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -106,7 +109,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { //post receives for (int r = 0; r < communicator.getSize(); ++r) { if (r != rank) { - communicator.irecv(received_data.data() + request_idx, sizeof(int), r, + communicator.irecv(received_data.data() + request_idx, 1, r, 0, requests.data() + request_idx); ++request_idx; } @@ -114,7 +117,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { for (int r = 0; r < communicator.getSize(); ++r) { if (r != rank) { - communicator.isend(&rank, sizeof(int), r, 0, + communicator.isend(&rank, 1, r, 0, requests.data() + request_idx); ++request_idx; } diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index e806740207..d885853f1d 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -64,7 +64,8 @@ namespace raft { namespace comms { -constexpr size_t get_datatype_size(const datatype_t datatype) { +constexpr size_t get_size(const datatype_t datatype) { + switch (datatype) { case datatype_t::CHAR: return sizeof(char); @@ -85,7 +86,7 @@ constexpr size_t get_datatype_size(const datatype_t datatype) { } } -ncclDataType_t get_nccl_datatype(const datatype_t datatype) { +constexpr ncclDataType_t get_nccl_type(const datatype_t datatype) { switch (datatype) { case datatype_t::CHAR: return ncclChar; @@ -106,7 +107,7 @@ ncclDataType_t get_nccl_datatype(const datatype_t datatype) { } } -ncclRedOp_t get_nccl_op(const op_t op) { +constexpr ncclRedOp_t get_nccl_op(const op_t op) { switch (op) { case op_t::SUM: return ncclSum; @@ -159,7 +160,7 @@ class std_comms : public comms_iface { initialize(); }; - virtual ~std_comms() { + ~std_comms() { CUDA_CHECK_NO_THROW(cudaStreamDestroy(stream_)); device_allocator_->deallocate(sendbuff_, sizeof(int), stream_); @@ -316,13 +317,13 @@ class std_comms : public comms_iface { void allreduce(const void *sendbuff, void *recvbuff, size_t count, datatype_t datatype, op_t op, cudaStream_t stream) const { NCCL_CHECK(ncclAllReduce(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), get_nccl_op(op), + get_nccl_type(datatype), get_nccl_op(op), nccl_comm_, stream)); } void bcast(void *buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const { - NCCL_CHECK(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), + NCCL_CHECK(ncclBroadcast(buff, buff, count, get_nccl_type(datatype), root, nccl_comm_, stream)); } @@ -330,14 +331,14 @@ class std_comms : public comms_iface { datatype_t datatype, op_t op, int root, cudaStream_t stream) const { NCCL_CHECK(ncclReduce(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), get_nccl_op(op), root, + get_nccl_type(datatype), get_nccl_op(op), root, nccl_comm_, stream)); } void allgather(const void *sendbuff, void *recvbuff, size_t sendcount, datatype_t datatype, cudaStream_t stream) const { NCCL_CHECK(ncclAllGather(sendbuff, recvbuff, sendcount, - get_nccl_datatype(datatype), nccl_comm_, stream)); + get_nccl_type(datatype), nccl_comm_, stream)); } void allgatherv(const void *sendbuf, void *recvbuf, 
const size_t recvcounts[], @@ -346,10 +347,10 @@ class std_comms : public comms_iface { //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf //Listing 1 on page 4. for (int root = 0; root < num_ranks_; ++root) { - size_t dtype_size = get_datatype_size(datatype); + size_t dtype_size = get_size(datatype); NCCL_CHECK(ncclBroadcast( sendbuf, static_cast(recvbuf) + displs[root] * dtype_size, - recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_, + recvcounts[root], get_nccl_type(datatype), root, nccl_comm_, stream)); } } @@ -357,7 +358,7 @@ class std_comms : public comms_iface { void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount, datatype_t datatype, op_t op, cudaStream_t stream) const { NCCL_CHECK(ncclReduceScatter(sendbuff, recvbuff, recvcount, - get_nccl_datatype(datatype), get_nccl_op(op), + get_nccl_type(datatype), get_nccl_op(op), nccl_comm_, stream)); } diff --git a/python/setup.py b/python/setup.py index 107a061bd9..4f4e6c8b96 100644 --- a/python/setup.py +++ b/python/setup.py @@ -112,7 +112,7 @@ os.path.join(os.sys.prefix, "lib")], libraries=libs, language='c++', - extra_compile_args=['-std=c++11']) + extra_compile_args=['-std=c++14']) ] From 3150fbd7a8a7803aef999993b5ad23562330aba0 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 19 May 2020 11:40:41 -0400 Subject: [PATCH 033/189] Using std::this_thread::yield instead of pthread_yield() --- cpp/include/raft/comms/std_comms.hpp | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index d885853f1d..e4e59115f1 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -64,8 +64,7 @@ namespace raft { namespace comms { -constexpr size_t get_size(const datatype_t datatype) { - +constexpr size_t get_datatype_size(const datatype_t datatype) { switch (datatype) { case datatype_t::CHAR: return sizeof(char); @@ -86,7 +85,7 @@ constexpr size_t get_size(const datatype_t datatype) { } } -constexpr ncclDataType_t get_nccl_type(const datatype_t datatype) { +ncclDataType_t get_nccl_datatype(const datatype_t datatype) { switch (datatype) { case datatype_t::CHAR: return ncclChar; @@ -107,7 +106,7 @@ constexpr ncclDataType_t get_nccl_type(const datatype_t datatype) { } } -constexpr ncclRedOp_t get_nccl_op(const op_t op) { +ncclRedOp_t get_nccl_op(const op_t op) { switch (op) { case op_t::SUM: return ncclSum; @@ -160,7 +159,7 @@ class std_comms : public comms_iface { initialize(); }; - ~std_comms() { + virtual ~std_comms() { CUDA_CHECK_NO_THROW(cudaStreamDestroy(stream_)); device_allocator_->deallocate(sendbuff_, sizeof(int), stream_); @@ -317,13 +316,13 @@ class std_comms : public comms_iface { void allreduce(const void *sendbuff, void *recvbuff, size_t count, datatype_t datatype, op_t op, cudaStream_t stream) const { NCCL_CHECK(ncclAllReduce(sendbuff, recvbuff, count, - get_nccl_type(datatype), get_nccl_op(op), + get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); } void bcast(void *buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const { - NCCL_CHECK(ncclBroadcast(buff, buff, count, get_nccl_type(datatype), + NCCL_CHECK(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } @@ -331,14 +330,14 @@ class std_comms : public comms_iface { datatype_t datatype, op_t op, int root, cudaStream_t stream) const { 
NCCL_CHECK(ncclReduce(sendbuff, recvbuff, count, - get_nccl_type(datatype), get_nccl_op(op), root, + get_nccl_datatype(datatype), get_nccl_op(op), root, nccl_comm_, stream)); } void allgather(const void *sendbuff, void *recvbuff, size_t sendcount, datatype_t datatype, cudaStream_t stream) const { NCCL_CHECK(ncclAllGather(sendbuff, recvbuff, sendcount, - get_nccl_type(datatype), nccl_comm_, stream)); + get_nccl_datatype(datatype), nccl_comm_, stream)); } void allgatherv(const void *sendbuf, void *recvbuf, const size_t recvcounts[], @@ -347,10 +346,10 @@ class std_comms : public comms_iface { //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf //Listing 1 on page 4. for (int root = 0; root < num_ranks_; ++root) { - size_t dtype_size = get_size(datatype); + size_t dtype_size = get_datatype_size(datatype); NCCL_CHECK(ncclBroadcast( sendbuf, static_cast(recvbuf) + displs[root] * dtype_size, - recvcounts[root], get_nccl_type(datatype), root, nccl_comm_, + recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_, stream)); } } @@ -358,7 +357,7 @@ class std_comms : public comms_iface { void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount, datatype_t datatype, op_t op, cudaStream_t stream) const { NCCL_CHECK(ncclReduceScatter(sendbuff, recvbuff, recvcount, - get_nccl_type(datatype), get_nccl_op(op), + get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); } @@ -390,7 +389,7 @@ class std_comms : public comms_iface { } // Let other threads (including NCCL threads) use the CPU. - pthread_yield(); + std::this_thread::yield(); } } From 5628ad230d7c0035089666ec069e11a1232a4923 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 19 May 2020 14:26:15 -0400 Subject: [PATCH 034/189] Adding python tests for collective functions --- cpp/include/raft/comms/comms.hpp | 174 +++++++++++++++-------- cpp/include/raft/comms/comms_helper.hpp | 176 ++++++++++++++++++++++-- cpp/include/raft/comms/std_comms.hpp | 1 + python/raft/dask/common/__init__.py | 5 +- python/raft/dask/common/comms_utils.pyx | 62 ++++++++- python/raft/test/test_comms.py | 31 +++-- 6 files changed, 365 insertions(+), 84 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index f76c456647..5861e159ab 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -52,7 +52,6 @@ class comms_iface { virtual int getRank() const = 0; virtual std::unique_ptr commSplit(int color, int key) const = 0; - virtual void barrier() const = 0; virtual status_t syncStream(cudaStream_t stream) const = 0; @@ -97,125 +96,180 @@ class comms_t { template datatype_t get_type() const; + /** + * Returns the size of the communicator clique + */ + int getSize() const { return impl_->getSize(); } + + /** + * Returns the local rank + */ int getRank() const { return impl_->getRank(); } + /** + * Splits the current communicator clique into sub-cliques matching + * the given color and key + * + * @param color ranks w/ the same color are placed in the same communicator + * @param key controls rank assignment + */ std::unique_ptr commSplit(int color, int key) const { return impl_->commSplit(color, key); } + /** + * Performs a collective barrier synchronization + */ void barrier() const { impl_->barrier(); } + /** + * Some collective communications implementations (eg. NCCL) might use asynchronous + * collectives that are explicitly synchronized. 
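+ * A typical call site is the following sketch (`comm`, `d_in`, `d_out` and `n`
+ * are hypothetical):
+ * @code
+ *   comm.allreduce(d_in, d_out, n, op_t::SUM, stream);
+ *   ASSERT(comm.syncStream(stream) == status_t::commStatusSuccess,
+ *          "ERROR: a remote rank failed during the allreduce");
+ * @endcode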
It's important to always synchronize + * using this method to allow failures to propagate, rather than `cudaStreamSynchronize()`, + * to prevent the potential for deadlocks. + * + * @param stream the cuda stream to sync collective operations on + */ status_t syncStream(cudaStream_t stream) const { return impl_->syncStream(stream); } - void isend(const void* buf, size_t size, int dest, int tag, - request_t* request) const { - impl_->isend(buf, size, dest, tag, request); - } - + /** + * Performs an asynchronous point-to-point send + * @tparam value_t the type of data to send + * @param buf pointer to array of data to send + * @param size number of elements in buf + * @param dest destination rank + * @param tag a tag to use for the receiver to filter + * @param request pointer to hold returned request_t object. + * This will be used in `waitall()` to synchronize until the message is delivered (or fails). + */ template void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const { - isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); - } - - void irecv(void* buf, size_t size, int source, int tag, - request_t* request) const { - impl_->irecv(buf, size, source, tag, request); + impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); } + /** + * Performs an asynchronous point-to-point receive + * @tparam value_t the type of data to be received + * @param buf pointer to (initialized) array that will hold received data + * @param size number of elements in buf + * @param source source rank + * @param tag a tag to use for message filtering + * @param request pointer to hold returned request_t object. + * This will be used in `waitall()` to synchronize until the message is delivered (or fails). + */ template void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const { - irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); + impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); } + /** + * Synchronize on an array of request_t objects returned from isend/irecv + * @param count number of requests to synchronize on + * @param array_of_requests an array of request_t objects returned from isend/irecv + */ void waitall(int count, request_t array_of_requests[]) const { impl_->waitall(count, array_of_requests); } - void allreduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, cudaStream_t stream) const { - - impl_->allreduce(sendbuff, recvbuff, count, datatype, op, stream); - } - + /** + * Perform an allreduce collective + * @tparam value_t datatype of underlying buffers + * @param sendbuff data to reduce + * @param recvbuff buffer to hold the reduced result + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ template void allreduce(const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const { - allreduce(static_cast(sendbuff), static_cast(recvbuff) - , sizeof(value_t)*count, get_type(), op, stream); - } - - void bcast(void* buff, size_t count, datatype_t datatype, int root, - cudaStream_t stream) const { - - impl_->bcast(buff, count, datatype , root, stream); + impl_->allreduce(static_cast(sendbuff), static_cast(recvbuff) + , count, get_type(), op, stream); } + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param buff buffer to send + * @param count 
number of elements in buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ template void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const { - bcast(static_cast(buff), count, get_type() , root, stream); - } - - void reduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, int root, - cudaStream_t stream) const { - - impl_->reduce(sendbuff, recvbuff, count, datatype, op, root, stream); + impl_->bcast(static_cast(buff), count, get_type(), root, stream); } + /** + * Reduce data from many ranks down to a single rank + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to reduce + * @param recvbuff buffer containing reduced data (only needs to be initialized on root) + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ template void reduce(const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, int root, cudaStream_t stream) const { - reduce(static_cast(sendbuff), static_cast(recvbuff), + impl_->reduce(static_cast(sendbuff), static_cast(recvbuff), count, get_type(), op, root, stream); } - void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const { - - impl_->allgather(sendbuff, recvbuff, - sendcount, datatype, stream); - } - + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param stream CUDA stream to synchronize operation + */ template void allgather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount, cudaStream_t stream) const { - allgather(static_cast(sendbuff), static_cast(recvbuff), + impl_->allgather(static_cast(sendbuff), static_cast(recvbuff), sendcount, get_type(), stream); } - void allgatherv(const void* sendbuf, void* recvbuf, const size_t recvcounts[], - const int displs[], datatype_t datatype, - cudaStream_t stream) const { - - impl_->allgatherv(sendbuf, recvbuf, recvcounts, displs, datatype, stream); - } - + /** + * Gathers data from all ranks and delivers the combined data to all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to send + * @param recvbuff buffer containing data to receive + * @param recvcounts array (of length num_ranks size) containing the number of elements + * that are to be received from each rank + * @param displs array (of length num_ranks size) to specify the displacement (relative to recvbuf) + * at which to place the incoming data from each rank + * @param stream CUDA stream to synchronize operation + */ template void allgatherv(const value_t* sendbuf, value_t* recvbuf, const size_t recvcounts[], const int displs[], cudaStream_t stream) const { - allgatherv(static_cast(sendbuf), static_cast(recvbuf), recvcounts, displs, get_type(), stream); + impl_->allgatherv(static_cast(sendbuf), static_cast(recvbuf), recvcounts, displs, get_type(), stream); } - void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount, - datatype_t datatype, op_t op, cudaStream_t stream) const { - - impl_->reducescatter(sendbuff, recvbuff, - recvcount, datatype , op, stream); + /** + * Reduces data from all
ranks then scatters the result across ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to send (size recvcount * num_ranks) + * @param recvbuff buffer containing received data + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ template void reducescatter(const value_t* sendbuff, value_t* recvbuff, size_t recvcount, op_t op, cudaStream_t stream) const { - reducescatter(static_cast(sendbuff), static_cast(recvbuff), + impl_->reducescatter(static_cast(sendbuff), static_cast(recvbuff), recvcount, get_type() , op, stream); } diff --git a/cpp/include/raft/comms/comms_helper.hpp b/cpp/include/raft/comms/comms_helper.hpp index 4edfc4b6c7..d762785fef 100644 --- a/cpp/include/raft/comms/comms_helper.hpp +++ b/cpp/include/raft/comms/comms_helper.hpp @@ -27,11 +27,16 @@ namespace comms { /** * Function to construct comms_t and inject it on a handle_t. This * is used for convenience in the Python layer. + * + * @param handle raft::handle_t for injecting the comms + * @param nccl_comm initialized NCCL communicator to use for collectives + * @param num_ranks number of ranks in communicator clique + * @param rank rank of local instance */ -void build_comms_nccl_only(handle_t *handle, ncclComm_t comm, int size, +void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, int num_ranks, int rank) { auto d_alloc = handle->get_device_allocator(); - raft::comms::comms_iface *raft_comm = new raft::comms::std_comms(comm, size, rank, d_alloc); + raft::comms::comms_iface *raft_comm = new raft::comms::std_comms(nccl_comm, num_ranks, rank, d_alloc); std::cout << "Comms: " << raft_comm->getSize() << std::endl; auto communicator = @@ -42,14 +47,25 @@ void build_comms_nccl_only(handle_t *handle, ncclComm_t comm, int size, /** * Function to construct comms_t and inject it on a handle_t. This * is used for convenience in the Python layer. + * + * @param handle raft::handle_t for injecting the comms + * @param nccl_comm initialized NCCL communicator to use for collectives + * @param ucp_worker of local process + * Note: This is purposefully left as void* so that the ucp_worker_h + * doesn't need to be exposed through the cython layer + * @param eps array of ucp_ep_h instances. + * Note: This is purposefully left as void* so that + * the ucp_ep_h doesn't need to be exposed through the cython layer. 
+ * @param num_ranks number of ranks in communicator clique + * @param rank rank of local instance */ -void build_comms_nccl_ucx(handle_t *handle, ncclComm_t comm, void *ucp_worker, - void *eps, int size, int rank) { - auto eps_sp = std::make_shared(new ucp_ep_h[size]); +void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, void *ucp_worker, + void *eps, int num_ranks, int rank) { + auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); auto size_t_ep_arr = reinterpret_cast(eps); - for (int i = 0; i < size; i++) { + for (int i = 0; i < num_ranks; i++) { size_t ptr = size_t_ep_arr[i]; auto ucp_ep_v = reinterpret_cast(*eps_sp); @@ -62,15 +78,20 @@ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t comm, void *ucp_worker, } auto d_alloc = handle->get_device_allocator(); - auto *raft_comm = new raft::comms::std_comms(comm, (ucp_worker_h)ucp_worker, - eps_sp, size, rank, d_alloc); - std::cout << "Comms: " << raft_comm << std::endl; + auto *raft_comm = new raft::comms::std_comms(nccl_comm, (ucp_worker_h)ucp_worker, + eps_sp, num_ranks, rank, d_alloc); auto communicator = std::make_shared(std::unique_ptr(raft_comm)); handle->set_comms(communicator); } -bool test_collective_allreduce(const handle_t &handle) { +/** + * A simple sanity check that NCCL is able to perform a collective operation + * + * @param the raft handle to use. This is expected to already have an + * initialized comms instance. + */ +bool test_collective_allreduce(const handle_t &handle, int root) { const comms_t &communicator = handle.get_comms(); const int send = 1; @@ -82,7 +103,7 @@ bool test_collective_allreduce(const handle_t &handle) { CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); communicator.allreduce(temp_d.data(), temp_d.data(), 1, - datatype_t::INT32, op_t::SUM, stream); + op_t::SUM, stream); int temp_h = 0; CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream)); @@ -95,6 +116,139 @@ bool test_collective_allreduce(const handle_t &handle) { return temp_h == communicator.getSize(); } + +/** + * A simple sanity check that NCCL is able to perform a collective operation + * + * @param the raft handle to use. This is expected to already have an + * initialized comms instance. 
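+ * @param root the rank that initiates the broadcast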
+ */ +bool test_collective_broadcast(const handle_t &handle, int root) { + const comms_t &communicator = handle.get_comms(); + + const int send = root; + + cudaStream_t stream = handle.get_stream(); + + raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); + temp_d.resize(1, stream); + + if(communicator.getRank() == root) + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), + cudaMemcpyHostToDevice, stream)); + + communicator.bcast(temp_d.data(), 1, root, stream); + communicator.syncStream(stream); + int temp_h = -1; // Verify more than one byte is being sent + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + communicator.barrier(); + + std::cout << "Clique size: " << communicator.getSize() << std::endl; + std::cout << "final_size: " << temp_h << std::endl; + + return temp_h == root; +} + +bool test_collective_reduce(const handle_t &handle, int root) { + const comms_t &communicator = handle.get_comms(); + + const int send = root; + + cudaStream_t stream = handle.get_stream(); + + raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); + temp_d.resize(1, stream); + + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), + cudaMemcpyHostToDevice, stream)); + + communicator.reduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, root, stream); + communicator.syncStream(stream); + int temp_h = -1; // Verify more than one byte is being sent + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + communicator.barrier(); + + std::cout << "Clique size: " << communicator.getSize() << std::endl; + std::cout << "final_size: " << temp_h << std::endl; + + if(communicator.getRank() == root) + return temp_h == root * communicator.getSize(); + else + return true; +} + + +bool test_collective_allgather(const handle_t &handle, int root) { + const comms_t &communicator = handle.get_comms(); + + const int send = communicator.getRank(); + + cudaStream_t stream = handle.get_stream(); + + raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); + temp_d.resize(1, stream); + + raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, communicator.getSize()); + + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), + cudaMemcpyHostToDevice, stream)); + + communicator.allgather(temp_d.data(), recv_d.data(), 1, stream); + communicator.syncStream(stream); + int temp_h[communicator.getSize()]; // Verify more than one byte is being sent + CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int)*communicator.getSize(), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + communicator.barrier(); + + std::cout << "Clique size: " << communicator.getSize() << std::endl; + std::cout << "final_size: " << temp_h[0] << std::endl; + + for(int i = 0; i < communicator.getSize(); i++) + if(temp_h[i] != i) + return false; + return true; +} + +bool test_collective_reducescatter(const handle_t &handle, int root) { + const comms_t &communicator = handle.get_comms(); + + std::vector<int> sends(communicator.getSize(), 1); + + cudaStream_t stream = handle.get_stream(); + + raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream, sends.size()); + raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, 1); + + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), sends.size() * sizeof(int), + cudaMemcpyHostToDevice, stream)); + + communicator.reducescatter(temp_d.data(), recv_d.data(),
1, op_t::SUM, stream); + communicator.syncStream(stream); + int temp_h = -1; // Verify more than one byte is being sent + CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + communicator.barrier(); + + std::cout << "Clique size: " << communicator.getSize() << std::endl; + std::cout << "final_size: " << temp_h << std::endl; + + return temp_h == communicator.getSize(); +} + + +/** + * A simple sanity check that UCX is able to send messages between all ranks + * + * @param h the raft handle to use. This is expected to already have an + * initialized comms instance. + * @param numTrials number of iterations of all-to-all messaging to perform + */ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { const comms_t &communicator = h.get_comms(); const int rank = communicator.getRank(); diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index e4e59115f1..ecd589a32c 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -92,6 +92,7 @@ ncclDataType_t get_nccl_datatype(const datatype_t datatype) { case datatype_t::UINT8: return ncclUint8; case datatype_t::INT32: + std::cout << "Returning int32" << std::endl; return ncclInt; case datatype_t::UINT32: return ncclUint32; diff --git a/python/raft/dask/common/__init__.py b/python/raft/dask/common/__init__.py index 9b85a32800..37981b70fb 100644 --- a/python/raft/dask/common/__init__.py +++ b/python/raft/dask/common/__init__.py @@ -20,4 +20,7 @@ from raft.dask.common.comms_utils import inject_comms_on_handle_coll_only from raft.dask.common.comms_utils import perform_test_comms_allreduce from raft.dask.common.comms_utils import perform_test_comms_send_recv - +from raft.dask.common.comms_utils import perform_test_comms_allgather +from raft.dask.common.comms_utils import perform_test_comms_bcast +from raft.dask.common.comms_utils import perform_test_comms_reduce +from raft.dask.common.comms_utils import perform_test_comms_reducescatter diff --git a/python/raft/dask/common/comms_utils.pyx b/python/raft/dask/common/comms_utils.pyx index 1825d385a7..8272446529 100644 --- a/python/raft/dask/common/comms_utils.pyx +++ b/python/raft/dask/common/comms_utils.pyx @@ -55,12 +55,16 @@ cdef extern from "raft/comms/comms_helper.hpp" namespace "raft::comms": int size, int rank) except + - bool test_collective_allreduce(const handle_t &h) except + + bool test_collective_allreduce(const handle_t &h, int root) except + + bool test_collective_broadcast(const handle_t &h, int root) except + + bool test_collective_reduce(const handle_t &h, int root) except + + bool test_collective_allgather(const handle_t &h, int root) except + + bool test_collective_reducescatter(const handle_t &h, int root) except + bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) except + -def perform_test_comms_allreduce(handle): +def perform_test_comms_allreduce(handle, root): """ Performs an allreduce on the current worker Parameters ---------- handle : raft.common.Handle handle containing comms_t to use """ cdef const handle_t* h = handle.getHandle() - return test_collective_allreduce(deref(h)) + return test_collective_allreduce(deref(h), root) + + +def perform_test_comms_reduce(handle, root): + """ + Performs a reduce on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + """ + cdef const handle_t* h = handle.getHandle() +
return test_collective_reduce(deref(h), root) + + +def perform_test_comms_reducescatter(handle, root): + """ + Performs a reduce-scatter on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + """ + cdef const handle_t* h = handle.getHandle() + return test_collective_reducescatter(deref(h), root) + + +def perform_test_comms_bcast(handle, root): + """ + Performs a broadcast on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + """ + cdef const handle_t* h = handle.getHandle() + return test_collective_broadcast(deref(h), root) + + +def perform_test_comms_allgather(handle, root): + """ + Performs an allgather on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + """ + cdef const handle_t* h = handle.getHandle() + return test_collective_allgather(deref(h), root) def perform_test_comms_send_recv(handle, n_trials): diff --git a/python/raft/test/test_comms.py b/python/raft/test/test_comms.py index 9fb735c361..d3a6c20ffc 100644 --- a/python/raft/test/test_comms.py +++ b/python/raft/test/test_comms.py @@ -22,6 +22,10 @@ from raft.dask.common import local_handle from raft.dask.common import perform_test_comms_send_recv from raft.dask.common import perform_test_comms_allreduce +from raft.dask.common import perform_test_comms_bcast +from raft.dask.common import perform_test_comms_reduce +from raft.dask.common import perform_test_comms_allgather +from raft.dask.common import perform_test_comms_reducescatter pytestmark = pytest.mark.mg @@ -43,9 +47,9 @@ def test_comms_init_no_p2p(cluster): client.close() -def func_test_allreduce(sessionId): +def func_test_collective(func, sessionId, root): handle = local_handle(sessionId) - return perform_test_comms_allreduce(handle) + return func(handle, root) def func_test_send_recv(sessionId, n_trials): @@ -79,7 +83,12 @@ def _has_handle(sessionId): @pytest.mark.nccl +@pytest.mark.parametrize("func", [perform_test_comms_allgather, + perform_test_comms_allreduce, + perform_test_comms_bcast, + perform_test_comms_reduce, + perform_test_comms_reducescatter]) -def test_allreduce(cluster): +def test_collectives(cluster, func): client = Client(cluster) @@ -87,12 +96,16 @@ def test_allreduce(cluster): cb = Comms() cb.init() - dfs = [client.submit(func_test_allreduce, - cb.sessionId, - pure=False, - workers=[w]) - for w in cb.worker_addresses] - wait(dfs, timeout=5) + for k, v in cb.worker_info(cb.worker_addresses).items(): + + dfs = [client.submit(func_test_collective, + func, + cb.sessionId, + v["rank"], + pure=False, + workers=[w]) + for w in cb.worker_addresses] + wait(dfs, timeout=5) assert all([x.result() for x in dfs]) From 488d0d5866df9df1eb2845c359d8e223abd81cda Mon Sep 17 00:00:00 2001 From: "Corey J.
Nolet" Date: Tue, 19 May 2020 14:27:30 -0400 Subject: [PATCH 035/189] Running cpp style --- cpp/include/raft/comms/comms.hpp | 83 ++++++++++++------------ cpp/include/raft/comms/comms_helper.hpp | 84 +++++++++++++------------ 2 files changed, 87 insertions(+), 80 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 5861e159ab..5c47316670 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -43,7 +43,6 @@ enum class status_t { commStatusAbort // A failure occurred in sync, queued operations aborted }; - class comms_iface { public: virtual ~comms_iface(); @@ -57,30 +56,30 @@ class comms_iface { virtual status_t syncStream(cudaStream_t stream) const = 0; virtual void isend(const void* buf, size_t size, int dest, int tag, - request_t* request) const=0; + request_t* request) const = 0; virtual void irecv(void* buf, size_t size, int source, int tag, - request_t* request) const = 0; + request_t* request) const = 0; virtual void waitall(int count, request_t array_of_requests[]) const = 0; virtual void allreduce(const void* sendbuff, void* recvbuff, size_t count, datatype_t datatype, op_t op, - cudaStream_t stream) const = 0; + cudaStream_t stream) const = 0; virtual void bcast(void* buff, size_t count, datatype_t datatype, int root, - cudaStream_t stream) const = 0; + cudaStream_t stream) const = 0; virtual void reduce(const void* sendbuff, void* recvbuff, size_t count, datatype_t datatype, op_t op, int root, - cudaStream_t stream) const = 0; + cudaStream_t stream) const = 0; virtual void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const = 0; + datatype_t datatype, cudaStream_t stream) const = 0; virtual void allgatherv(const void* sendbuf, void* recvbuf, const size_t recvcounts[], const int displs[], - datatype_t datatype, cudaStream_t stream) const = 0; + datatype_t datatype, cudaStream_t stream) const = 0; virtual void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount, datatype_t datatype, op_t op, @@ -102,7 +101,6 @@ class comms_t { int getSize() const { return impl_->getSize(); } - /** * Returns the local rank */ @@ -146,10 +144,11 @@ class comms_t { * @param request pointer to hold returned request_t object. * This will be used in `waitall()` to synchronize until the message is delivered (or fails). */ - template + template void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const { - impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); + impl_->isend(static_cast(buf), size * sizeof(value_t), dest, + tag, request); } /** @@ -162,10 +161,11 @@ class comms_t { * @param request pointer to hold returned request_t object. * This will be used in `waitall()` to synchronize until the message is delivered (or fails). 
*/ - template + template void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const { - impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); + impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, + request); } /** @@ -174,7 +174,7 @@ class comms_t { * @param array_of_requests an array of request_t objects returned from isend/irecv */ void waitall(int count, request_t array_of_requests[]) const { - impl_->waitall(count, array_of_requests); + impl_->waitall(count, array_of_requests); } /** @@ -186,11 +186,12 @@ class comms_t { * @param op reduction operation to perform * @param stream CUDA stream to synchronize operation */ - template + template void allreduce(const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const { - impl_->allreduce(static_cast(sendbuff), static_cast(recvbuff) - , count, get_type(), op, stream); + impl_->allreduce(static_cast(sendbuff), + static_cast(recvbuff), count, get_type(), + op, stream); } /** @@ -201,10 +202,10 @@ class comms_t { * @param root the rank initiating the broadcast * @param stream CUDA stream to synchronize operation */ - template - void bcast(value_t* buff, size_t count, int root, - cudaStream_t stream) const { - impl_->bcast(static_cast(buff), count, get_type(), root, stream); + template + void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const { + impl_->bcast(static_cast(buff), count, get_type(), root, + stream); } /** @@ -217,12 +218,12 @@ class comms_t { * @param root rank to store the results * @param stream CUDA stream to synchronize operation */ - template - void reduce(const value_t* sendbuff, value_t* recvbuff, size_t count, - op_t op, int root, - cudaStream_t stream) const { - impl_->reduce(static_cast(sendbuff), static_cast(recvbuff), - count, get_type(), op, root, stream); + template + void reduce(const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, + int root, cudaStream_t stream) const { + impl_->reduce(static_cast(sendbuff), + static_cast(recvbuff), count, get_type(), op, + root, stream); } /** @@ -233,11 +234,12 @@ class comms_t { * @param sendcount number of elements in send buffer * @param stream CUDA stream to synchronize operation */ - template + template void allgather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount, cudaStream_t stream) const { - impl_->allgather(static_cast(sendbuff), static_cast(recvbuff), - sendcount, get_type(), stream); + impl_->allgather(static_cast(sendbuff), + static_cast(recvbuff), sendcount, + get_type(), stream); } /** @@ -251,11 +253,13 @@ class comms_t { * at which to place the incoming data from each rank * @param stream CUDA stream to synchronize operation */ - template - void allgatherv(const value_t* sendbuf, value_t* recvbuf, const size_t recvcounts[], - const int displs[], + template + void allgatherv(const value_t* sendbuf, value_t* recvbuf, + const size_t recvcounts[], const int displs[], cudaStream_t stream) const { - impl_->allgatherv(static_cast(sendbuf), static_cast(recvbuf), recvcounts, displs, get_type(), stream); + impl_->allgatherv(static_cast(sendbuf), + static_cast(recvbuf), recvcounts, displs, + get_type(), stream); } /** @@ -266,11 +270,12 @@ class comms_t { * @param op reduction operation to perform * @param stream CUDA stream to synchronize operation */ - template - void reducescatter(const value_t* sendbuff, value_t* recvbuff, size_t recvcount, - op_t op, cudaStream_t stream) const { - 
impl_->reducescatter(static_cast(sendbuff), static_cast(recvbuff), - recvcount, get_type() , op, stream); + template + void reducescatter(const value_t* sendbuff, value_t* recvbuff, + size_t recvcount, op_t op, cudaStream_t stream) const { + impl_->reducescatter(static_cast(sendbuff), + static_cast(recvbuff), recvcount, + get_type(), op, stream); } private: diff --git a/cpp/include/raft/comms/comms_helper.hpp b/cpp/include/raft/comms/comms_helper.hpp index d762785fef..2838929f8f 100644 --- a/cpp/include/raft/comms/comms_helper.hpp +++ b/cpp/include/raft/comms/comms_helper.hpp @@ -33,10 +33,11 @@ namespace comms { * @param num_ranks number of ranks in communicator clique * @param rank rank of local instance */ -void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, int num_ranks, - int rank) { +void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, + int num_ranks, int rank) { auto d_alloc = handle->get_device_allocator(); - raft::comms::comms_iface *raft_comm = new raft::comms::std_comms(nccl_comm, num_ranks, rank, d_alloc); + raft::comms::comms_iface *raft_comm = + new raft::comms::std_comms(nccl_comm, num_ranks, rank, d_alloc); std::cout << "Comms: " << raft_comm->getSize() << std::endl; auto communicator = @@ -59,8 +60,9 @@ void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, int num_ranks * @param num_ranks number of ranks in communicator clique * @param rank rank of local instance */ -void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, void *ucp_worker, - void *eps, int num_ranks, int rank) { +void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, + void *ucp_worker, void *eps, int num_ranks, + int rank) { auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); auto size_t_ep_arr = reinterpret_cast(eps); @@ -78,8 +80,8 @@ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, void *ucp_work } auto d_alloc = handle->get_device_allocator(); - auto *raft_comm = new raft::comms::std_comms(nccl_comm, (ucp_worker_h)ucp_worker, - eps_sp, num_ranks, rank, d_alloc); + auto *raft_comm = new raft::comms::std_comms( + nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, d_alloc); auto communicator = std::make_shared(std::unique_ptr(raft_comm)); handle->set_comms(communicator); @@ -100,13 +102,12 @@ bool test_collective_allreduce(const handle_t &handle, int root) { raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(1, stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, 1, - cudaMemcpyHostToDevice, stream)); - communicator.allreduce(temp_d.data(), temp_d.data(), 1, - op_t::SUM, stream); + CUDA_CHECK( + cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); + communicator.allreduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, stream); int temp_h = 0; - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), 1, - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK( + cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -116,7 +117,6 @@ bool test_collective_allreduce(const handle_t &handle, int root) { return temp_h == communicator.getSize(); } - /** * A simple sanity check that NCCL is able to perform a collective operation * @@ -133,13 +133,13 @@ bool test_collective_broadcast(const handle_t &handle, int root) { raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(1, stream); - if(communicator.getRank() == root) - 
CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + if (communicator.getRank() == root) + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), + cudaMemcpyHostToDevice, stream)); communicator.bcast(temp_d.data(), 1, root, stream); communicator.syncStream(stream); - int temp_h = -1; // Verify more than one byte is being sent + int temp_h = -1; // Verify more than one byte is being sent CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -162,11 +162,11 @@ bool test_collective_reduce(const handle_t &handle, int root) { temp_d.resize(1, stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, stream)); communicator.reduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, root, stream); communicator.syncStream(stream); - int temp_h = -1; // Verify more than one byte is being sent + int temp_h = -1; // Verify more than one byte is being sent CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -175,13 +175,12 @@ bool test_collective_reduce(const handle_t &handle, int root) { std::cout << "Clique size: " << communicator.getSize() << std::endl; std::cout << "final_size: " << temp_h << std::endl; - if(communicator.getRank() == root) - return temp_h == root * communicator.getSize(); + if (communicator.getRank() == root) + return temp_h == root * communicator.getSize(); else - return true; + return true; } - bool test_collective_allgather(const handle_t &handle, int root) { const comms_t &communicator = handle.get_comms(); @@ -192,15 +191,18 @@ bool test_collective_allgather(const handle_t &handle, int root) { raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(1, stream); - raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, communicator.getSize()); + raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, + communicator.getSize()); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, stream)); communicator.allgather(temp_d.data(), recv_d.data(), 1, stream); communicator.syncStream(stream); - int temp_h[communicator.getSize()]; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int)*communicator.getSize(), + int + temp_h[communicator.getSize()]; // Verify more than one byte is being sent + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), + sizeof(int) * communicator.getSize(), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -208,9 +210,8 @@ bool test_collective_allgather(const handle_t &handle, int root) { std::cout << "Clique size: " << communicator.getSize() << std::endl; std::cout << "final_size: " << temp_h << std::endl; - for(int i = 0; i < communicator.getSize(); i++) - if(temp_h[i] != i) - return false; + for (int i = 0; i < communicator.getSize(); i++) + if (temp_h[i] != i) return false; return true; } @@ -221,15 +222,18 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream, 1); - raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, 1); + raft::mr::device::buffer 
temp_d(handle.get_device_allocator(), stream, + 1); + raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, + 1); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, stream)); - communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, stream); + communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, + stream); communicator.syncStream(stream); - int temp_h = -1; // Verify more than one byte is being sent + int temp_h = -1; // Verify more than one byte is being sent CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -241,7 +245,6 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { return temp_h = communicator.getSize(); } - /** * A simple sanity check that UCX is able to send messages between all ranks * @@ -263,16 +266,15 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { //post receives for (int r = 0; r < communicator.getSize(); ++r) { if (r != rank) { - communicator.irecv(received_data.data() + request_idx, 1, r, - 0, requests.data() + request_idx); + communicator.irecv(received_data.data() + request_idx, 1, r, 0, + requests.data() + request_idx); ++request_idx; } } for (int r = 0; r < communicator.getSize(); ++r) { if (r != rank) { - communicator.isend(&rank, 1, r, 0, - requests.data() + request_idx); + communicator.isend(&rank, 1, r, 0, requests.data() + request_idx); ++request_idx; } } From 3d362d00834eec17b4dfd9db3ec28287d4b3f269 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 19 May 2020 14:37:29 -0400 Subject: [PATCH 036/189] Updating tabbing for pytests --- python/raft/test/test_comms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/raft/test/test_comms.py b/python/raft/test/test_comms.py index d3a6c20ffc..b065d098af 100644 --- a/python/raft/test/test_comms.py +++ b/python/raft/test/test_comms.py @@ -107,7 +107,7 @@ def test_collectives(cluster, func): for w in cb.worker_addresses] wait(dfs, timeout=5) - assert all([x.result() for x in dfs]) + assert all([x.result() for x in dfs]) finally: cb.destroy() From 417a4bd43d92a71e74d0fc2b7b522c3c880d380e Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 19 May 2020 15:53:07 -0400 Subject: [PATCH 037/189] Following clang tidy standards --- cpp/include/raft/comms/comms.hpp | 20 ++++----- cpp/include/raft/comms/comms_helper.hpp | 58 ++++++++++++------------- cpp/include/raft/comms/std_comms.hpp | 12 ++--- cpp/include/raft/handle.hpp | 10 ++--- 4 files changed, 50 insertions(+), 50 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 5c47316670..93b4dab3dd 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -47,13 +47,13 @@ class comms_iface { public: virtual ~comms_iface(); - virtual int getSize() const = 0; - virtual int getRank() const = 0; + virtual int get_size() const = 0; + virtual int get_rank() const = 0; - virtual std::unique_ptr commSplit(int color, int key) const = 0; + virtual std::unique_ptr comm_split(int color, int key) const = 0; virtual void barrier() const = 0; - virtual status_t syncStream(cudaStream_t stream) const = 0; + virtual status_t sync_stream(cudaStream_t stream) const = 0; virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; @@ -99,12 +99,12 @@ class comms_t { * Returns the size of the communicator clique */ - int getSize() const { return impl_->getSize(); } + int get_size() const { return impl_->get_size(); } /** * Returns the local rank */ - int getRank() const { return impl_->getRank(); } + int get_rank() const { return impl_->get_rank(); } /** * Splits the current communicator clique into sub-cliques matching @@ -113,8 +113,8 @@ class comms_t { * @param color ranks w/ the same color are placed in the same communicator * @param key controls rank assignment */ - std::unique_ptr commSplit(int color, int key) const { - return impl_->commSplit(color, key); + std::unique_ptr comm_split(int color, int key) const { + return impl_->comm_split(color, key); } /** @@ -130,8 +130,8 @@ class comms_t { * * @param stream the cuda stream to sync collective operations on */ - status_t syncStream(cudaStream_t stream) const { - return impl_->syncStream(stream); + status_t sync_stream(cudaStream_t stream) const { + return impl_->sync_stream(stream); } /** diff --git a/cpp/include/raft/comms/comms_helper.hpp b/cpp/include/raft/comms/comms_helper.hpp index 2838929f8f..6e6ba942cc 100644 --- a/cpp/include/raft/comms/comms_helper.hpp +++ b/cpp/include/raft/comms/comms_helper.hpp @@ -38,7 +38,7 @@ void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, auto d_alloc = handle->get_device_allocator(); raft::comms::comms_iface *raft_comm = new raft::comms::std_comms(nccl_comm, num_ranks, rank, d_alloc); - std::cout << "Comms: " << raft_comm->getSize() << std::endl; + std::cout << "Comms: " << raft_comm->get_size() << std::endl; auto communicator = std::make_shared(std::unique_ptr(raft_comm)); @@ -111,10 +111,10 @@ bool test_collective_allreduce(const handle_t &handle, int root) { CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); - std::cout << "Clique size: " << communicator.getSize() << std::endl; + std::cout << "Clique size: " << communicator.get_size() << std::endl; std::cout << "final_size: " << temp_h << std::endl; - return temp_h == communicator.getSize(); + return temp_h == communicator.get_size(); } /** @@ -133,19 +133,19 @@ bool test_collective_broadcast(const handle_t &handle, int root) { raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(1, stream); - if (communicator.getRank() == root) + if (communicator.get_rank() == 
root) CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.bcast(temp_d.data(), 1, root, stream); - communicator.syncStream(stream); + communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); - std::cout << "Clique size: " << communicator.getSize() << std::endl; + std::cout << "Clique size: " << communicator.get_size() << std::endl; std::cout << "final_size: " << temp_h << std::endl; return temp_h == root; @@ -165,18 +165,18 @@ bool test_collective_reduce(const handle_t &handle, int root) { cudaMemcpyHostToDevice, stream)); communicator.reduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, root, stream); - communicator.syncStream(stream); + communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); - std::cout << "Clique size: " << communicator.getSize() << std::endl; + std::cout << "Clique size: " << communicator.get_size() << std::endl; std::cout << "final_size: " << temp_h << std::endl; - if (communicator.getRank() == root) - return temp_h == root * communicator.getSize(); + if (communicator.get_rank() == root) + return temp_h == root * communicator.get_size(); else return true; } @@ -192,25 +192,25 @@ bool test_collective_allgather(const handle_t &handle, int root) { temp_d.resize(1, stream); raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, - communicator.getSize()); + communicator.get_size()); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.allgather(temp_d.data(), recv_d.data(), 1, stream); - communicator.syncStream(stream); + communicator.sync_stream(stream); int - temp_h[communicator.getSize()]; // Verify more than one byte is being sent + temp_h[communicator.get_size()]; // Verify more than one byte is being sent CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), - sizeof(int) * communicator.getSize(), + sizeof(int) * communicator.get_size(), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); - std::cout << "Clique size: " << communicator.getSize() << std::endl; + std::cout << "Clique size: " << communicator.get_size() << std::endl; std::cout << "final_size: " << temp_h << std::endl; - for (int i = 0; i < communicator.getSize(); i++) + for (int i = 0; i < communicator.get_size(); i++) if (temp_h[i] != i) return false; return true; } @@ -232,17 +232,17 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, stream); - communicator.syncStream(stream); + communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); - std::cout << "Clique size: " << communicator.getSize() << std::endl; + std::cout << "Clique size: " << communicator.get_size() << std::endl; std::cout << "final_size: " << temp_h << std::endl; - return temp_h = communicator.getSize(); + return temp_h = communicator.get_size(); } /** @@ -254,17 +254,17 @@ bool 
test_collective_reducescatter(const handle_t &handle, int root) { */ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { const comms_t &communicator = h.get_comms(); - const int rank = communicator.getRank(); + const int rank = communicator.get_rank(); bool ret = true; for (int i = 0; i < numTrials; i++) { - std::vector received_data((communicator.getSize() - 1), -1); + std::vector received_data((communicator.get_size() - 1), -1); std::vector requests; - requests.resize(2 * (communicator.getSize() - 1)); + requests.resize(2 * (communicator.get_size() - 1)); int request_idx = 0; //post receives - for (int r = 0; r < communicator.getSize(); ++r) { + for (int r = 0; r < communicator.get_size(); ++r) { if (r != rank) { communicator.irecv(received_data.data() + request_idx, 1, r, 0, requests.data() + request_idx); @@ -272,7 +272,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { } } - for (int r = 0; r < communicator.getSize(); ++r) { + for (int r = 0; r < communicator.get_size(); ++r) { if (r != rank) { communicator.isend(&rank, 1, r, 0, requests.data() + request_idx); ++request_idx; @@ -282,14 +282,14 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { communicator.waitall(requests.size(), requests.data()); communicator.barrier(); - if (communicator.getRank() == 0) { + if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; std::cout << "Trial " << i << std::endl; } - for (int printrank = 0; printrank < communicator.getSize(); ++printrank) { - if (communicator.getRank() == printrank) { - std::cout << "Rank " << communicator.getRank() << " received: ["; + for (int printrank = 0; printrank < communicator.get_size(); ++printrank) { + if (communicator.get_rank() == printrank) { + std::cout << "Rank " << communicator.get_rank() << " received: ["; for (int i = 0; i < received_data.size(); i++) { auto rec = received_data[i]; std::cout << rec; @@ -303,7 +303,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { communicator.barrier(); } - if (communicator.getRank() == 0) + if (communicator.get_rank() == 0) std::cout << "=========================" << std::endl; } diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index ecd589a32c..c20bc76bf6 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -176,11 +176,11 @@ class std_comms : public comms_iface { device_allocator_->allocate(sizeof(int), stream_)); } - int getSize() const { return num_ranks_; } + int get_size() const { return num_ranks_; } - int getRank() const { return rank_; } + int get_rank() const { return rank_; } - std::unique_ptr commSplit(int color, int key) const { + std::unique_ptr comm_split(int color, int key) const { // Not supported by NCCL ASSERT(false, "ERROR: commSplit called but not yet supported in this comms " @@ -193,7 +193,7 @@ class std_comms : public comms_iface { allreduce(sendbuff_, recvbuff_, 1, datatype_t::INT32, op_t::SUM, stream_); - ASSERT(syncStream(stream_) == status_t::commStatusSuccess, + ASSERT(sync_stream(stream_) == status_t::commStatusSuccess, "ERROR: syncStream failed. 
This can be caused by a failed rank_."); } @@ -221,7 +221,7 @@ class std_comms : public comms_iface { ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, - default_tag_mask, getRank()); + default_tag_mask, get_rank()); requests_in_flight_.insert(std::make_pair(*request, ucp_req)); } @@ -362,7 +362,7 @@ class std_comms : public comms_iface { nccl_comm_, stream)); } - status_t syncStream(cudaStream_t stream) const { + status_t sync_stream(cudaStream_t stream) const { cudaError_t cudaErr; ncclResult_t ncclErr, ncclAsyncErr; while (1) { diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index f8db324d90..01d6d7a83f 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -150,16 +150,16 @@ class handle_t { } void set_comms(std::shared_ptr communicator) { - _communicator = communicator; + communicator_ = communicator; } const comms::comms_t& get_comms() const { - ASSERT(nullptr != _communicator.get(), + ASSERT(nullptr != communicator_.get(), "ERROR: Communicator was not initialized\n"); - return *_communicator; + return *communicator_; } - bool comms_initialized() const { return (nullptr != _communicator.get()); } + bool comms_initialized() const { return (nullptr != communicator_.get()); } const cudaDeviceProp& get_device_properties() const { std::lock_guard _(mutex_); @@ -171,7 +171,7 @@ class handle_t { } private: - std::shared_ptr _communicator; + std::shared_ptr communicator_; const int dev_id_; const int num_streams_; From cb92349f2cfb993b43c4884356b227b10493f04b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 19 May 2020 18:51:38 -0400 Subject: [PATCH 038/189] Moving get_type out of comms_t --- cpp/include/raft/comms/comms.hpp | 86 +++++++++++++++++--------------- 1 file changed, 45 insertions(+), 41 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 93b4dab3dd..42827ef361 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -43,6 +43,51 @@ enum class status_t { commStatusAbort // A failure occurred in sync, queued operations aborted }; +template +constexpr datatype_t get_type(); + + +template <> +constexpr datatype_t get_type() { + return datatype_t::CHAR; +} + +template <> +constexpr datatype_t get_type() { + return datatype_t::UINT8; +} + +template <> +constexpr datatype_t get_type() { + return datatype_t::INT32; +} + +template <> +constexpr datatype_t get_type() { + return datatype_t::UINT32; +} + +template <> +constexpr datatype_t get_type() { + return datatype_t::INT64; +} + +template <> +constexpr datatype_t get_type() { + return datatype_t::UINT64; +} + +template <> +constexpr datatype_t get_type() { + return datatype_t::FLOAT32; +} + +template <> +constexpr datatype_t get_type() { + return datatype_t::FLOAT64; +} + + class comms_iface { public: virtual ~comms_iface(); @@ -92,9 +137,6 @@ class comms_t { ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); } - template - datatype_t get_type() const; - /** * Returns the size of the communicator clique */ @@ -284,44 +326,6 @@ class comms_t { comms_iface::~comms_iface() {} -template <> -constexpr datatype_t comms_t::get_type() const { - return datatype_t::CHAR; -} -template <> -constexpr datatype_t comms_t::get_type() const { - return datatype_t::UINT8; -} - -template <> -constexpr datatype_t comms_t::get_type() const { - return datatype_t::INT32; -} - -template <> -constexpr datatype_t comms_t::get_type() 
const { - return datatype_t::UINT32; -} - -template <> -constexpr datatype_t comms_t::get_type() const { - return datatype_t::INT64; -} - -template <> -constexpr datatype_t comms_t::get_type() const { - return datatype_t::UINT64; -} - -template <> -constexpr datatype_t comms_t::get_type() const { - return datatype_t::FLOAT32; -} - -template <> -constexpr datatype_t comms_t::get_type() const { - return datatype_t::FLOAT64; -} } // namespace comms } // namespace raft From 6e9025d03f67e96b80989f5e3b6c41cc1932f08b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 26 May 2020 15:42:52 -0400 Subject: [PATCH 039/189] More review feedback --- cpp/CMakeLists.txt | 4 +- .../CMakeLists.txt => cmake/comms.cmake} | 2 +- cpp/include/raft/comms/comms.hpp | 8 +- cpp/include/raft/comms/helper.hpp | 104 ++++++++++++++++++ cpp/include/raft/comms/nccl_helper.hpp | 34 ------ cpp/include/raft/comms/std_comms.hpp | 28 ++--- .../raft/comms/{comms_helper.hpp => test.hpp} | 68 +----------- python/raft/common/__init__.py | 4 +- python/raft/dask/__init__.py | 2 +- python/raft/dask/common/__init__.py | 20 ++-- python/raft/dask/common/comms_utils.pyx | 4 +- python/raft/dask/common/nccl.pyx | 10 +- python/raft/test/conftest.py | 4 - 13 files changed, 149 insertions(+), 143 deletions(-) rename cpp/{include/raft/comms/CMakeLists.txt => cmake/comms.cmake} (95%) create mode 100644 cpp/include/raft/comms/helper.hpp delete mode 100644 cpp/include/raft/comms/nccl_helper.hpp rename cpp/include/raft/comms/{comms_helper.hpp => test.hpp} (77%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f032e3a7e4..a45cf8c950 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -67,6 +67,8 @@ find_package(CUDA 10.0 REQUIRED) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) +message("HELLO!") + ############################################################################## # - Compiler Options -------------------------------------------------------- @@ -161,7 +163,7 @@ set(CMAKE_CUDA_FLAGS # - dependencies ------------------------------------------------------------- include(cmake/Dependencies.cmake) -add_subdirectory(include/raft/comms) +include(cmake/comms.cmake) ############################################################################## # - include paths ------------------------------------------------------------ diff --git a/cpp/include/raft/comms/CMakeLists.txt b/cpp/cmake/comms.cmake similarity index 95% rename from cpp/include/raft/comms/CMakeLists.txt rename to cpp/cmake/comms.cmake index 734ce11812..c8496c7dc6 100644 --- a/cpp/include/raft/comms/CMakeLists.txt +++ b/cpp/cmake/comms.cmake @@ -34,4 +34,4 @@ find_package(UCX) include_directories(${UCX_INCLUDE_DIRS}) include_directories( ${NCCL_INCLUDE_DIRS} ) -list(APPEND RAFT_LINK_LIBRARIES ${NCCL_LIBRARIES}) \ No newline at end of file +list(APPEND RAFT_LINK_LIBRARIES ${NCCL_LIBRARIES}) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 42827ef361..8f675ddfe4 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include namespace raft { namespace comms { @@ -38,15 +39,14 @@ enum class op_t { SUM, PROD, MIN, MAX }; * The resulting status of distributed stream synchronization */ enum class status_t { - commStatusSuccess, // Synchronization successful - commStatusError, // An error occured querying sync status - commStatusAbort // A failure occurred in sync, queued operations aborted + SUCCESS, // Synchronization successful + ERROR, // An error 
occurred querying sync status + ABORT // A failure occurred in sync, queued operations aborted }; template <typename value_t> constexpr datatype_t get_type(); - template <> constexpr datatype_t get_type<char>() { return datatype_t::CHAR; } diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp new file mode 100644 index 0000000000..67ba4cab0a --- /dev/null +++ b/cpp/include/raft/comms/helper.hpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace comms { + +/** + * Function to construct comms_t and inject it on a handle_t. This + * is used for convenience in the Python layer. + * + * @param handle raft::handle_t for injecting the comms + * @param nccl_comm initialized NCCL communicator to use for collectives + * @param num_ranks number of ranks in communicator clique + * @param rank rank of local instance + */ +void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, + int num_ranks, int rank) { + auto d_alloc = handle->get_device_allocator(); + cudaStream_t stream = handle->get_stream(); + comms_iface *raft_comm = new raft::comms::std_comms(nccl_comm, num_ranks, rank, d_alloc, stream); + + auto communicator = + std::make_shared<comms_t>(std::unique_ptr<comms_iface>(raft_comm)); + handle->set_comms(communicator); +} + +/** + * Function to construct comms_t and inject it on a handle_t. This + * is used for convenience in the Python layer. + * + * @param handle raft::handle_t for injecting the comms + * @param nccl_comm initialized NCCL communicator to use for collectives + * @param ucp_worker of local process + * Note: This is purposefully left as void* so that the ucp_worker_h + * doesn't need to be exposed through the cython layer + * @param eps array of ucp_ep_h instances. + * Note: This is purposefully left as void* so that + * the ucp_ep_h doesn't need to be exposed through the cython layer. 
+ * @param num_ranks number of ranks in communicator clique + * @param rank rank of local instance + */ +void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, + void *ucp_worker, void *eps, int num_ranks, + int rank) { + auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); + + auto size_t_ep_arr = reinterpret_cast(eps); + + for (int i = 0; i < num_ranks; i++) { + size_t ptr = size_t_ep_arr[i]; + auto ucp_ep_v = reinterpret_cast(*eps_sp); + + if (ptr != 0) { + auto eps_ptr = reinterpret_cast(size_t_ep_arr[i]); + ucp_ep_v[i] = eps_ptr; + } else { + ucp_ep_v[i] = nullptr; + } + } + + auto d_alloc = handle->get_device_allocator(); + cudaStream_t stream = handle->get_stream(); + + auto *raft_comm = new raft::comms::std_comms( + nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, d_alloc, stream); + auto communicator = + std::make_shared(std::unique_ptr(raft_comm)); + handle->set_comms(communicator); +} + +inline void nccl_unique_id_from_char(ncclUniqueId *id, char *uniqueId, int size) { + memcpy(id->internal, uniqueId, size); +} + +inline void get_unique_id(char *uid, int size) { + ncclUniqueId id; + ncclGetUniqueId(&id); + + memcpy(uid, id.internal, size); +} +}; // namespace comms +}; // end namespace raft diff --git a/cpp/include/raft/comms/nccl_helper.hpp b/cpp/include/raft/comms/nccl_helper.hpp deleted file mode 100644 index d7a14ba8ba..0000000000 --- a/cpp/include/raft/comms/nccl_helper.hpp +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -namespace raft { -namespace comms { -inline void ncclUniqueIdFromChar(ncclUniqueId *id, char *uniqueId, int size) { - memcpy(id->internal, uniqueId, size); -} - -inline void get_unique_id(char *uid, int size) { - ncclUniqueId id; - ncclGetUniqueId(&id); - - memcpy(uid, id.internal, size); -} -} // namespace comms -} // namespace raft diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index c20bc76bf6..ef83730749 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -85,7 +85,7 @@ constexpr size_t get_datatype_size(const datatype_t datatype) { } } -ncclDataType_t get_nccl_datatype(const datatype_t datatype) { +constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) { switch (datatype) { case datatype_t::CHAR: return ncclChar; @@ -107,7 +107,7 @@ ncclDataType_t get_nccl_datatype(const datatype_t datatype) { } } -ncclRedOp_t get_nccl_op(const op_t op) { +constexpr ncclRedOp_t get_nccl_op(const op_t op) { switch (op) { case op_t::SUM: return ncclSum; @@ -134,13 +134,15 @@ class std_comms : public comms_iface { */ std_comms(ncclComm_t nccl_comm, ucp_worker_h ucp_worker, std::shared_ptr eps, int num_ranks, int rank, - const std::shared_ptr device_allocator) + const std::shared_ptr device_allocator, + cudaStream_t stream) : nccl_comm_(nccl_comm), ucp_worker_(ucp_worker), ucp_eps_(eps), num_ranks_(num_ranks), rank_(rank), device_allocator_(device_allocator), + stream_(stream), next_request_id_(0) { initialize(); }; @@ -152,24 +154,22 @@ class std_comms : public comms_iface { * @param rank rank of the current worker */ std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, - const std::shared_ptr device_allocator) + const std::shared_ptr device_allocator, + cudaStream_t stream) : nccl_comm_(nccl_comm), num_ranks_(num_ranks), rank_(rank), - device_allocator_(device_allocator) { + device_allocator_(device_allocator), + stream_(stream) { initialize(); }; virtual ~std_comms() { - CUDA_CHECK_NO_THROW(cudaStreamDestroy(stream_)); - device_allocator_->deallocate(sendbuff_, sizeof(int), stream_); device_allocator_->deallocate(recvbuff_, sizeof(int), stream_); } void initialize() { - CUDA_CHECK(cudaStreamCreate(&stream_)); - sendbuff_ = reinterpret_cast( device_allocator_->allocate(sizeof(int), stream_)); recvbuff_ = reinterpret_cast( @@ -193,7 +193,7 @@ class std_comms : public comms_iface { allreduce(sendbuff_, recvbuff_, 1, datatype_t::INT32, op_t::SUM, stream_); - ASSERT(sync_stream(stream_) == status_t::commStatusSuccess, + ASSERT(sync_stream(stream_) == status_t::SUCCESS, "ERROR: syncStream failed. 
This can be caused by a failed rank_."); } @@ -367,17 +367,17 @@ class std_comms : public comms_iface { ncclResult_t ncclErr, ncclAsyncErr; while (1) { cudaErr = cudaStreamQuery(stream); - if (cudaErr == cudaSuccess) return status_t::commStatusSuccess; + if (cudaErr == cudaSuccess) return status_t::SUCCESS; if (cudaErr != cudaErrorNotReady) { // An error occurred querying the status of the stream_ - return status_t::commStatusError; + return status_t::ERROR; } ncclErr = ncclCommGetAsyncError(nccl_comm_, &ncclAsyncErr); if (ncclErr != ncclSuccess) { // An error occurred retrieving the asynchronous error - return status_t::commStatusError; + return status_t::ERROR; } if (ncclAsyncErr != ncclSuccess) { @@ -386,7 +386,7 @@ class std_comms : public comms_iface { ncclErr = ncclCommAbort(nccl_comm_); if (ncclErr != ncclSuccess) // Caller may abort with an exception or try to re-create a new communicator. - return status_t::commStatusAbort; + return status_t::ABORT; } // Let other threads (including NCCL threads) use the CPU. diff --git a/cpp/include/raft/comms/comms_helper.hpp b/cpp/include/raft/comms/test.hpp similarity index 77% rename from cpp/include/raft/comms/comms_helper.hpp rename to cpp/include/raft/comms/test.hpp index 6e6ba942cc..a3bf81a007 100644 --- a/cpp/include/raft/comms/comms_helper.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -24,69 +24,6 @@ namespace raft { namespace comms { -/** - * Function to construct comms_t and inject it on a handle_t. This - * is used for convenience in the Python layer. - * - * @param handle raft::handle_t for injecting the comms - * @param nccl_comm initialized NCCL communicator to use for collectives - * @param num_ranks number of ranks in communicator clique - * @param rank rank of local instance - */ -void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, - int num_ranks, int rank) { - auto d_alloc = handle->get_device_allocator(); - raft::comms::comms_iface *raft_comm = - new raft::comms::std_comms(nccl_comm, num_ranks, rank, d_alloc); - std::cout << "Comms: " << raft_comm->get_size() << std::endl; - - auto communicator = - std::make_shared(std::unique_ptr(raft_comm)); - handle->set_comms(communicator); -} - -/** - * Function to construct comms_t and inject it on a handle_t. This - * is used for convenience in the Python layer. - * - * @param handle raft::handle_t for injecting the comms - * @param nccl_comm initialized NCCL communicator to use for collectives - * @param ucp_worker of local process - * Note: This is purposefully left as void* so that the ucp_worker_h - * doesn't need to be exposed through the cython layer - * @param eps array of ucp_ep_h instances. - * Note: This is purposefully left as void* so that - * the ucp_ep_h doesn't need to be exposed through the cython layer. 
- * @param num_ranks number of ranks in communicator clique - * @param rank rank of local instance - */ -void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, - void *ucp_worker, void *eps, int num_ranks, - int rank) { - auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); - - auto size_t_ep_arr = reinterpret_cast(eps); - - for (int i = 0; i < num_ranks; i++) { - size_t ptr = size_t_ep_arr[i]; - auto ucp_ep_v = reinterpret_cast(*eps_sp); - - if (ptr != 0) { - auto eps_ptr = reinterpret_cast(size_t_ep_arr[i]); - ucp_ep_v[i] = eps_ptr; - } else { - ucp_ep_v[i] = nullptr; - } - } - - auto d_alloc = handle->get_device_allocator(); - auto *raft_comm = new raft::comms::std_comms( - nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, d_alloc); - auto communicator = - std::make_shared(std::unique_ptr(raft_comm)); - handle->set_comms(communicator); -} - /** * A simple sanity check that NCCL is able to perform a collective operation * @@ -309,6 +246,5 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { return ret; } - -}; // namespace comms -}; // end namespace raft +} +}; diff --git a/python/raft/common/__init__.py b/python/raft/common/__init__.py index ac84a7a93a..b5ef2b3079 100644 --- a/python/raft/common/__init__.py +++ b/python/raft/common/__init__.py @@ -13,5 +13,5 @@ # limitations under the License. # -from raft.common.cuda import Stream -from raft.common.handle import Handle \ No newline at end of file +from .cuda import Stream +from .handle import Handle \ No newline at end of file diff --git a/python/raft/dask/__init__.py b/python/raft/dask/__init__.py index e58ac25f47..74231d256f 100644 --- a/python/raft/dask/__init__.py +++ b/python/raft/dask/__init__.py @@ -13,4 +13,4 @@ # limitations under the License. # -from raft.dask.common.comms import Comms \ No newline at end of file +from .common.comms import Comms \ No newline at end of file diff --git a/python/raft/dask/common/__init__.py b/python/raft/dask/common/__init__.py index 37981b70fb..1aed648e4d 100644 --- a/python/raft/dask/common/__init__.py +++ b/python/raft/dask/common/__init__.py @@ -13,14 +13,14 @@ # limitations under the License. 
# -from raft.dask.common.comms import Comms -from raft.dask.common.comms import local_handle +from .comms import Comms +from .comms import local_handle -from raft.dask.common.comms_utils import inject_comms_on_handle -from raft.dask.common.comms_utils import inject_comms_on_handle_coll_only -from raft.dask.common.comms_utils import perform_test_comms_allreduce -from raft.dask.common.comms_utils import perform_test_comms_send_recv -from raft.dask.common.comms_utils import perform_test_comms_allgather -from raft.dask.common.comms_utils import perform_test_comms_bcast -from raft.dask.common.comms_utils import perform_test_comms_reduce -from raft.dask.common.comms_utils import perform_test_comms_reducescatter +from .comms_utils import inject_comms_on_handle +from .comms_utils import inject_comms_on_handle_coll_only +from .comms_utils import perform_test_comms_allreduce +from .comms_utils import perform_test_comms_send_recv +from .comms_utils import perform_test_comms_allgather +from .comms_utils import perform_test_comms_bcast +from .comms_utils import perform_test_comms_reduce +from .comms_utils import perform_test_comms_reducescatter diff --git a/python/raft/dask/common/comms_utils.pyx b/python/raft/dask/common/comms_utils.pyx index 8272446529..cc62fc66ca 100644 --- a/python/raft/dask/common/comms_utils.pyx +++ b/python/raft/dask/common/comms_utils.pyx @@ -41,7 +41,7 @@ cdef extern from "raft/comms/std_comms.hpp" namespace "raft::comms": cdef cppclass std_comms: pass -cdef extern from "raft/comms/comms_helper.hpp" namespace "raft::comms": +cdef extern from "raft/comms/helper.hpp" namespace "raft::comms": void build_comms_nccl_ucx(handle_t *handle, ncclComm_t comm, @@ -55,6 +55,8 @@ cdef extern from "raft/comms/comms_helper.hpp" namespace "raft::comms": int size, int rank) except + +cdef extern from "raft/comms/test.hpp" namespace "raft::comms": + bool test_collective_allreduce(const handle_t &h, int root) except + bool test_collective_broadcast(const handle_t &h, int root) except + bool test_collective_reduce(const handle_t &h, int root) except + diff --git a/python/raft/dask/common/nccl.pyx b/python/raft/dask/common/nccl.pyx index b72bd3d80b..d55a0e4c42 100644 --- a/python/raft/dask/common/nccl.pyx +++ b/python/raft/dask/common/nccl.pyx @@ -25,11 +25,11 @@ from cython.operator cimport dereference as deref from libcpp cimport bool from libc.stdlib cimport malloc, free -cdef extern from "raft/comms/nccl_helper.hpp" namespace "raft::comms": +cdef extern from "raft/comms/helper.hpp" namespace "raft::comms": void get_unique_id(char *uid, int size) except + - void ncclUniqueIdFromChar(ncclUniqueId *id, - char *uniqueId, - int size) except + + void nccl_unique_id_from_char(ncclUniqueId *id, + char *uniqueId, + int size) except + cdef extern from "nccl.h": @@ -132,7 +132,7 @@ cdef class nccl: self.rank = rank cdef ncclUniqueId *ident = malloc(sizeof(ncclUniqueId)) - ncclUniqueIdFromChar(ident, commId, NCCL_UNIQUE_ID_BYTES) + nccl_unique_id_from_char(ident, commId, NCCL_UNIQUE_ID_BYTES) comm_ = self.comm diff --git a/python/raft/test/conftest.py b/python/raft/test/conftest.py index 83ed6b5d83..68ad9b434f 100644 --- a/python/raft/test/conftest.py +++ b/python/raft/test/conftest.py @@ -12,13 +12,9 @@ @pytest.fixture(scope="module") def cluster(): - - print("Starting cluster") cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0) yield cluster - print("Closing cluster") cluster.close() - print("Closed cluster") @pytest.fixture(scope="module") From 28f8101eddb083d017d4a11cae3d5f4d6a1a5f9a Mon Sep 
17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 26 May 2020 15:43:23 -0400 Subject: [PATCH 040/189] Running cpp style check --- cpp/include/raft/comms/comms.hpp | 4 +--- cpp/include/raft/comms/helper.hpp | 13 ++++++++----- cpp/include/raft/comms/test.hpp | 4 ++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 8f675ddfe4..367b04f240 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -16,8 +16,8 @@ #pragma once -#include #include +#include namespace raft { namespace comms { @@ -87,7 +87,6 @@ constexpr datatype_t get_type() { return datatype_t::FLOAT64; } - class comms_iface { public: virtual ~comms_iface(); @@ -326,6 +325,5 @@ class comms_t { comms_iface::~comms_iface() {} - } // namespace comms } // namespace raft diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index 67ba4cab0a..4809c78825 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -16,9 +16,9 @@ #pragma once -#include #include #include +#include #include #include #include @@ -39,7 +39,8 @@ void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, int num_ranks, int rank) { auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); - comms_iface *raft_comm = new raft::comms::std_comms(nccl_comm, num_ranks, rank, d_alloc, stream); + comms_iface *raft_comm = + new raft::comms::std_comms(nccl_comm, num_ranks, rank, d_alloc, stream); auto communicator = std::make_shared(std::unique_ptr(raft_comm)); @@ -83,14 +84,16 @@ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); - auto *raft_comm = new raft::comms::std_comms( - nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, d_alloc, stream); + auto *raft_comm = + new raft::comms::std_comms(nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, + num_ranks, rank, d_alloc, stream); auto communicator = std::make_shared(std::unique_ptr(raft_comm)); handle->set_comms(communicator); } -inline void nccl_unique_id_from_char(ncclUniqueId *id, char *uniqueId, int size) { +inline void nccl_unique_id_from_char(ncclUniqueId *id, char *uniqueId, + int size) { memcpy(id->internal, uniqueId, size); } diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp index a3bf81a007..10d4b2c195 100644 --- a/cpp/include/raft/comms/test.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -246,5 +246,5 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { return ret; } -} -}; +} // namespace comms +}; // namespace raft From 1a18553a4cf06be14d05bd96c664a3e3ee71106d Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 27 May 2020 10:54:29 -0400 Subject: [PATCH 041/189] Nccl red op --- cpp/include/raft/comms/std_comms.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index ef83730749..4a42276a8f 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -82,6 +82,8 @@ constexpr size_t get_datatype_size(const datatype_t datatype) { return sizeof(float); case datatype_t::FLOAT64: return sizeof(double); + default: + return -1; } } @@ -104,6 +106,8 @@ constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) { return ncclFloat; case datatype_t::FLOAT64: return ncclDouble; + default: + return -1; } } @@ -117,6 +121,8 @@ constexpr ncclRedOp_t get_nccl_op(const op_t op) { return ncclMin; case op_t::MAX: return ncclMax; + default: + return -1; } } From a22dd6237299345153b870cd1febe360ed3a8cd5 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 27 May 2020 11:57:19 -0400 Subject: [PATCH 042/189] Raising an exception to get around gcc issue --- cpp/include/raft/comms/std_comms.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 4a42276a8f..4fe55bff3c 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -83,7 +83,7 @@ constexpr size_t get_datatype_size(const datatype_t datatype) { case datatype_t::FLOAT64: return sizeof(double); default: - return -1; + throw "Unsupported"; } } @@ -107,7 +107,7 @@ constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) { case datatype_t::FLOAT64: return ncclDouble; default: - return -1; + throw "Unsupported"; } } @@ -122,7 +122,7 @@ constexpr ncclRedOp_t get_nccl_op(const op_t op) { case op_t::MAX: return ncclMax; default: - return -1; + throw "Unsupported"; } } From fbd12aa835681fd18d1c8f0e0c2c085a03464c64 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 27 May 2020 12:18:34 -0400 Subject: [PATCH 043/189] Using static for functions for now --- cpp/include/raft/comms/std_comms.hpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 4fe55bff3c..7b3a8bfbca 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -64,7 +64,11 @@ namespace raft { namespace comms { -constexpr size_t get_datatype_size(const datatype_t datatype) { + +static size_t get_datatype_size(const datatype_t datatype) { + + size_t ret = -1; + switch (datatype) { case datatype_t::CHAR: return sizeof(char); @@ -87,7 +91,7 @@ constexpr size_t get_datatype_size(const datatype_t datatype) { } } -constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) { +static ncclDataType_t get_nccl_datatype(const datatype_t datatype) { switch (datatype) { case datatype_t::CHAR: return ncclChar; @@ -111,7 +115,7 @@ constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) { } } -constexpr ncclRedOp_t get_nccl_op(const op_t op) { +static ncclRedOp_t get_nccl_op(const op_t op) { switch (op) { case op_t::SUM: return ncclSum; From cc71ccb660211378052f8e4ba446fbfbed62c90e Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 27 May 2020 12:22:35 -0400 Subject: [PATCH 044/189] Fixing style --- cpp/include/raft/comms/std_comms.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 7b3a8bfbca..1ba7552f9c 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -64,9 +64,7 @@ namespace raft { namespace comms { - static size_t get_datatype_size(const datatype_t datatype) { - size_t ret = -1; switch (datatype) { From b612c0bf4031ac72835721bd165324233b086f02 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Wed, 27 May 2020 14:42:43 -0500 Subject: [PATCH 045/189] DOC Update BUILD.md --- BUILD.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/BUILD.md b/BUILD.md index e4f1dd4aa9..a8d22f18d9 100644 --- a/BUILD.md +++ b/BUILD.md @@ -140,9 +140,7 @@ Using as an example developer working on cuML and RAFT, we recommend the followi This will facilitate development, and the `RAFT_PATH` variable will make it so that the downstream repository, in this case cuML, builds using the locally cloned RAFT (as descrbed in the first step). -### Submitting PRs - -If you are submitting changes to RAFT itself, without changing downstream repos, you can use the config file located in `ci/prtest.config` to trigger RAFT's CI to run tests of downstream repositories. +### Submitting PRs Guidelines If you have changes to both RAFT and at least one downstream repo, then: From dc123b2d59b938d33da86d245e1a4194bdba4fd7 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 28 May 2020 12:23:21 -0400 Subject: [PATCH 046/189] Fixing more relative imports --- python/raft/dask/common/comms.py | 6 +++--- python/raft/test/test_comms.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/raft/dask/common/comms.py b/python/raft/dask/common/comms.py index 4ac47e25a6..b49cb8d7b9 100644 --- a/python/raft/dask/common/comms.py +++ b/python/raft/dask/common/comms.py @@ -13,14 +13,14 @@ # limitations under the License. # -from raft.dask.common.nccl import nccl -from raft.dask.common.ucx import UCX +from .nccl import nccl +from .ucx import UCX from .comms_utils import inject_comms_on_handle from .comms_utils import inject_comms_on_handle_coll_only from .utils import parse_host_port -from raft.common.handle import Handle +from ...common.handle import Handle from dask.distributed import get_worker, default_client diff --git a/python/raft/test/test_comms.py b/python/raft/test/test_comms.py index b065d098af..56da5ad937 100644 --- a/python/raft/test/test_comms.py +++ b/python/raft/test/test_comms.py @@ -18,7 +18,7 @@ from dask.distributed import Client from dask.distributed import wait -from raft.dask.common import Comms +from raft.dask import Comms from raft.dask.common import local_handle from raft.dask.common import perform_test_comms_send_recv from raft.dask.common import perform_test_comms_allreduce From 5efa9be09b848c026d0e680984fdf395038e9120 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 28 May 2020 12:15:24 -0500 Subject: [PATCH 047/189] Spectral partition header. 
--- cpp/include/raft/spectral/partition.hpp | 82 +++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 cpp/include/raft/spectral/partition.hpp diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp new file mode 100644 index 0000000000..747ce510da --- /dev/null +++ b/cpp/include/raft/spectral/partition.hpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace raft { + + /// Spectral graph partition +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nParts Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param parts (Output, device memory, n entries) Partition + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return error flag. + */ + template <typename vertex_t, typename edge_t, typename weight_t, + template <typename, typename, typename> typename GraphView> + int partition(GraphView<vertex_t, edge_t, weight_t> const &graph, + vertex_t nParts, + vertex_t nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + weight_t tol_lanczos, + int maxIter_kmeans, + weight_t tol_kmeans, + vertex_t * __restrict__ parts, + weight_t *eigVals, + weight_t *eig_vects); + + /// Compute cost function for partition + /** This function determines the edges cut by a partition and a cost + * function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * Graph is assumed to be weighted and undirected. + * + * @param G Weighted graph in CSR format + * @param nParts Number of partitions. + * @param parts (Input, device memory, n entries) Partition + * assignments. + * @param edgeCut On exit, weight of edges cut by partition. + * @param cost On exit, partition cost function. + * @return error flag. + */ + template <typename vertex_t, typename edge_t, typename weight_t, + template <typename, typename, typename> typename GraphView> + int analyzePartition(GraphView<vertex_t, edge_t, weight_t> const &graph, + vertex_t nParts, + vertex_t const* __restrict__ parts, + weight_t& edgeCut, weight_t & cost); + +} From f405fee0abaa68679cae06e041fec253c68283cc Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 28 May 2020 12:32:33 -0500 Subject: [PATCH 048/189] Updated CHANGELOG.md.
--- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9490099450..d9a391369f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ ## New Features - Initial RAFT version - PR #3: defining raft::handle_t, device_buffer, host_buffer, allocator classes +- PR #12: Spectral Clustering ## Bug Fixes - PR #5: Small build.sh fixes From 739986469a9b91d10275f3e0d920d7f5abcfa2d7 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Tue, 2 Jun 2020 10:08:46 -0700 Subject: [PATCH 049/189] add RMM_INCLUDE and RMM_LIBRARY options to allow linking to non-conda RMM --- cpp/CMakeLists.txt | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 947d0318cb..5e7f263b2a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -162,6 +162,23 @@ set(CMAKE_CUDA_FLAGS include(cmake/Dependencies.cmake) +################################################################################################### +# - RMM ------------------------------------------------------------------------------------------- + +find_path(RMM_INCLUDE "rmm" + HINTS "$ENV{RMM_ROOT}/include") + +find_library(RMM_LIBRARY "rmm" + HINTS "$ENV{RMM_ROOT}/lib" "$ENV{RMM_ROOT}/build") + +message(STATUS "RMM: RMM_LIBRARY set to ${RMM_LIBRARY}") +message(STATUS "RMM: RMM_INCLUDE set to ${RMM_INCLUDE}") + +add_library(rmm SHARED IMPORTED ${RMM_LIBRARY}) +if(RMM_INCLUDE AND RMM_LIBRARY) + set_target_properties(rmm PROPERTIES IMPORTED_LOCATION ${RMM_LIBRARY}) +endif(RMM_INCLUDE AND RMM_LIBRARY) + ############################################################################## # - include paths ------------------------------------------------------------ @@ -170,10 +187,11 @@ set(RAFT_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include CACHE STRING set(RAFT_INCLUDE_DIRECTORIES ${RAFT_INCLUDE_DIR} - ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} + "${RMM_INCLUDE}") if(DEFINED ENV{CONDA_PREFIX}) - message(STATUS "Using RMM installation froM $ENV{CONDA_PREFIX}") + message(STATUS "Using RMM installation from $ENV{CONDA_PREFIX}") list(APPEND RAFT_INCLUDE_DIRECTORIES $ENV{CONDA_PREFIX}/include) endif(DEFINED ENV{CONDA_PREFIX}) @@ -187,7 +205,7 @@ set(RAFT_LINK_LIBRARIES ${CUDA_cusparse_LIBRARY} rmm) -set(RAFT_LINK_DIRECTORIES "") +set(RAFT_LINK_DIRECTORIES "${RMM_LIBRARY}") if(DEFINED ENV{CONDA_PREFIX}) list(APPEND RAFT_LINK_DIRECTORIES $ENV{CONDA_PREFIX}/lib) From 7e99a33e1d5bc71f52c62ae52356c44e6b8b1ad6 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Tue, 2 Jun 2020 10:22:45 -0700 Subject: [PATCH 050/189] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9490099450..cdd0bb7eca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## New Features ## Improvements +- PR #13: Add RMM_INCLUDE and RMM_LIBRARY options to allow linking to non-conda RMM ## Bug Fixes From c5bf5389200eca08d03effb488f67e9071008673 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 2 Jun 2020 15:50:10 -0500 Subject: [PATCH 051/189] Adding the main functionality. 
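As a quick reference, the simplest entry point added here can be driven as in the following sketch. `d_obs` and `d_codes` are hypothetical device pointers (the observation matrix is column-major, d x n); the overload that allocates its own workspaces is the one exercised:

    // Assign n d-dimensional device-resident points to k clusters.
    float residual = 0;
    int iters = 0;
    NVGRAPH_ERROR status = nvgraph::kmeans(n, d, k,
                                           /*tol=*/1e-2f, /*maxiter=*/100,
                                           d_obs, d_codes, residual, iters);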
--- cpp/include/raft/spectral/kmeans.hpp | 935 +++++++++++ cpp/include/raft/spectral/lanczos.hpp | 1487 +++++++++++++++++ .../raft/spectral/modularity_maximization.hpp | 436 +++++ cpp/include/raft/spectral/partition.hpp | 472 +++++- cpp/include/raft/spectral/spectral_matrix.hpp | 1185 +++++++++++++ 5 files changed, 4450 insertions(+), 65 deletions(-) create mode 100644 cpp/include/raft/spectral/kmeans.hpp create mode 100644 cpp/include/raft/spectral/lanczos.hpp create mode 100644 cpp/include/raft/spectral/modularity_maximization.hpp create mode 100644 cpp/include/raft/spectral/spectral_matrix.hpp diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp new file mode 100644 index 0000000000..691df3e5ce --- /dev/null +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -0,0 +1,935 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//#ifdef NVGRAPH_PARTITION +//#ifdef DEBUG + +#include "include/kmeans.hxx" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "include/atomics.hxx" +#include "include/debug_macros.h" +#include "include/nvgraph_cublas.hxx" +#include "include/nvgraph_vector.hxx" +#include "include/sm_utils.h" + +using namespace nvgraph; + +// ========================================================= +// Useful macros +// ========================================================= + +#define BLOCK_SIZE 1024 +#define WARP_SIZE 32 +#define BSIZE_DIV_WSIZE (BLOCK_SIZE / WARP_SIZE) + +// Get index of matrix entry +#define IDX(i, j, lda) ((i) + (j) * (lda)) + +namespace { + +// ========================================================= +// CUDA kernels +// ========================================================= + +/// Compute distances between observation vectors and centroids +/** Block dimensions should be (warpSize, 1, + * blockSize/warpSize). Ideally, the grid is large enough so there + * are d threads in the x-direction, k threads in the y-direction, + * and n threads in the z-direction. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, d*n entries) Observation matrix. Matrix is + * stored column-major and each column is an observation + * vector. Matrix dimensions are d x n. + * @param centroids (Input, d*k entries) Centroid matrix. Matrix is + * stored column-major and each column is a centroid. Matrix + * dimensions are d x k. + * @param dists (Output, n*k entries) Distance matrix. Matrix is + * stored column-major and the (i,j)-entry is the square of the + * Euclidean distance between the ith observation vector and jth + * centroid. Matrix dimensions are n x k. Entries must be + * initialized to zero. 
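+ * + * A launch configuration consistent with these constraints (the same one + * assignCentroids() uses later in this file) looks like: + * + * dim3 blockDim(WARP_SIZE, 1, BSIZE_DIV_WSIZE); + * dim3 gridDim(min((d + WARP_SIZE - 1) / WARP_SIZE, 65535), + * min(k, 65535), + * min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535)); + * computeDistances<<<gridDim, blockDim>>>(n, d, k, obs, centroids, dists);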
+ */ +template +static __global__ void computeDistances(IndexType_ n, + IndexType_ d, + IndexType_ k, + const ValueType_* __restrict__ obs, + const ValueType_* __restrict__ centroids, + ValueType_* __restrict__ dists) +{ + // Loop index + IndexType_ i; + + // Block indices + IndexType_ bidx; + // Global indices + IndexType_ gidx, gidy, gidz; + + // Private memory + ValueType_ centroid_private, dist_private; + + // Global x-index indicates index of vector entry + bidx = blockIdx.x; + while (bidx * blockDim.x < d) { + gidx = threadIdx.x + bidx * blockDim.x; + + // Global y-index indicates centroid + gidy = threadIdx.y + blockIdx.y * blockDim.y; + while (gidy < k) { + // Load centroid coordinate from global memory + centroid_private = (gidx < d) ? centroids[IDX(gidx, gidy, d)] : 0; + + // Global z-index indicates observation vector + gidz = threadIdx.z + blockIdx.z * blockDim.z; + while (gidz < n) { + // Load observation vector coordinate from global memory + dist_private = (gidx < d) ? obs[IDX(gidx, gidz, d)] : 0; + + // Compute contribution of current entry to distance + dist_private = centroid_private - dist_private; + dist_private = dist_private * dist_private; + + // Perform reduction on warp + for (i = WARP_SIZE / 2; i > 0; i /= 2) + dist_private += utils::shfl_down(dist_private, i, 2 * i); + + // Write result to global memory + if (threadIdx.x == 0) atomicFPAdd(dists + IDX(gidz, gidy, n), dist_private); + + // Move to another observation vector + gidz += blockDim.z * gridDim.z; + } + + // Move to another centroid + gidy += blockDim.y * gridDim.y; + } + + // Move to another vector entry + bidx += gridDim.x; + } +} + +/// Find closest centroid to observation vectors +/** Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * + * @param n Number of observation vectors. + * @param k Number of clusters. + * @param centroids (Input, d*k entries) Centroid matrix. Matrix is + * stored column-major and each column is a centroid. Matrix + * dimensions are d x k. + * @param dists (Input/output, n*k entries) Distance matrix. Matrix + * is stored column-major and the (i,j)-entry is the square of + * the Euclidean distance between the ith observation vector and + * jth centroid. Matrix dimensions are n x k. On exit, the first + * n entries give the square of the Euclidean distance between + * observation vectors and closest centroids. + * @param codes (Output, n entries) Cluster assignments. + * @param clusterSizes (Output, k entries) Number of points in each + * cluster. Entries must be initialized to zero. + */ +template +static __global__ void minDistances(IndexType_ n, + IndexType_ k, + ValueType_* __restrict__ dists, + IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes) +{ + // Loop index + IndexType_ i, j; + + // Current matrix entry + ValueType_ dist_curr; + + // Smallest entry in row + ValueType_ dist_min; + IndexType_ code_min; + + // Each row in observation matrix is processed by a thread + i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + // Find minimum entry in row + code_min = 0; + dist_min = dists[IDX(i, 0, n)]; + for (j = 1; j < k; ++j) { + dist_curr = dists[IDX(i, j, n)]; + code_min = (dist_curr < dist_min) ? j : code_min; + dist_min = (dist_curr < dist_min) ? 
dist_curr : dist_min; + } + + // Transfer result to global memory + dists[i] = dist_min; + codes[i] = code_min; + + // Increment cluster sizes + atomicAdd(clusterSizes + code_min, 1); + + // Move to another row + i += blockDim.x * gridDim.x; + } +} + +/// Check if newly computed distances are smaller than old distances +/** Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * + * @param n Number of observation vectors. + * @param dists_old (Input/output, n entries) Distances between + * observation vectors and closest centroids. On exit, entries + * are replaced by entries in 'dists_new' if the corresponding + * observation vectors are closest to the new centroid. + * @param dists_new (Input, n entries) Distance between observation + * vectors and new centroid. + * @param codes_old (Input/output, n entries) Cluster + * assignments. On exit, entries are replaced with 'code_new' if + * the corresponding observation vectors are closest to the new + * centroid. + * @param code_new Index associated with new centroid. + */ +template +static __global__ void minDistances2(IndexType_ n, + ValueType_* __restrict__ dists_old, + const ValueType_* __restrict__ dists_new, + IndexType_* __restrict__ codes_old, + IndexType_ code_new) +{ + // Loop index + IndexType_ i; + + // Distances + ValueType_ dist_old_private; + ValueType_ dist_new_private; + + // Each row is processed by a thread + i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + // Get old and new distances + dist_old_private = dists_old[i]; + dist_new_private = dists_new[i]; + + // Update if new distance is smaller than old distance + if (dist_new_private < dist_old_private) { + dists_old[i] = dist_new_private; + codes_old[i] = code_new; + } + + // Move to another row + i += blockDim.x * gridDim.x; + } +} + +/// Compute size of k-means clusters +/** Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * + * @param n Number of observation vectors. + * @param k Number of clusters. + * @param codes (Input, n entries) Cluster assignments. + * @param clusterSizes (Output, k entries) Number of points in each + * cluster. Entries must be initialized to zero. + */ +template +static __global__ void computeClusterSizes(IndexType_ n, + IndexType_ k, + const IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes) +{ + IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + atomicAdd(clusterSizes + codes[i], 1); + i += blockDim.x * gridDim.x; + } +} + +/// Divide rows of centroid matrix by cluster sizes +/** Divides the ith column of the sum matrix by the size of the ith + * cluster. If the sum matrix has been initialized so that the ith + * row is the sum of all observation vectors in the ith cluster, + * this kernel produces cluster centroids. The grid and block + * dimensions should be 2-dimensional. Ideally the grid is large + * enough so there are d threads in the x-direction and k threads + * in the y-direction. + * + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param clusterSizes (Input, k entries) Number of points in each + * cluster. + * @param centroids (Input/output, d*k entries) Sum matrix. Matrix + * is stored column-major and matrix dimensions are d x k. The + * ith column is the sum of all observation vectors in the ith + * cluster. On exit, the matrix is the centroid matrix (each + * column is the mean position of a cluster). 
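+ * + * In other words, on exit column j of the matrix equals the coordinate-wise + * mean of the observation vectors assigned to cluster j.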
+ */ +template +static __global__ void divideCentroids(IndexType_ d, + IndexType_ k, + const IndexType_* __restrict__ clusterSizes, + ValueType_* __restrict__ centroids) +{ + // Global indices + IndexType_ gidx, gidy; + + // Current cluster size + IndexType_ clusterSize_private; + + // Observation vector is determined by global y-index + gidy = threadIdx.y + blockIdx.y * blockDim.y; + while (gidy < k) { + // Get cluster size from global memory + clusterSize_private = clusterSizes[gidy]; + + // Add vector entries to centroid matrix + // Vector entris are determined by global x-index + gidx = threadIdx.x + blockIdx.x * blockDim.x; + while (gidx < d) { + centroids[IDX(gidx, gidy, d)] /= clusterSize_private; + gidx += blockDim.x * gridDim.x; + } + + // Move to another centroid + gidy += blockDim.y * gridDim.y; + } +} + +// ========================================================= +// Helper functions +// ========================================================= + +/// Randomly choose new centroids +/** Centroid is randomly chosen with k-means++ algorithm. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param rand Random number drawn uniformly from [0,1). + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are n x d. + * @param dists (Input, device memory, 2*n entries) Workspace. The + * first n entries should be the distance between observation + * vectors and the closest centroid. + * @param centroid (Output, device memory, d entries) Centroid + * coordinates. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int chooseNewCentroid(IndexType_ n, + IndexType_ d, + IndexType_ k, + ValueType_ rand, + const ValueType_* __restrict__ obs, + ValueType_* __restrict__ dists, + ValueType_* __restrict__ centroid) +{ + using namespace thrust; + + // Cumulative sum of distances + ValueType_* distsCumSum = dists + n; + // Residual sum of squares + ValueType_ distsSum; + // Observation vector that is chosen as new centroid + IndexType_ obsIndex; + + // Compute cumulative sum of distances + inclusive_scan( + device_pointer_cast(dists), device_pointer_cast(dists + n), device_pointer_cast(distsCumSum)); + cudaCheckError(); + CHECK_CUDA( + cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(ValueType_), cudaMemcpyDeviceToHost)); + + // Randomly choose observation vector + // Probabilities are proportional to square of distance to closest + // centroid (see k-means++ algorithm) + obsIndex = + (lower_bound( + device_pointer_cast(distsCumSum), device_pointer_cast(distsCumSum + n), distsSum * rand) - + device_pointer_cast(distsCumSum)); + cudaCheckError(); + obsIndex = max(obsIndex, 0); + obsIndex = min(obsIndex, n - 1); + + // Record new centroid position + CHECK_CUDA(cudaMemcpyAsync( + centroid, obs + IDX(0, obsIndex, d), d * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + + return 0; +} + +/// Choose initial cluster centroids for k-means algorithm +/** Centroids are randomly chosen with k-means++ algorithm + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. 
Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param dists (Output, device memory, 2*n entries) Workspace. On + * exit, the first n entries give the square of the Euclidean + * distance between observation vectors and the closest centroid. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int initializeCentroids(IndexType_ n, + IndexType_ d, + IndexType_ k, + const ValueType_* __restrict__ obs, + ValueType_* __restrict__ centroids, + IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes, + ValueType_* __restrict__ dists) +{ + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Loop index + IndexType_ i; + + // CUDA grid dimensions + dim3 blockDim_warp, gridDim_warp, gridDim_block; + + // Random number generator + thrust::default_random_engine rng(123456); + thrust::uniform_real_distribution uniformDist(0, 1); + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Initialize grid dimensions + blockDim_warp.x = WARP_SIZE; + blockDim_warp.y = 1; + blockDim_warp.z = BSIZE_DIV_WSIZE; + gridDim_warp.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim_warp.y = 1; + gridDim_warp.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); + gridDim_block.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); + gridDim_block.y = 1; + gridDim_block.z = 1; + + // Assign observation vectors to code 0 + CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_))); + + // Choose first centroid + thrust::fill(thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); + cudaCheckError(); + if (chooseNewCentroid(n, d, k, uniformDist(rng), obs, dists, centroids)) + WARNING("error in k-means++ (could not pick centroid)"); + + // Compute distances from first centroid + CHECK_CUDA(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_))); + computeDistances<<>>(n, d, 1, obs, centroids, dists); + cudaCheckError() + + // Choose remaining centroids + for (i = 1; i < k; ++i) + { + // Choose ith centroid + if (chooseNewCentroid(n, d, k, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) + WARNING("error in k-means++ (could not pick centroid)"); + + // Compute distances from ith centroid + CHECK_CUDA(cudaMemsetAsync(dists + n, 0, n * sizeof(ValueType_))); + computeDistances<<>>( + n, d, 1, obs, centroids + IDX(0, i, d), dists + n); + cudaCheckError(); + + // Recompute minimum distances + minDistances2<<>>(n, dists, dists + n, codes, i); + cudaCheckError(); + } + + // Compute cluster sizes + CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_))); + computeClusterSizes<<>>(n, k, codes, clusterSizes); + cudaCheckError(); + + return 0; +} + +/// Find cluster centroids closest to observation vectors +/** Distance is measured with Euclidean norm. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param centroids (Input, device memory, d*k entries) Centroid + * matrix. 
Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param dists (Output, device memory, n*k entries) Workspace. On + * exit, the first n entries give the square of the Euclidean + * distance between observation vectors and the closest centroid. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param residual_host (Output, host memory, 1 entry) Residual sum + * of squares of assignment. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int assignCentroids(IndexType_ n, + IndexType_ d, + IndexType_ k, + const ValueType_* __restrict__ obs, + const ValueType_* __restrict__ centroids, + ValueType_* __restrict__ dists, + IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes, + ValueType_* residual_host) +{ + // CUDA grid dimensions + dim3 blockDim, gridDim; + + // Compute distance between centroids and observation vectors + CHECK_CUDA(cudaMemsetAsync(dists, 0, n * k * sizeof(ValueType_))); + blockDim.x = WARP_SIZE; + blockDim.y = 1; + blockDim.z = BLOCK_SIZE / WARP_SIZE; + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim.y = min(k, 65535); + gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); + computeDistances<<>>(n, d, k, obs, centroids, dists); + cudaCheckError(); + + // Find centroid closest to each observation vector + CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_))); + blockDim.x = BLOCK_SIZE; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); + gridDim.y = 1; + gridDim.z = 1; + minDistances<<>>(n, k, dists, codes, clusterSizes); + cudaCheckError(); + + // Compute residual sum of squares + *residual_host = + thrust::reduce(thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); + + return 0; +} + +/// Update cluster centroids for k-means algorithm +/** All clusters are assumed to be non-empty. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Input, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Input, device memory, k entries) Number of + * points in each cluster. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param work (Output, device memory, n*d entries) Workspace. + * @param work_int (Output, device memory, 2*d*n entries) + * Workspace. + * @return Zero if successful. Otherwise non-zero. 
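+ * + * Implementation outline (see the body below): the observation matrix is + * transposed with cublas geam, each matrix entry is keyed by its cluster via + * sequence/transform/gather, entries are sorted by cluster with + * stable_sort_by_key, the per-cluster sums of each coordinate are accumulated + * with reduce_by_key, and the sums are divided by the cluster sizes on device.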
+ */ +template <typename IndexType_, typename ValueType_> +static int updateCentroids(IndexType_ n, + IndexType_ d, + IndexType_ k, + const ValueType_* __restrict__ obs, + const IndexType_* __restrict__ codes, + const IndexType_* __restrict__ clusterSizes, + ValueType_* __restrict__ centroids, + ValueType_* __restrict__ work, + IndexType_* __restrict__ work_int) +{ + using namespace thrust; + + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Useful constants + const ValueType_ one = 1; + const ValueType_ zero = 0; + + // CUDA grid dimensions + dim3 blockDim, gridDim; + + // Device memory + device_ptr<ValueType_> obs_copy(work); + device_ptr<IndexType_> codes_copy(work_int); + device_ptr<IndexType_> rows(work_int + d * n); + + // Take transpose of observation matrix + Cublas::geam( + true, false, n, d, &one, obs, d, &zero, (ValueType_*)NULL, n, raw_pointer_cast(obs_copy), n); + + // Cluster assigned to each observation matrix entry + sequence(rows, rows + d * n); + cudaCheckError(); + transform(rows, rows + d * n, make_constant_iterator(n), rows, modulus<IndexType_>()); + cudaCheckError(); + gather(rows, rows + d * n, device_pointer_cast(codes), codes_copy); + cudaCheckError(); + + // Row associated with each observation matrix entry + sequence(rows, rows + d * n); + cudaCheckError(); + transform(rows, rows + d * n, make_constant_iterator(n), rows, divides<IndexType_>()); + cudaCheckError(); + + // Sort and reduce to add observation vectors in same cluster + stable_sort_by_key(codes_copy, codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); + cudaCheckError(); + reduce_by_key(rows, + rows + d * n, + obs_copy, + codes_copy, // Output to codes_copy is ignored + device_pointer_cast(centroids)); + cudaCheckError(); + + // Divide sums by cluster size to get centroid matrix + blockDim.x = WARP_SIZE; + blockDim.y = BLOCK_SIZE / WARP_SIZE; + blockDim.z = 1; + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim.y = min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); + gridDim.z = 1; + divideCentroids<<<gridDim, blockDim>>>(d, k, clusterSizes, centroids); + cudaCheckError(); + + return 0; +} + +} // namespace + +namespace nvgraph { + +// ========================================================= +// k-means algorithm +// ========================================================= + +/// Find clusters with k-means algorithm +/** Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param tol Tolerance for convergence. k-means stops when the + * change in residual divided by n is less than tol. + * @param maxiter Maximum number of k-means iterations. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param work (Output, device memory, n*max(k,d) entries) + * Workspace. + * @param work_int (Output, device memory, 2*d*n entries) + * Workspace.
+ * @param residual_host (Output, host memory, 1 entry) Residual sum + * of squares (sum of squares of distances between observation + * vectors and centroids). + * @param iters_host (Output, host memory, 1 entry) Number of + * k-means iterations. + * @return NVGRAPH error flag. + */ +template +NVGRAPH_ERROR kmeans(IndexType_ n, + IndexType_ d, + IndexType_ k, + ValueType_ tol, + IndexType_ maxiter, + const ValueType_* __restrict__ obs, + IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes, + ValueType_* __restrict__ centroids, + ValueType_* __restrict__ work, + IndexType_* __restrict__ work_int, + ValueType_* residual_host, + IndexType_* iters_host) +{ + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Current iteration + IndexType_ iter; + + // Residual sum of squares at previous iteration + ValueType_ residualPrev = 0; + + // Random number generator + thrust::default_random_engine rng(123456); + thrust::uniform_real_distribution uniformDist(0, 1); + + // ------------------------------------------------------- + // Initialization + // ------------------------------------------------------- + + // Check that parameters are valid + if (n < 1) { + WARNING("invalid parameter (n<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (d < 1) { + WARNING("invalid parameter (d<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (k < 1) { + WARNING("invalid parameter (k<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (tol < 0) { + WARNING("invalid parameter (tol<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (maxiter < 0) { + WARNING("invalid parameter (maxiter<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + + // Trivial cases + if (k == 1) { + CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_))); + CHECK_CUDA(cudaMemcpyAsync(clusterSizes, &n, sizeof(IndexType_), cudaMemcpyHostToDevice)); + if (updateCentroids(n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + WARNING("could not compute k-means centroids"); + dim3 blockDim, gridDim; + blockDim.x = WARP_SIZE; + blockDim.y = 1; + blockDim.z = BLOCK_SIZE / WARP_SIZE; + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim.y = 1; + gridDim.z = min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535); + CHECK_CUDA(cudaMemsetAsync(work, 0, n * k * sizeof(ValueType_))); + computeDistances<<>>(n, d, 1, obs, centroids, work); + cudaCheckError(); + *residual_host = + thrust::reduce(thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); + cudaCheckError(); + return NVGRAPH_OK; + } + if (n <= k) { + thrust::sequence(thrust::device_pointer_cast(codes), thrust::device_pointer_cast(codes + n)); + cudaCheckError(); + thrust::fill_n(thrust::device_pointer_cast(clusterSizes), n, 1); + cudaCheckError(); + + if (n < k) CHECK_CUDA(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(IndexType_))); + CHECK_CUDA( + cudaMemcpyAsync(centroids, obs, d * n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + *residual_host = 0; + return NVGRAPH_OK; + } + + // Initialize cuBLAS + Cublas::set_pointer_mode_host(); + + // ------------------------------------------------------- + // k-means++ algorithm + // ------------------------------------------------------- + + // Choose initial cluster centroids + if (initializeCentroids(n, d, k, obs, centroids, codes, clusterSizes, work)) + WARNING("could not initialize k-means centroids"); + + // Apply k-means iteration until convergence 
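+ // Each pass recomputes centroids from the current assignments, reassigns + // observations to the nearest centroid, and repairs any empty clusters; + // convergence is declared when |residualPrev - residual| / n < tol + // (checked at the bottom of the loop).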
+ for (iter = 0; iter < maxiter; ++iter) { + // Update cluster centroids + if (updateCentroids(n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + WARNING("could not update k-means centroids"); + + // Determine centroid closest to each observation + residualPrev = *residual_host; + if (assignCentroids(n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + + // Reinitialize empty clusters with new centroids + IndexType_ emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); + + // FIXME: emptyCentroid never reaches k (infinite loop) under certain + // conditions, such as if obs is corrupt (as seen as a result of a + // DataFrame column of NULL edge vals used to create the Graph) + while (emptyCentroid < k) { + if (chooseNewCentroid( + n, d, k, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) + WARNING("could not replace empty centroid"); + if (assignCentroids(n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); + cudaCheckError(); + } + + // Check for convergence + if (fabs(residualPrev - (*residual_host)) / n < tol) { + ++iter; + break; + } + } + + // Warning if k-means has failed to converge + if (fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); + + *iters_host = iter; + return NVGRAPH_OK; +} + +/// Find clusters with k-means algorithm +/** Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * + * CNMEM must be initialized before calling this function. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param tol Tolerance for convergence. k-means stops when the + * change in residual divided by n is less than tol. + * @param maxiter Maximum number of k-means iterations. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param residual On exit, residual sum of squares (sum of squares + * of distances between observation vectors and centroids). + * @param On exit, number of k-means iterations. 
+ * @return NVGRAPH error flag + */ +template +NVGRAPH_ERROR kmeans(IndexType_ n, + IndexType_ d, + IndexType_ k, + ValueType_ tol, + IndexType_ maxiter, + const ValueType_* __restrict__ obs, + IndexType_* __restrict__ codes, + ValueType_& residual, + IndexType_& iters) +{ + // Check that parameters are valid + if (n < 1) { + WARNING("invalid parameter (n<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (d < 1) { + WARNING("invalid parameter (d<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (k < 1) { + WARNING("invalid parameter (k<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (tol < 0) { + WARNING("invalid parameter (tol<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (maxiter < 0) { + WARNING("invalid parameter (maxiter<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + + // Allocate memory + // TODO: handle non-zero CUDA streams + cudaStream_t stream = 0; + Vector clusterSizes(k, stream); + Vector centroids(d * k, stream); + Vector work(n * max(k, d), stream); + Vector work_int(2 * d * n, stream); + + // Perform k-means + return kmeans(n, + d, + k, + tol, + maxiter, + obs, + codes, + clusterSizes.raw(), + centroids.raw(), + work.raw(), + work_int.raw(), + &residual, + &iters); +} + +// ========================================================= +// Explicit instantiations +// ========================================================= + +template NVGRAPH_ERROR kmeans(int n, + int d, + int k, + float tol, + int maxiter, + const float* __restrict__ obs, + int* __restrict__ codes, + float& residual, + int& iters); +template NVGRAPH_ERROR kmeans(int n, + int d, + int k, + double tol, + int maxiter, + const double* __restrict__ obs, + int* __restrict__ codes, + double& residual, + int& iters); +} // namespace nvgraph +//#endif //NVGRAPH_PARTITION +//#endif //debug diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp new file mode 100644 index 0000000000..ad49be1c05 --- /dev/null +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -0,0 +1,1487 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//#ifdef NVGRAPH_PARTITION + +#define _USE_MATH_DEFINES +#include +#include "include/lanczos.hxx" + +#include +#include +#include + +#include +#include + +#include "include/debug_macros.h" +#include "include/nvgraph_cublas.hxx" +#include "include/nvgraph_error.hxx" +#include "include/nvgraph_lapack.hxx" +#include "include/nvgraph_vector.hxx" +#include "include/nvgraph_vector_kernels.hxx" +// ========================================================= +// Useful macros +// ========================================================= + +// Get index of matrix entry +#define IDX(i, j, lda) ((i) + (j) * (lda)) + +namespace nvgraph { + +namespace { + +// ========================================================= +// Helper functions +// ========================================================= + +/// Perform Lanczos iteration +/** Lanczos iteration is performed on a shifted matrix A+shift*I. 
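+ * (Shifting by shift*I moves every eigenvalue of A by shift while leaving the + * eigenvectors unchanged, which is what lets the eigensolver below run this + * same iteration on a shifted matrix whose target eigenvalues are the ones + * largest in magnitude.)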
+ * + * @param A Matrix. + * @param iter Pointer to current Lanczos iteration. On exit, the + * variable is set equal to the final Lanczos iteration. + * @param maxIter Maximum Lanczos iteration. This function will + * perform a maximum of maxIter-*iter iterations. + * @param shift Matrix shift. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm (i.e. entry in beta_host) is + * less than tol. + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param alpha_host (Output, host memory, maxIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, maxIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Input/output, device memory, + * n*(maxIter+1) entries) Lanczos vectors. Vectors are stored as + * columns of a column-major matrix with dimensions + * n x (maxIter+1). + * @param work_dev (Output, device memory, maxIter entries) + * Workspace. Not needed if full reorthogonalization is disabled. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int performLanczosIteration(const Matrix *A, + IndexType_ *iter, + IndexType_ maxIter, + ValueType_ shift, + ValueType_ tol, + bool reorthogonalize, + ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, + ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful variables + const ValueType_ one = 1; + const ValueType_ negOne = -1; + const ValueType_ zero = 0; + + IndexType_ n = A->n; + + // ------------------------------------------------------- + // Compute second Lanczos vector + // ------------------------------------------------------- + if (*iter <= 0) { + *iter = 1; + + // Apply matrix + if (shift != 0) + CHECK_CUDA(cudaMemcpyAsync( + lanczosVecs_dev + n, lanczosVecs_dev, n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); + + // Orthogonalize Lanczos vector + Cublas::dot(n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host); + Cublas::axpy(n, -alpha_host[0], lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1); + beta_host[0] = Cublas::nrm2(n, lanczosVecs_dev + IDX(0, 1, n), 1); + + // Check if Lanczos has converged + if (beta_host[0] <= tol) return 0; + + // Normalize Lanczos vector + Cublas::scal(n, 1 / beta_host[0], lanczosVecs_dev + IDX(0, 1, n), 1); + } + + // ------------------------------------------------------- + // Compute remaining Lanczos vectors + // ------------------------------------------------------- + + while (*iter < maxIter) { + ++(*iter); + + // Apply matrix + if (shift != 0) + CHECK_CUDA(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, + lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(ValueType_), + cudaMemcpyDeviceToDevice)); + A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); + + // Full reorthogonalization + // "Twice is enough" algorithm per Kahan and Parlett + if (reorthogonalize) { + Cublas::gemv(true, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1); + Cublas::gemv(false, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1); + CHECK_CUDA(cudaMemcpyAsync(alpha_host + (*iter - 1), + work_dev + (*iter - 1), + 
sizeof(ValueType_), + cudaMemcpyDeviceToHost)); + Cublas::gemv(true, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1); + Cublas::gemv(false, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1); + } + + // Orthogonalization with 3-term recurrence relation + else { + Cublas::dot(n, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + alpha_host + (*iter - 1)); + Cublas::axpy(n, + -alpha_host[*iter - 1], + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1); + Cublas::axpy(n, + -beta_host[*iter - 2], + lanczosVecs_dev + IDX(0, *iter - 2, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1); + } + + // Compute residual + beta_host[*iter - 1] = Cublas::nrm2(n, lanczosVecs_dev + IDX(0, *iter, n), 1); + + // Check if Lanczos has converged + if (beta_host[*iter - 1] <= tol) break; + // Normalize Lanczos vector + Cublas::scal(n, 1 / beta_host[*iter - 1], lanczosVecs_dev + IDX(0, *iter, n), 1); + } + + CHECK_CUDA(cudaDeviceSynchronize()); + + return 0; +} + +/// Find Householder transform for 3-dimensional system +/** Given an input vector v=[x,y,z]', this function finds a + * Householder transform P such that P*v is a multiple of + * e_1=[1,0,0]'. The input vector v is overwritten with the + * Householder vector such that P=I-2*v*v'. + * + * @param v (Input/output, host memory, 3 entries) Input + * 3-dimensional vector. On exit, the vector is set to the + * Householder vector. + * @param Pv (Output, host memory, 1 entry) First entry of P*v + * (here v is the input vector). Either equal to ||v||_2 or + * -||v||_2. + * @param P (Output, host memory, 9 entries) Householder transform + * matrix. Matrix dimensions are 3 x 3. + */ +template +static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) +{ + // Compute norm of vector + *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + + // Choose whether to reflect to e_1 or -e_1 + // This choice avoids catastrophic cancellation + if (v[0] >= 0) *Pv = -(*Pv); + v[0] -= *Pv; + + // Normalize Householder vector + ValueType_ normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + if (normHouseholder != 0) { + v[0] /= normHouseholder; + v[1] /= normHouseholder; + v[2] /= normHouseholder; + } else { + v[0] = 0; + v[1] = 0; + v[2] = 0; + } + + // Construct Householder matrix + IndexType_ i, j; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j]; + for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1; +} + +/// Apply 3-dimensional Householder transform to 4 x 4 matrix +/** The Householder transform is pre-applied to the top three rows + * of the matrix and post-applied to the left three columns. The + * 4 x 4 matrix is intended to contain the bulge that is produced + * in the Francis QR algorithm. + * + * @param v (Input, host memory, 3 entries) Householder vector. + * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. 
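+ * + * Concretely, with P = I - 2*v*v' (v as produced by findHouseholder3), the + * pre-application updates A(0:2, j) -= 2*v*(v'*A(0:2, j)) for each of the + * four columns j, and the post-application updates + * A(i, 0:2) -= 2*(A(i, 0:2)*v)*v' for each of the four rows i.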
+ */ +template +static void applyHouseholder3(const ValueType_ *v, ValueType_ *A) +{ + // Loop indices + IndexType_ i, j; + // Dot product between Householder vector and matrix row/column + ValueType_ vDotA; + + // Pre-apply Householder transform + for (j = 0; j < 4; ++j) { + vDotA = 0; + for (i = 0; i < 3; ++i) vDotA += v[i] * A[IDX(i, j, 4)]; + for (i = 0; i < 3; ++i) A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; + } + + // Post-apply Householder transform + for (i = 0; i < 4; ++i) { + vDotA = 0; + for (j = 0; j < 3; ++j) vDotA += A[IDX(i, j, 4)] * v[j]; + for (j = 0; j < 3; ++j) A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; + } +} + +/// Perform one step of Francis QR algorithm +/** Equivalent to two steps of the classical QR algorithm on a + * tridiagonal matrix. + * + * @param n Matrix dimension. + * @param shift1 QR algorithm shift. + * @param shift2 QR algorithm shift. + * @param alpha (Input/output, host memory, n entries) Diagonal + * entries of tridiagonal matrix. + * @param beta (Input/output, host memory, n-1 entries) + * Off-diagonal entries of tridiagonal matrix. + * @param V (Input/output, host memory, n*n entries) Orthonormal + * transforms from previous steps of QR algorithm. Matrix + * dimensions are n x n. On exit, the orthonormal transform from + * this Francis QR step is post-applied to the matrix. + * @param work (Output, host memory, 3*n entries) Workspace. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int francisQRIteration(IndexType_ n, + ValueType_ shift1, + ValueType_ shift2, + ValueType_ *alpha, + ValueType_ *beta, + ValueType_ *V, + ValueType_ *work) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Temporary storage of 4x4 bulge and Householder vector + ValueType_ bulge[16]; + + // Householder vector + ValueType_ householder[3]; + // Householder matrix + ValueType_ householderMatrix[3 * 3]; + + // Shifts are roots of the polynomial p(x)=x^2+b*x+c + ValueType_ b = -shift1 - shift2; + ValueType_ c = shift1 * shift2; + + // Loop indices + IndexType_ i, j, pos; + // Temporary variable + ValueType_ temp; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute initial Householder transform + householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; + householder[1] = beta[0] * (alpha[0] + alpha[1] + b); + householder[2] = beta[0] * beta[1]; + findHouseholder3(householder, &temp, householderMatrix); + + // Apply initial Householder transform to create bulge + memset(bulge, 0, 16 * sizeof(ValueType_)); + for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i]; + for (i = 0; i < 3; ++i) { + bulge[IDX(i + 1, i, 4)] = beta[i]; + bulge[IDX(i, i + 1, 4)] = beta[i]; + } + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); + memcpy(V, work, 3 * n * sizeof(ValueType_)); + + // Chase bulge to bottom-right of matrix with Householder transforms + for (pos = 0; pos < n - 4; ++pos) { + // Move to next position + alpha[pos] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = bulge[IDX(3, 0, 4)]; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + bulge[IDX(3, 0, 4)] = 0; + bulge[IDX(3, 1, 4)] = 0; + bulge[IDX(3, 2, 4)] = beta[pos + 3]; + bulge[IDX(0, 3, 4)] 
= 0; + bulge[IDX(1, 3, 4)] = 0; + bulge[IDX(2, 3, 4)] = beta[pos + 3]; + bulge[IDX(3, 3, 4)] = alpha[pos + 4]; + + // Apply Householder transform + findHouseholder3(householder, beta + pos, householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(ValueType_)); + } + + // Apply penultimate Householder transform + // Values in the last row and column are zero + alpha[n - 4] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = bulge[IDX(3, 0, 4)]; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + bulge[IDX(3, 0, 4)] = 0; + bulge[IDX(3, 1, 4)] = 0; + bulge[IDX(3, 2, 4)] = 0; + bulge[IDX(0, 3, 4)] = 0; + bulge[IDX(1, 3, 4)] = 0; + bulge[IDX(2, 3, 4)] = 0; + bulge[IDX(3, 3, 4)] = 0; + findHouseholder3(householder, beta + n - 4, householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(ValueType_)); + + // Apply final Householder transform + // Values in the last two rows and columns are zero + alpha[n - 3] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = 0; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + findHouseholder3(householder, beta + n - 3, householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm( + false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(ValueType_)); + + // Bulge has been eliminated + alpha[n - 2] = bulge[IDX(0, 0, 4)]; + alpha[n - 1] = bulge[IDX(1, 1, 4)]; + beta[n - 2] = bulge[IDX(1, 0, 4)]; + + return 0; +} + +/// Perform implicit restart of Lanczos algorithm +/** Shifts are Chebyshev nodes of unwanted region of matrix spectrum. + * + * @param n Matrix dimension. + * @param iter Current Lanczos iteration. + * @param iter_new Lanczos iteration after restart. + * @param shiftUpper Pointer to upper bound for unwanted + * region. Value is ignored if less than *shiftLower. If a + * stronger upper bound has been found, the value is updated on + * exit. + * @param shiftLower Pointer to lower bound for unwanted + * region. Value is ignored if greater than *shiftUpper. If a + * stronger lower bound has been found, the value is updated on + * exit. + * @param alpha_host (Input/output, host memory, iter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Input/output, host memory, iter entries) + * Off-diagonal entries of Lanczos system. + * @param V_host (Output, host memory, iter*iter entries) + * Orthonormal transform used to obtain restarted system. Matrix + * dimensions are iter x iter. + * @param work_host (Output, host memory, 4*iter entries) + * Workspace. + * @param lanczosVecs_dev (Input/output, device memory, n*(iter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (iter+1). + * @param work_dev (Output, device memory, (n+iter)*iter entries) + * Workspace. 
+ */ +template <typename IndexType_, typename ValueType_> +static int lanczosRestart(IndexType_ n, + IndexType_ iter, + IndexType_ iter_new, + ValueType_ *shiftUpper, + ValueType_ *shiftLower, + ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, + ValueType_ *__restrict__ V_host, + ValueType_ *__restrict__ work_host, + ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev, + bool smallest_eig) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful constants + const ValueType_ zero = 0; + const ValueType_ one = 1; + + // Loop index + IndexType_ i; + + // Number of implicit restart steps + // Assumed to be even since each call to Francis algorithm is + // equivalent to two calls of QR algorithm + IndexType_ restartSteps = iter - iter_new; + + // Ritz values from Lanczos method + ValueType_ *ritzVals_host = work_host + 3 * iter; + // Shifts for implicit restart + ValueType_ *shifts_host; + + // Orthonormal matrix for similarity transform + ValueType_ *V_dev = work_dev + n * iter; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute Ritz values + memcpy(ritzVals_host, alpha_host, iter * sizeof(ValueType_)); + memcpy(work_host, beta_host, (iter - 1) * sizeof(ValueType_)); + Lapack<ValueType_>::sterf(iter, ritzVals_host, work_host); + + // Debug: Print largest eigenvalues + // for (int i = iter-iter_new; i < iter; ++i) + // std::cout <<*(ritzVals_host+i)<< " "; + // std::cout <<std::endl; + + // Initialize similarity transform with identity matrix + memset(V_host, 0, iter * iter * sizeof(ValueType_)); + for (i = 0; i < iter; ++i) V_host[IDX(i, i, iter)] = 1; + + // Determine interval to suppress eigenvalues + if (smallest_eig) { + if (*shiftLower > *shiftUpper) { + *shiftUpper = ritzVals_host[iter - 1]; + *shiftLower = ritzVals_host[iter_new]; + } else { + *shiftUpper = max(*shiftUpper, ritzVals_host[iter - 1]); + *shiftLower = min(*shiftLower, ritzVals_host[iter_new]); + } + } else { + if (*shiftLower > *shiftUpper) { + *shiftUpper = ritzVals_host[iter - iter_new - 1]; + *shiftLower = ritzVals_host[0]; + } else { + *shiftUpper = max(*shiftUpper, ritzVals_host[iter - iter_new - 1]); + *shiftLower = min(*shiftLower, ritzVals_host[0]); + } + } + + // Calculate Chebyshev nodes as shifts + shifts_host = ritzVals_host; + for (i = 0; i < restartSteps; ++i) { + shifts_host[i] = cos((i + 0.5) * static_cast<ValueType_>(M_PI) / restartSteps); + shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); + shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); + } + + // Apply Francis QR algorithm to implicitly restart Lanczos + for (i = 0; i < restartSteps; i += 2) + if (francisQRIteration( + iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host)) + WARNING("error in implicitly shifted QR algorithm"); + + // Obtain new residual + CHECK_CUDA( + cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(ValueType_), cudaMemcpyHostToDevice)); + + beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; + Cublas::gemv(false, + n, + iter, + beta_host + iter_new - 1, + lanczosVecs_dev, + n, + V_dev + IDX(0, iter_new, iter), + 1, + beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), + 1); + + // Obtain new Lanczos vectors + Cublas::gemm( + false, false, n, iter_new, iter, &one, lanczosVecs_dev, n, V_dev, iter, &zero, work_dev, n); + + CHECK_CUDA(cudaMemcpyAsync( + lanczosVecs_dev, work_dev, n * iter_new * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + + // Normalize residual to obtain new Lanczos vector + CHECK_CUDA(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), + lanczosVecs_dev + IDX(0, iter, n), + n *
+                             n * sizeof(ValueType_),
+                             cudaMemcpyDeviceToDevice));
+  beta_host[iter_new - 1] = Cublas::nrm2(n, lanczosVecs_dev + IDX(0, iter_new, n), 1);
+  Cublas::scal(n, 1 / beta_host[iter_new - 1], lanczosVecs_dev + IDX(0, iter_new, n), 1);
+
+  return 0;
+}
+
+}  // namespace
+
+// =========================================================
+// Eigensolver
+// =========================================================
+
+/// Compute smallest eigenvectors of symmetric matrix
+/** Computes eigenvalues and eigenvectors that are least
+ * positive. If matrix is positive definite or positive
+ * semidefinite, the computed eigenvalues are smallest in
+ * magnitude.
+ *
+ * The largest eigenvalue is estimated by performing several
+ * Lanczos iterations. An implicitly restarted Lanczos method is
+ * then applied to A+s*I, where s is the negation of the largest
+ * eigenvalue.
+ *
+ * @param A Matrix.
+ * @param nEigVecs Number of eigenvectors to compute.
+ * @param maxIter Maximum number of Lanczos steps. Does not include
+ *   Lanczos steps used to estimate largest eigenvalue.
+ * @param restartIter Maximum size of Lanczos system before
+ *   performing an implicit restart. Should be at least 4.
+ * @param tol Convergence tolerance. Lanczos iteration will
+ *   terminate when the residual norm is less than tol*theta, where
+ *   theta is an estimate for the smallest unwanted eigenvalue
+ *   (i.e. the (nEigVecs+1)th smallest eigenvalue).
+ * @param reorthogonalize Whether to reorthogonalize Lanczos
+ *   vectors.
+ * @param effIter On exit, pointer to final size of Lanczos system.
+ * @param totalIter On exit, pointer to total number of Lanczos
+ *   iterations performed. Does not include Lanczos steps used to
+ *   estimate largest eigenvalue.
+ * @param shift On exit, pointer to matrix shift (estimate for
+ *   largest eigenvalue).
+ * @param alpha_host (Output, host memory, restartIter entries)
+ *   Diagonal entries of Lanczos system.
+ * @param beta_host (Output, host memory, restartIter entries)
+ *   Off-diagonal entries of Lanczos system.
+ * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1)
+ *   entries) Lanczos vectors. Vectors are stored as columns of a
+ *   column-major matrix with dimensions n x (restartIter+1).
+ * @param work_dev (Output, device memory,
+ *   (n+restartIter)*restartIter entries) Workspace.
+ * @param eigVals_dev (Output, device memory, nEigVecs entries)
+ *   Smallest eigenvalues of matrix.
+ * @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
+ *   Eigenvectors corresponding to smallest eigenvalues of
+ *   matrix. Vectors are stored as columns of a column-major matrix
+ *   with dimensions n x nEigVecs.
+ * @return NVGRAPH error flag.
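+ *
+ * @note Shifting by s = -lambda_max maps every eigenvalue lambda of A to
+ *   lambda - lambda_max <= 0, so the smallest eigenvalues of A become the
+ *   largest-magnitude eigenvalues of A+s*I, which Lanczos resolves first.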
+ */
+template <typename IndexType_, typename ValueType_>
+NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix<IndexType_, ValueType_> *A,
+                                          IndexType_ nEigVecs,
+                                          IndexType_ maxIter,
+                                          IndexType_ restartIter,
+                                          ValueType_ tol,
+                                          bool reorthogonalize,
+                                          IndexType_ *effIter,
+                                          IndexType_ *totalIter,
+                                          ValueType_ *shift,
+                                          ValueType_ *__restrict__ alpha_host,
+                                          ValueType_ *__restrict__ beta_host,
+                                          ValueType_ *__restrict__ lanczosVecs_dev,
+                                          ValueType_ *__restrict__ work_dev,
+                                          ValueType_ *__restrict__ eigVals_dev,
+                                          ValueType_ *__restrict__ eigVecs_dev)
+{
+  // -------------------------------------------------------
+  // Variable declaration
+  // -------------------------------------------------------
+
+  // Useful constants
+  const ValueType_ one  = 1;
+  const ValueType_ zero = 0;
+
+  // Matrix dimension
+  IndexType_ n = A->n;
+
+  // Shift for implicit restart
+  ValueType_ shiftUpper;
+  ValueType_ shiftLower;
+
+  // Lanczos iteration counters
+  IndexType_ maxIter_curr = restartIter;  // Maximum size of Lanczos system
+
+  // Status flags
+  int status;
+
+  // Loop index
+  IndexType_ i;
+
+  // Host memory
+  ValueType_ *Z_host;     // Eigenvectors in Lanczos basis
+  ValueType_ *work_host;  // Workspace
+
+  // -------------------------------------------------------
+  // Check that LAPACK is enabled
+  // -------------------------------------------------------
+  // Lapack<ValueType_>::check_lapack_enabled();
+
+  // -------------------------------------------------------
+  // Check that parameters are valid
+  // -------------------------------------------------------
+  if (A->m != A->n) {
+    WARNING("invalid parameter (matrix is not square)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (nEigVecs < 1) {
+    WARNING("invalid parameter (nEigVecs<1)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (restartIter < 1) {
+    WARNING("invalid parameter (restartIter<1)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (tol < 0) {
+    WARNING("invalid parameter (tol<0)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (nEigVecs > n) {
+    WARNING("invalid parameters (nEigVecs>n)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (maxIter < nEigVecs) {
+    WARNING("invalid parameters (maxIter<nEigVecs)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+
+  // -------------------------------------------------------
+  // Variable initialization
+  // -------------------------------------------------------
+
+  // Allocate host memory
+  std::vector<ValueType_> Z_host_v(restartIter * restartIter);
+  std::vector<ValueType_> work_host_v(4 * restartIter);
+
+  Z_host    = Z_host_v.data();
+  work_host = work_host_v.data();
+
+  // Initialize cuBLAS
+  Cublas::set_pointer_mode_host();
+
+  // -------------------------------------------------------
+  // Compute largest eigenvalue to determine shift
+  // -------------------------------------------------------
+
+  // Random number generator
+  curandGenerator_t randGen;
+  // Initialize random number generator
+  CHECK_CURAND(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10));
+
+  // FIXME: This is hard coded, which is good for unit testing...
+  //        but should really be a parameter so it could be
+  //        "random" for real runs and "fixed" for tests
+  CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, 1234567 /*time(NULL)*/));
+  // CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, time(NULL)));
+  // Initialize initial Lanczos vector
+  CHECK_CURAND(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one));
+  ValueType_ normQ1 = Cublas::nrm2(n, lanczosVecs_dev, 1);
+  Cublas::scal(n, 1 / normQ1, lanczosVecs_dev, 1);
+
+  // Estimate number of Lanczos iterations
+  // See bounds in Kuczynski and Wozniakowski (1992).
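+  // (The commented-out estimate below implements this bound: roughly
+  // log(n/failProb^2)/(4*sqrt(relError)) iterations suffice to estimate the
+  // largest eigenvalue within relative error relError, except with
+  // probability failProb.)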
+  // const ValueType_ relError = 0.25;  // Relative error
+  // const ValueType_ failProb = 1e-4;  // Probability of failure
+  // maxIter_curr = log(n/pow(failProb,2))/(4*std::sqrt(relError)) + 1;
+  // maxIter_curr = min(maxIter_curr, restartIter);
+
+  // Obtain tridiagonal matrix with Lanczos
+  *effIter = 0;
+  *shift   = 0;
+  status   = performLanczosIteration<IndexType_, ValueType_>(
+    A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize,
+    alpha_host, beta_host, lanczosVecs_dev, work_dev);
+  if (status) WARNING("error in Lanczos iteration");
+
+  // Determine largest eigenvalue
+  Lapack<ValueType_>::sterf(*effIter, alpha_host, beta_host);
+  *shift = -alpha_host[*effIter - 1];
+  // std::cout << *shift << std::endl;
+
+  // -------------------------------------------------------
+  // Compute eigenvectors of shifted matrix
+  // -------------------------------------------------------
+
+  // Obtain tridiagonal matrix with Lanczos
+  *effIter   = 0;
+  *totalIter = 0;
+  status     = performLanczosIteration<IndexType_, ValueType_>(
+    A, effIter, maxIter_curr, *shift, 0, reorthogonalize,
+    alpha_host, beta_host, lanczosVecs_dev, work_dev);
+  if (status) WARNING("error in Lanczos iteration");
+  *totalIter += *effIter;
+
+  // Apply Lanczos method until convergence
+  shiftLower = 1;
+  shiftUpper = -1;
+  while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) {
+    // Determine number of restart steps
+    // Number of steps must be even due to Francis algorithm
+    IndexType_ iter_new = nEigVecs + 1;
+    if (restartIter - (maxIter - *totalIter) > nEigVecs + 1)
+      iter_new = restartIter - (maxIter - *totalIter);
+    if ((restartIter - iter_new) % 2) iter_new -= 1;
+    if (iter_new == *effIter) break;
+
+    // Implicit restart of Lanczos method
+    status = lanczosRestart<IndexType_, ValueType_>(
+      n, *effIter, iter_new, &shiftUpper, &shiftLower,
+      alpha_host, beta_host, Z_host, work_host,
+      lanczosVecs_dev, work_dev, true);
+    if (status) WARNING("error in Lanczos implicit restart");
+    *effIter = iter_new;
+
+    // Check for convergence
+    if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break;
+
+    // Proceed with Lanczos method
+    // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter);
+    status = performLanczosIteration<IndexType_, ValueType_>(
+      A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), reorthogonalize,
+      alpha_host, beta_host, lanczosVecs_dev, work_dev);
+    if (status) WARNING("error in Lanczos iteration");
+    *totalIter += *effIter - iter_new;
+  }
+
+  // Warning if Lanczos has failed to converge
+  if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) {
+    WARNING("implicitly restarted Lanczos failed to converge");
+  }
+
+  // Solve tridiagonal system
+  memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(ValueType_));
+  memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(ValueType_));
+  Lapack<ValueType_>::steqr('I', *effIter,
+                            work_host + 2 * (*effIter), work_host + 3 * (*effIter),
+                            Z_host, *effIter, work_host);
+
+  // Obtain desired eigenvalues by applying shift
+  for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift;
+  for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0;
+
+  // Copy results to device memory
+  CHECK_CUDA(cudaMemcpy(eigVals_dev,
+                        work_host + 2 * (*effIter),
+                        nEigVecs * sizeof(ValueType_),
+                        cudaMemcpyHostToDevice));
+  // for (int i = 0; i < nEigVecs; ++i)
+  // {
+  //   std::cout <<*(work_host+(2*(*effIter)+i))<< std::endl;
+  // }
+  CHECK_CUDA(cudaMemcpy(
+    work_dev, Z_host, (*effIter) * nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice));
+
+  // Convert eigenvectors from Lanczos basis to standard basis
+  Cublas::gemm(false, false, n, nEigVecs, *effIter,
+               &one, lanczosVecs_dev, n, work_dev, *effIter,
+               &zero, eigVecs_dev, n);
+
+  // Clean up and exit
+  CHECK_CURAND(curandDestroyGenerator(randGen));
+  return NVGRAPH_OK;
+}
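+
+// Illustrative usage of the convenience overload defined below; a sketch
+// only, where `L` (a Matrix<int, float>) and the device buffers `vals_dev`
+// (nEigVecs entries) and `vecs_dev` (n*nEigVecs entries) are hypothetical:
+//
+//   int iter = 0;
+//   NVGRAPH_ERROR e = computeSmallestEigenvectors(
+//     L, 4, 100, 20, 1e-4f, false, iter, vals_dev, vecs_dev);
+//   if (e != NVGRAPH_OK) WARNING("eigensolver failed");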
+
+/// Compute smallest eigenvectors of symmetric matrix
+/** Computes eigenvalues and eigenvectors that are least
+ * positive. If matrix is positive definite or positive
+ * semidefinite, the computed eigenvalues are smallest in
+ * magnitude.
+ *
+ * The largest eigenvalue is estimated by performing several
+ * Lanczos iterations. An implicitly restarted Lanczos method is
+ * then applied to A+s*I, where s is the negation of the largest
+ * eigenvalue.
+ *
+ * CNMEM must be initialized before calling this function.
+ *
+ * @param A Matrix.
+ * @param nEigVecs Number of eigenvectors to compute.
+ * @param maxIter Maximum number of Lanczos steps. Does not include
+ *   Lanczos steps used to estimate largest eigenvalue.
+ * @param restartIter Maximum size of Lanczos system before
+ *   performing an implicit restart. Should be at least 4.
+ * @param tol Convergence tolerance. Lanczos iteration will
+ *   terminate when the residual norm is less than tol*theta, where
+ *   theta is an estimate for the smallest unwanted eigenvalue
+ *   (i.e. the (nEigVecs+1)th smallest eigenvalue).
+ * @param reorthogonalize Whether to reorthogonalize Lanczos
+ *   vectors.
+ * @param iter On exit, total number of Lanczos iterations
+ *   performed. Does not include Lanczos steps used to estimate
+ *   largest eigenvalue.
+ * @param eigVals_dev (Output, device memory, nEigVecs entries)
+ *   Smallest eigenvalues of matrix.
+ * @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
+ *   Eigenvectors corresponding to smallest eigenvalues of
+ *   matrix. Vectors are stored as columns of a column-major matrix
+ *   with dimensions n x nEigVecs.
+ * @return NVGRAPH error flag.
+ */
+template <typename IndexType_, typename ValueType_>
+NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix<IndexType_, ValueType_> &A,
+                                          IndexType_ nEigVecs,
+                                          IndexType_ maxIter,
+                                          IndexType_ restartIter,
+                                          ValueType_ tol,
+                                          bool reorthogonalize,
+                                          IndexType_ &iter,
+                                          ValueType_ *__restrict__ eigVals_dev,
+                                          ValueType_ *__restrict__ eigVecs_dev)
+{
+  // CUDA stream
+  // TODO: handle non-zero streams
+  cudaStream_t stream = 0;
+
+  // Matrix dimension
+  IndexType_ n = A.n;
+
+  // Check that parameters are valid
+  if (A.m != A.n) {
+    WARNING("invalid parameter (matrix is not square)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (nEigVecs < 1) {
+    WARNING("invalid parameter (nEigVecs<1)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (restartIter < 1) {
+    WARNING("invalid parameter (restartIter<1)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (tol < 0) {
+    WARNING("invalid parameter (tol<0)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (nEigVecs > n) {
+    WARNING("invalid parameters (nEigVecs>n)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (maxIter < nEigVecs) {
+    WARNING("invalid parameters (maxIter<nEigVecs)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+
+  // Allocate memory
+  std::vector<ValueType_> alpha_host_v(restartIter);
+  std::vector<ValueType_> beta_host_v(restartIter);
+
+  ValueType_ *alpha_host = alpha_host_v.data();
+  ValueType_ *beta_host  = beta_host_v.data();
+
+  Vector<ValueType_> lanczosVecs_dev(n * (restartIter + 1), stream);
+  Vector<ValueType_> work_dev((n + restartIter) * restartIter, stream);
+
+  // Perform Lanczos method
+  IndexType_ effIter;
+  ValueType_ shift;
+  NVGRAPH_ERROR status = computeSmallestEigenvectors(&A, nEigVecs, maxIter, restartIter,
+                                                     tol, reorthogonalize,
+                                                     &effIter, &iter, &shift,
+                                                     alpha_host, beta_host,
+                                                     lanczosVecs_dev.raw(), work_dev.raw(),
+                                                     eigVals_dev, eigVecs_dev);
+
+  // Clean up and return
+  return status;
+}
+
+// =========================================================
+// Eigensolver
+// =========================================================
+
+/// Compute largest eigenvectors of symmetric matrix
+/** Computes eigenvalues and eigenvectors that are most
+ * positive. If matrix is positive definite or positive
+ * semidefinite, the computed eigenvalues are largest in
+ * magnitude.
+ *
+ * An implicitly restarted Lanczos method is applied to the
+ * matrix directly; no spectral shift is estimated.
+ *
+ * @param A Matrix.
+ * @param nEigVecs Number of eigenvectors to compute.
+ * @param maxIter Maximum number of Lanczos steps.
+ * @param restartIter Maximum size of Lanczos system before
+ *   performing an implicit restart. Should be at least 4.
+ * @param tol Convergence tolerance. Lanczos iteration will
+ *   terminate when the residual norm is less than tol*theta, where
+ *   theta is an estimate for the largest unwanted eigenvalue
+ *   (i.e. the (nEigVecs+1)th largest eigenvalue).
+ * @param reorthogonalize Whether to reorthogonalize Lanczos
+ *   vectors.
+ * @param effIter On exit, pointer to final size of Lanczos system.
+ * @param totalIter On exit, pointer to total number of Lanczos
+ *   iterations performed.
+ * @param alpha_host (Output, host memory, restartIter entries)
+ *   Diagonal entries of Lanczos system.
+ * @param beta_host (Output, host memory, restartIter entries)
+ *   Off-diagonal entries of Lanczos system.
+ * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1)
+ *   entries) Lanczos vectors. Vectors are stored as columns of a
+ *   column-major matrix with dimensions n x (restartIter+1).
+ * @param work_dev (Output, device memory,
+ *   (n+restartIter)*restartIter entries) Workspace.
+ * @param eigVals_dev (Output, device memory, nEigVecs entries)
+ *   Largest eigenvalues of matrix.
+ * @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
+ *   Eigenvectors corresponding to largest eigenvalues of
+ *   matrix. Vectors are stored as columns of a column-major matrix
+ *   with dimensions n x nEigVecs.
+ * @return NVGRAPH error flag.
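+ *
+ * @note LAPACK steqr returns eigenvalues in ascending order, so when the
+ *   final Lanczos system is larger than nEigVecs, the last nEigVecs Ritz
+ *   pairs are the ones copied out (see the offset logic near the end of
+ *   this function).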
+ */
+template <typename IndexType_, typename ValueType_>
+NVGRAPH_ERROR computeLargestEigenvectors(const Matrix<IndexType_, ValueType_> *A,
+                                         IndexType_ nEigVecs,
+                                         IndexType_ maxIter,
+                                         IndexType_ restartIter,
+                                         ValueType_ tol,
+                                         bool reorthogonalize,
+                                         IndexType_ *effIter,
+                                         IndexType_ *totalIter,
+                                         ValueType_ *__restrict__ alpha_host,
+                                         ValueType_ *__restrict__ beta_host,
+                                         ValueType_ *__restrict__ lanczosVecs_dev,
+                                         ValueType_ *__restrict__ work_dev,
+                                         ValueType_ *__restrict__ eigVals_dev,
+                                         ValueType_ *__restrict__ eigVecs_dev)
+{
+  // -------------------------------------------------------
+  // Variable declaration
+  // -------------------------------------------------------
+
+  // Useful constants
+  const ValueType_ one  = 1;
+  const ValueType_ zero = 0;
+
+  // Matrix dimension
+  IndexType_ n = A->n;
+
+  // Lanczos iteration counters
+  IndexType_ maxIter_curr = restartIter;  // Maximum size of Lanczos system
+
+  // Status flags
+  int status;
+
+  // Loop index
+  IndexType_ i;
+
+  // Host memory
+  ValueType_ *Z_host;     // Eigenvectors in Lanczos basis
+  ValueType_ *work_host;  // Workspace
+
+  // -------------------------------------------------------
+  // Check that LAPACK is enabled
+  // -------------------------------------------------------
+  // Lapack<ValueType_>::check_lapack_enabled();
+
+  // -------------------------------------------------------
+  // Check that parameters are valid
+  // -------------------------------------------------------
+  if (A->m != A->n) {
+    WARNING("invalid parameter (matrix is not square)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (nEigVecs < 1) {
+    WARNING("invalid parameter (nEigVecs<1)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (restartIter < 1) {
+    WARNING("invalid parameter (restartIter<1)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (tol < 0) {
+    WARNING("invalid parameter (tol<0)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (nEigVecs > n) {
+    WARNING("invalid parameters (nEigVecs>n)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (maxIter < nEigVecs) {
+    WARNING("invalid parameters (maxIter<nEigVecs)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+
+  // -------------------------------------------------------
+  // Variable initialization
+  // -------------------------------------------------------
+
+  // Allocate host memory
+  std::vector<ValueType_> Z_host_v(restartIter * restartIter);
+  std::vector<ValueType_> work_host_v(4 * restartIter);
+
+  Z_host    = Z_host_v.data();
+  work_host = work_host_v.data();
+
+  // Initialize cuBLAS
+  Cublas::set_pointer_mode_host();
+
+  // -------------------------------------------------------
+  // Compute largest eigenvalue
+  // -------------------------------------------------------
+
+  // Random number generator
+  curandGenerator_t randGen;
+  // Initialize random number generator
+  CHECK_CURAND(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10));
+  CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, 123456));
+  // Initialize initial Lanczos vector
+  CHECK_CURAND(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one));
+  ValueType_ normQ1 = Cublas::nrm2(n, lanczosVecs_dev, 1);
+  Cublas::scal(n, 1 / normQ1, lanczosVecs_dev, 1);
+
+  // Estimate number of Lanczos iterations
+  // See bounds in Kuczynski and Wozniakowski (1992).
+  // const ValueType_ relError = 0.25;  // Relative error
+  // const ValueType_ failProb = 1e-4;  // Probability of failure
+  // maxIter_curr = log(n/pow(failProb,2))/(4*std::sqrt(relError)) + 1;
+  // maxIter_curr = min(maxIter_curr, restartIter);
+
+  // Obtain tridiagonal matrix with Lanczos
+  *effIter = 0;
+  ValueType_ shift_val = 0.0;
+  ValueType_ *shift    = &shift_val;
+  // maxIter_curr = min(maxIter, restartIter);
+  status = performLanczosIteration<IndexType_, ValueType_>(
+    A, effIter, maxIter_curr, *shift, 0, reorthogonalize,
+    alpha_host, beta_host, lanczosVecs_dev, work_dev);
+  if (status) WARNING("error in Lanczos iteration");
+  *totalIter += *effIter;
+
+  // Apply Lanczos method until convergence
+  ValueType_ shiftLower = 1;
+  ValueType_ shiftUpper = -1;
+  while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) {
+    // Determine number of restart steps
+    // Number of steps must be even due to Francis algorithm
+    IndexType_ iter_new = nEigVecs + 1;
+    if (restartIter - (maxIter - *totalIter) > nEigVecs + 1)
+      iter_new = restartIter - (maxIter - *totalIter);
+    if ((restartIter - iter_new) % 2) iter_new -= 1;
+    if (iter_new == *effIter) break;
+
+    // Implicit restart of Lanczos method
+    status = lanczosRestart<IndexType_, ValueType_>(
+      n, *effIter, iter_new, &shiftUpper, &shiftLower,
+      alpha_host, beta_host, Z_host, work_host,
+      lanczosVecs_dev, work_dev, false);
+    if (status) WARNING("error in Lanczos implicit restart");
+    *effIter = iter_new;
+
+    // Check for convergence
+    if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break;
+
+    // Proceed with Lanczos method
+    // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter);
+    status = performLanczosIteration<IndexType_, ValueType_>(
+      A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), reorthogonalize,
+      alpha_host, beta_host, lanczosVecs_dev, work_dev);
+    if (status) WARNING("error in Lanczos iteration");
+    *totalIter += *effIter - iter_new;
+  }
+
+  // Warning if Lanczos has failed to converge
+  if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) {
+    WARNING("implicitly restarted Lanczos failed to converge");
+  }
+  for (int i = 0; i < restartIter; ++i) {
+    for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0;
+  }
+  // Solve tridiagonal system
+  memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(ValueType_));
+  memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(ValueType_));
+  Lapack<ValueType_>::steqr('I', *effIter,
+                            work_host + 2 * (*effIter), work_host + 3 * (*effIter),
+                            Z_host, *effIter, work_host);
+
+  // note: we need to pick the top nEigVecs eigenpairs, but effIter can be
+  // larger than nEigVecs, hence we add an offset for that case, because we
+  // want to access the top nEigVecs eigenpairs in the matrix of size
+  // effIter. Remember the array is sorted, so the offset is not needed in
+  // the smallest-eigenvalues case, where the first entries are already the
+  // smallest ones.
+
+  IndexType_ top_eigenpairs_idx_offset = *effIter - nEigVecs;
+
+  // Debug : print nEigVecs largest eigenvalues
+  // for (int i = top_eigenpairs_idx_offset; i < *effIter; ++i)
+  //   std::cout <<*(work_host+(2*(*effIter)+i))<< " ";
+  // std::cout << std::endl;
+
+  // Obtain desired eigenvalues by applying shift
+  for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift;
+
+  // Copy results to device memory, skipping the unwanted smaller eigenpairs
+  CHECK_CUDA(cudaMemcpy(eigVals_dev,
+                        work_host + 2 * (*effIter) + top_eigenpairs_idx_offset,
+                        nEigVecs * sizeof(ValueType_),
+                        cudaMemcpyHostToDevice));
+  CHECK_CUDA(cudaMemcpy(work_dev,
+                        Z_host + (*effIter) * top_eigenpairs_idx_offset,
+                        (*effIter) * nEigVecs * sizeof(ValueType_),
+                        cudaMemcpyHostToDevice));
+
+  // Convert eigenvectors from Lanczos basis to standard basis
+  Cublas::gemm(false, false, n, nEigVecs, *effIter,
+               &one, lanczosVecs_dev, n, work_dev, *effIter,
+               &zero, eigVecs_dev, n);
+
+  // Clean up and exit
+  CHECK_CURAND(curandDestroyGenerator(randGen));
+  return NVGRAPH_OK;
+}
+
+/// Compute largest eigenvectors of symmetric matrix
+/** Computes eigenvalues and eigenvectors that are most
+ * positive. If matrix is positive definite or positive
+ * semidefinite, the computed eigenvalues are largest in
+ * magnitude.
+ *
+ * CNMEM must be initialized before calling this function.
+ *
+ * @param A Matrix.
+ * @param nEigVecs Number of eigenvectors to compute.
+ * @param maxIter Maximum number of Lanczos steps.
+ * @param restartIter Maximum size of Lanczos system before
+ *   performing an implicit restart. Should be at least 4.
+ * @param tol Convergence tolerance. Lanczos iteration will
+ *   terminate when the residual norm is less than tol*theta, where
+ *   theta is an estimate for the largest unwanted eigenvalue
+ *   (i.e. the (nEigVecs+1)th largest eigenvalue).
+ * @param reorthogonalize Whether to reorthogonalize Lanczos
+ *   vectors.
+ * @param iter On exit, total number of Lanczos iterations performed.
+ * @param eigVals_dev (Output, device memory, nEigVecs entries)
+ *   Largest eigenvalues of matrix.
+ * @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
+ *   Eigenvectors corresponding to largest eigenvalues of
+ *   matrix. Vectors are stored as columns of a column-major matrix
+ *   with dimensions n x nEigVecs.
+ * @return NVGRAPH error flag.
+ */
+template <typename IndexType_, typename ValueType_>
+NVGRAPH_ERROR computeLargestEigenvectors(const Matrix<IndexType_, ValueType_> &A,
+                                         IndexType_ nEigVecs,
+                                         IndexType_ maxIter,
+                                         IndexType_ restartIter,
+                                         ValueType_ tol,
+                                         bool reorthogonalize,
+                                         IndexType_ &iter,
+                                         ValueType_ *__restrict__ eigVals_dev,
+                                         ValueType_ *__restrict__ eigVecs_dev)
+{
+  // CUDA stream
+  // TODO: handle non-zero streams
+  cudaStream_t stream = 0;
+
+  // Matrix dimension
+  IndexType_ n = A.n;
+
+  // Check that parameters are valid
+  if (A.m != A.n) {
+    WARNING("invalid parameter (matrix is not square)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (nEigVecs < 1) {
+    WARNING("invalid parameter (nEigVecs<1)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (restartIter < 1) {
+    WARNING("invalid parameter (restartIter<1)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (tol < 0) {
+    WARNING("invalid parameter (tol<0)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (nEigVecs > n) {
+    WARNING("invalid parameters (nEigVecs>n)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (maxIter < nEigVecs) {
+    WARNING("invalid parameters (maxIter<nEigVecs)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+
+  // Allocate memory
+  std::vector<ValueType_> alpha_host_v(restartIter);
+  std::vector<ValueType_> beta_host_v(restartIter);
+
+  ValueType_ *alpha_host = alpha_host_v.data();
+  ValueType_ *beta_host  = beta_host_v.data();
+
+  Vector<ValueType_> lanczosVecs_dev(n * (restartIter + 1), stream);
+  Vector<ValueType_> work_dev((n + restartIter) * restartIter, stream);
+
+  // Perform Lanczos method
+  IndexType_ effIter;
+  NVGRAPH_ERROR status = computeLargestEigenvectors(&A, nEigVecs, maxIter, restartIter,
+                                                    tol, reorthogonalize,
+                                                    &effIter, &iter,
+                                                    alpha_host, beta_host,
+                                                    lanczosVecs_dev.raw(), work_dev.raw(),
+                                                    eigVals_dev, eigVecs_dev);
+
+  // Clean up and return
+  return status;
+}
+
+// =========================================================
+// Explicit instantiation
+// =========================================================
+
+template NVGRAPH_ERROR computeSmallestEigenvectors<int, float>(
+  const Matrix<int, float> &A, int nEigVecs, int maxIter, int restartIter, float tol,
+  bool reorthogonalize, int &iter, float *__restrict__ eigVals_dev,
+  float *__restrict__ eigVecs_dev);
+template NVGRAPH_ERROR computeSmallestEigenvectors<int, double>(
+  const Matrix<int, double> &A, int nEigVecs, int maxIter, int restartIter, double tol,
+  bool reorthogonalize, int &iter, double *__restrict__ eigVals_dev,
+  double *__restrict__ eigVecs_dev);
+
+template NVGRAPH_ERROR computeLargestEigenvectors<int, float>(
+  const Matrix<int, float> &A, int nEigVecs, int maxIter, int restartIter, float tol,
+  bool reorthogonalize, int &iter, float *__restrict__ eigVals_dev,
+  float *__restrict__ eigVecs_dev);
+template NVGRAPH_ERROR computeLargestEigenvectors<int, double>(
+  const Matrix<int, double> &A, int nEigVecs, int maxIter, int restartIter, double tol,
+  bool reorthogonalize, int &iter, double *__restrict__ eigVals_dev,
+  double *__restrict__ eigVecs_dev);
+
+}  // namespace nvgraph
diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp
new file mode 100644
index 0000000000..bd90f3093a
--- /dev/null
+++ b/cpp/include/raft/spectral/modularity_maximization.hpp
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +//#ifdef NVGRAPH_PARTITION + +#include "include/modularity_maximization.hxx" + +#include +#include + +#include +#include +#include +#include +#include + +#include "include/debug_macros.h" +#include "include/kmeans.hxx" +#include "include/lanczos.hxx" +#include "include/nvgraph_cublas.hxx" +#include "include/nvgraph_error.hxx" +#include "include/nvgraph_vector.hxx" +#include "include/sm_utils.h" +#include "include/spectral_matrix.hxx" + +//#define COLLECT_TIME_STATISTICS 1 +//#undef COLLECT_TIME_STATISTICS + +#ifdef COLLECT_TIME_STATISTICS +#include +#include +#include +#include +#include "cuda_profiler_api.h" +#endif + +#ifdef COLLECT_TIME_STATISTICS +static double timer(void) +{ + struct timeval tv; + cudaDeviceSynchronize(); + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} +#endif + +namespace nvgraph { + +// ========================================================= +// Useful macros +// ========================================================= + +// Get index of matrix entry +#define IDX(i, j, lda) ((i) + (j) * (lda)) + +template +static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, ValueType_ *obs) +{ + IndexType_ i, j, k, index, mm; + ValueType_ alpha, v, last; + bool valid; + // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension + + // compute alpha + mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x + alpha = 0.0; + // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, + // li, mn); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < mm; i += blockDim.x) { + // check if the thread is valid + valid = i < m; + + // get the value of the last thread + last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + + // if you are valid read the value from memory, otherwise set your value to 0 + alpha = (valid) ? 
obs[i + j * m] : 0.0;
+      alpha = alpha * alpha;
+
+      // do prefix sum (of size warpSize=blockDim.x =< 32)
+      for (k = 1; k < blockDim.x; k *= 2) {
+        v = utils::shfl_up(alpha, k, blockDim.x);
+        if (threadIdx.x >= k) alpha += v;
+      }
+      // shift by last
+      alpha += last;
+    }
+  }
+
+  // scale by alpha
+  alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x);
+  alpha = std::sqrt(alpha);
+  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) {
+    for (i = threadIdx.x; i < m; i += blockDim.x) {  // blockDim.x=32
+      index      = i + j * m;
+      obs[index] = obs[index] / alpha;
+    }
+  }
+}
+
+template <typename IndexType_>
+IndexType_ next_pow2(IndexType_ n)
+{
+  IndexType_ v;
+  // Reference:
+  // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float
+  v = n - 1;
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  return v + 1;
+}
+
+template <typename IndexType_, typename ValueType_>
+cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs)
+{
+  IndexType_ p2m;
+  dim3 nthreads, nblocks;
+
+  // find next power of 2
+  p2m = next_pow2(m);
+  // setup launch configuration
+  nthreads.x = max(2, min(p2m, 32));
+  nthreads.y = 256 / nthreads.x;
+  nthreads.z = 1;
+  nblocks.x  = 1;
+  nblocks.y  = (n + nthreads.y - 1) / nthreads.y;
+  nblocks.z  = 1;
+  // printf("m=%d(%d),n=%d,obs=%p,
+  // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z);
+
+  // launch scaling kernel (scale each column of obs by its norm)
+  scale_obs_kernel<<<nblocks, nthreads>>>(m, n, obs);
+  cudaCheckError();
+
+  return cudaSuccess;
+}
+
+// =========================================================
+// Spectral modularity_maximization
+// =========================================================
+
+/** Compute partition for a weighted undirected graph. This
+ * partition attempts to maximize the modularity of the
+ * resulting clustering.
+ *
+ * @param G Weighted graph in CSR format
+ * @param nClusters Number of partitions.
+ * @param nEigVecs Number of eigenvectors to compute.
+ * @param maxIter_lanczos Maximum number of Lanczos iterations.
+ * @param restartIter_lanczos Maximum size of Lanczos system before
+ *   implicit restart.
+ * @param tol_lanczos Convergence tolerance for Lanczos method.
+ * @param maxIter_kmeans Maximum number of k-means iterations.
+ * @param tol_kmeans Convergence tolerance for k-means algorithm.
+ * @param parts (Output, device memory, n entries) Cluster
+ *   assignments.
+ * @param iters_lanczos On exit, number of Lanczos iterations
+ *   performed.
+ * @param iters_kmeans On exit, number of k-means iterations
+ *   performed.
+ * @return NVGRAPH error flag.
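+ *
+ * @note The pipeline embeds each vertex using the eigenvectors associated
+ *   with the nEigVecs largest eigenvalues of the modularity matrix, whitens
+ *   each eigenvector to zero mean and unit standard deviation, and then
+ *   clusters the embedded vertices with k-means. The usual Newman form
+ *   B = A - d*d^T/(2m) (d = weighted degree vector, m = total edge weight)
+ *   is assumed here for the modularity matrix; it is not spelled out in
+ *   this file.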
+ */
+template <typename vertex_t, typename edge_t, typename weight_t>
+NVGRAPH_ERROR modularity_maximization(
+  cugraph::experimental::GraphCSRView<vertex_t, edge_t, weight_t> const &graph,
+  vertex_t nClusters,
+  vertex_t nEigVecs,
+  int maxIter_lanczos,
+  int restartIter_lanczos,
+  weight_t tol_lanczos,
+  int maxIter_kmeans,
+  weight_t tol_kmeans,
+  vertex_t *__restrict__ clusters,
+  weight_t *eigVals,
+  weight_t *eigVecs,
+  int &iters_lanczos,
+  int &iters_kmeans)
+{
+  cudaStream_t stream = 0;
+  const weight_t zero{0.0};
+  const weight_t one{1.0};
+
+  edge_t i;
+  edge_t n = graph.number_of_vertices;
+
+  // k-means residual
+  weight_t residual_kmeans;
+
+  // Compute eigenvectors of Modularity Matrix
+
+  // Initialize Modularity Matrix
+  CsrMatrix<vertex_t, weight_t> A(false,
+                                  false,
+                                  graph.number_of_vertices,
+                                  graph.number_of_vertices,
+                                  graph.number_of_edges,
+                                  0,
+                                  graph.edge_data,
+                                  graph.offsets,
+                                  graph.indices);
+  ModularityMatrix<vertex_t, weight_t> B(A, graph.number_of_edges);
+
+  // Compute largest eigenvalues and eigenvectors
+  CHECK_NVGRAPH(computeLargestEigenvectors(B,
+                                           nEigVecs,
+                                           maxIter_lanczos,
+                                           restartIter_lanczos,
+                                           tol_lanczos,
+                                           false,
+                                           iters_lanczos,
+                                           eigVals,
+                                           eigVecs));
+
+  // eigVals.dump(0, nEigVecs);
+  // eigVecs.dump(0, nEigVecs);
+  // eigVecs.dump(n, nEigVecs);
+  // eigVecs.dump(2*n, nEigVecs);
+
+  // Whiten eigenvector matrix
+  for (i = 0; i < nEigVecs; ++i) {
+    weight_t mean, std;
+    mean = thrust::reduce(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                          thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)));
+    cudaCheckError();
+    mean /= n;
+    thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)),
+                      thrust::make_constant_iterator(mean),
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                      thrust::minus<weight_t>());
+    cudaCheckError();
+    std = Cublas::nrm2(n, eigVecs + IDX(0, i, n), 1) / std::sqrt(static_cast<weight_t>(n));
+    thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)),
+                      thrust::make_constant_iterator(std),
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                      thrust::divides<weight_t>());
+    cudaCheckError();
+  }
+
+  // Transpose eigenvector matrix
+  // TODO: in-place transpose
+  {
+    Vector<weight_t> work(nEigVecs * n, stream);
+    Cublas::set_pointer_mode_host();
+    Cublas::geam(true,
+                 false,
+                 nEigVecs,
+                 n,
+                 &one,
+                 eigVecs,
+                 n,
+                 &zero,
+                 (weight_t *)NULL,
+                 nEigVecs,
+                 work.raw(),
+                 nEigVecs);
+    CHECK_CUDA(cudaMemcpyAsync(
+      eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice));
+  }
+
+  // WARNING: notice that at this point the matrix has already been transposed, so we are scaling
+  // columns
+  scale_obs(nEigVecs, n, eigVecs);
+  cudaCheckError();
+
+  // eigVecs.dump(0, nEigVecs*n);
+  // Find partition with k-means clustering
+  CHECK_NVGRAPH(kmeans(n,
+                       nEigVecs,
+                       nClusters,
+                       tol_kmeans,
+                       maxIter_kmeans,
+                       eigVecs,
+                       clusters,
+                       residual_kmeans,
+                       iters_kmeans));
+
+  return NVGRAPH_OK;
+}
+
+// =========================================================
+// Analysis of graph partition
+// =========================================================
+
+namespace {
+/// Functor to generate indicator vectors
+/** For use in Thrust transform
+ */
+template <typename IndexType_, typename ValueType_>
+struct equal_to_i_op {
+  const IndexType_ i;
+
+ public:
+  equal_to_i_op(IndexType_ _i) : i(_i) {}
+  template <typename Tuple_>
+  __host__ __device__ void operator()(Tuple_ t)
+  {
+    thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0;
+  }
+};
+}  // namespace
+
+/// Compute modularity
+/** This function determines the modularity based on a graph and cluster assignments
+ * @param G Weighted graph in CSR format
+ * @param nClusters Number of clusters.
+ * @param parts (Input, device memory, n entries) Cluster assignments.
+ * @param modularity On exit, modularity
+ */
+template <typename vertex_t, typename edge_t, typename weight_t>
+NVGRAPH_ERROR analyzeModularity(
+  cugraph::experimental::GraphCSRView<vertex_t, edge_t, weight_t> const &graph,
+  vertex_t nClusters,
+  const vertex_t *__restrict__ parts,
+  weight_t &modularity)
+{
+  cudaStream_t stream = 0;
+  edge_t i;
+  edge_t n = graph.number_of_vertices;
+  weight_t partModularity, partSize;
+
+  // Device memory
+  Vector<weight_t> part_i(n, stream);
+  Vector<weight_t> Bx(n, stream);
+
+  // Initialize cuBLAS
+  Cublas::set_pointer_mode_host();
+
+  // Initialize Modularity matrix
+  CsrMatrix<vertex_t, weight_t> A(false,
+                                  false,
+                                  graph.number_of_vertices,
+                                  graph.number_of_vertices,
+                                  graph.number_of_edges,
+                                  0,
+                                  graph.edge_data,
+                                  graph.offsets,
+                                  graph.indices);
+  ModularityMatrix<vertex_t, weight_t> B(A, graph.number_of_edges);
+
+  // Initialize output
+  modularity = 0;
+
+  // Iterate through partitions
+  for (i = 0; i < nClusters; ++i) {
+    // Construct indicator vector for ith partition
+    thrust::for_each(
+      thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts),
+                                                   thrust::device_pointer_cast(part_i.raw()))),
+      thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts + n),
+                                                   thrust::device_pointer_cast(part_i.raw() + n))),
+      equal_to_i_op<vertex_t, weight_t>(i));
+    cudaCheckError();
+
+    // Compute size of ith partition
+    Cublas::dot(n, part_i.raw(), 1, part_i.raw(), 1, &partSize);
+    partSize = round(partSize);
+    if (partSize < 0.5) {
+      WARNING("empty partition");
+      continue;
+    }
+
+    // Compute modularity
+    B.mv(1, part_i.raw(), 0, Bx.raw());
+    Cublas::dot(n, Bx.raw(), 1, part_i.raw(), 1, &partModularity);
+
+    // Record results
+    modularity += partModularity;
+    // std::cout<< "partModularity " << partModularity << std::endl;
+  }
+
+  // Normalize by the sum of edge weights
+  modularity = modularity / B.getEdgeSum();
+
+  // Clean up and return
+  return NVGRAPH_OK;
+}
+
+// =========================================================
+// Explicit instantiation
+// =========================================================
+
+template NVGRAPH_ERROR modularity_maximization<int, int, float>(
+  cugraph::experimental::GraphCSRView<int, int, float> const &graph,
+  int nClusters,
+  int nEigVecs,
+  int maxIter_lanczos,
+  int restartIter_lanczos,
+  float tol_lanczos,
+  int maxIter_kmeans,
+  float tol_kmeans,
+  int *__restrict__ parts,
+  float *eigVals,
+  float *eigVecs,
+  int &iters_lanczos,
+  int &iters_kmeans);
+template NVGRAPH_ERROR modularity_maximization<int, int, double>(
+  cugraph::experimental::GraphCSRView<int, int, double> const &graph,
+  int nClusters,
+  int nEigVecs,
+  int maxIter_lanczos,
+  int restartIter_lanczos,
+  double tol_lanczos,
+  int maxIter_kmeans,
+  double tol_kmeans,
+  int *__restrict__ parts,
+  double *eigVals,
+  double *eigVecs,
+  int &iters_lanczos,
+  int &iters_kmeans);
+template NVGRAPH_ERROR analyzeModularity<int, int, float>(
+  cugraph::experimental::GraphCSRView<int, int, float> const &graph,
+  int nClusters,
+  const int *__restrict__ parts,
+  float &modularity);
+template NVGRAPH_ERROR analyzeModularity<int, int, double>(
+  cugraph::experimental::GraphCSRView<int, int, double> const &graph,
+  int nClusters,
+  const int *__restrict__ parts,
+  double &modularity);
+
+}  // namespace nvgraph
+//#endif //NVGRAPH_PARTITION
diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp
index 747ce510da..e4b9f50790 100644
--- a/cpp/include/raft/spectral/partition.hpp
+++ b/cpp/include/raft/spectral/partition.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,69 +14,411 @@
 * limitations under the License.
*/ -#pragma once - -namespace raft { - - /// Spectral graph partition - /** Compute partition for a weighted undirected graph. This - * partition attempts to minimize the cost function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) - * - * @param G Weighted graph in CSR format - * @param nParts Number of partitions. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter_lanczos Maximum number of Lanczos iterations. - * @param restartIter_lanczos Maximum size of Lanczos system before - * implicit restart. - * @param tol_lanczos Convergence tolerance for Lanczos method. - * @param maxIter_kmeans Maximum number of k-means iterations. - * @param tol_kmeans Convergence tolerance for k-means algorithm. - * @param parts (Output, device memory, n entries) Partition - * assignments. - * @param iters_lanczos On exit, number of Lanczos iterations - * performed. - * @param iters_kmeans On exit, number of k-means iterations - * performed. - * @return error flag. - */ - template typename GraphView> - int partition(GraphView const &graph, - vertex_t nParts, - vertex_t nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - weight_t tol_lanczos, - int maxIter_kmeans, - weight_t tol_kmeans, - vertex_t * __restrict__ parts, - weight_t *eigVals, - weight_t *eig_vects); - - /// Compute cost function for partition - /** This function determines the edges cut by a partition and a cost - * function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) - * Graph is assumed to be weighted and undirected. - * - * @param G Weighted graph in CSR format - * @param nParts Number of partitions. - * @param parts (Input, device memory, n entries) Partition - * assignments. - * @param edgeCut On exit, weight of edges cut by partition. - * @param cost On exit, partition cost function. - * @return error flag. - */ - template typename GraphView> - int analyzePartition(GraphView const &graph, - vertex_t nParts, - vertex_t const* __restrict__ parts, - weight_t& edgeCut, weight_t & cost); +#include "include/partition.hxx" +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nvgraph { + +// ========================================================= +// Useful macros +// ========================================================= + +// Get index of matrix entry +#define IDX(i, j, lda) ((i) + (j) * (lda)) + +template +static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, ValueType_ *obs) +{ + IndexType_ i, j, k, index, mm; + ValueType_ alpha, v, last; + bool valid; + // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension + + // compute alpha + mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x + alpha = 0.0; + // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, + // li, mn); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < mm; i += blockDim.x) { + // check if the thread is valid + valid = i < m; + + // get the value of the last thread + last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + + // if you are valid read the value from memory, otherwise set your value to 0 + alpha = (valid) ? 
obs[i + j * m] : 0.0; + alpha = alpha * alpha; + + // do prefix sum (of size warpSize=blockDim.x =< 32) + for (k = 1; k < blockDim.x; k *= 2) { + v = utils::shfl_up(alpha, k, blockDim.x); + if (threadIdx.x >= k) alpha += v; + } + // shift by last + alpha += last; + } + } + + // scale by alpha + alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + alpha = std::sqrt(alpha); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 + index = i + j * m; + obs[index] = obs[index] / alpha; + } + } +} + +template +IndexType_ next_pow2(IndexType_ n) +{ + IndexType_ v; + // Reference: + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float + v = n - 1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v + 1; +} + +template +cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) +{ + IndexType_ p2m; + dim3 nthreads, nblocks; + + // find next power of 2 + p2m = next_pow2(m); + // setup launch configuration + nthreads.x = max(2, min(p2m, 32)); + nthreads.y = 256 / nthreads.x; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = (n + nthreads.y - 1) / nthreads.y; + nblocks.z = 1; + // printf("m=%d(%d),n=%d,obs=%p, + // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); + + // launch scaling kernel (scale each column of obs by its norm) + scale_obs_kernel<<>>(m, n, obs); + cudaCheckError(); + + return cudaSuccess; +} + +// ========================================================= +// Spectral partitioner +// ========================================================= + +/// Compute spectral graph partition +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nParts Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param parts (Output, device memory, n entries) Partition + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return NVGRAPH error flag. 
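+ *
+ * @note The embedding clustered by k-means consists of the eigenvectors of
+ *   the graph Laplacian associated with its nEigVecs smallest eigenvalues.
+ *   The Laplacian is assumed here in its usual form L = D - A, with D the
+ *   diagonal degree matrix and A the weighted adjacency matrix.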
+ */ +template +NVGRAPH_ERROR partition( + cugraph::experimental::GraphCSRView const &graph, + vertex_t nParts, + vertex_t nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + weight_t tol_lanczos, + int maxIter_kmeans, + weight_t tol_kmeans, + vertex_t *__restrict__ parts, + weight_t *eigVals, + weight_t *eigVecs) +{ + cudaStream_t stream = 0; + + const weight_t zero{0.0}; + const weight_t one{1.0}; + + int iters_lanczos; + int iters_kmeans; + + edge_t i; + edge_t n = graph.number_of_vertices; + + // k-means residual + weight_t residual_kmeans; + + // ------------------------------------------------------- + // Spectral partitioner + // ------------------------------------------------------- + + // Compute eigenvectors of Laplacian + + // Initialize Laplacian + CsrMatrix A(false, + false, + graph.number_of_vertices, + graph.number_of_vertices, + graph.number_of_edges, + 0, + graph.edge_data, + graph.offsets, + graph.indices); + LaplacianMatrix L(A); + + // Compute smallest eigenvalues and eigenvectors + CHECK_NVGRAPH(computeSmallestEigenvectors(L, + nEigVecs, + maxIter_lanczos, + restartIter_lanczos, + tol_lanczos, + false, + iters_lanczos, + eigVals, + eigVecs)); + + // Whiten eigenvector matrix + for (i = 0; i < nEigVecs; ++i) { + weight_t mean, std; + + mean = thrust::reduce(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + cudaCheckError(); + mean /= n; + thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(mean), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::minus()); + cudaCheckError(); + std = Cublas::nrm2(n, eigVecs + IDX(0, i, n), 1) / std::sqrt(static_cast(n)); + thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(std), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::divides()); + cudaCheckError(); + } + + // Transpose eigenvector matrix + // TODO: in-place transpose + { + Vector work(nEigVecs * n, stream); + Cublas::set_pointer_mode_host(); + Cublas::geam(true, + false, + nEigVecs, + n, + &one, + eigVecs, + n, + &zero, + (weight_t *)NULL, + nEigVecs, + work.raw(), + nEigVecs); + CHECK_CUDA(cudaMemcpyAsync( + eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice)); + } + + // Clean up + + // eigVecs.dump(0, nEigVecs*n); + // Find partition with k-means clustering + CHECK_NVGRAPH(kmeans(n, + nEigVecs, + nParts, + tol_kmeans, + maxIter_kmeans, + eigVecs, + parts, + residual_kmeans, + iters_kmeans)); + + return NVGRAPH_OK; +} + +// ========================================================= +// Analysis of graph partition +// ========================================================= + +namespace { +/// Functor to generate indicator vectors +/** For use in Thrust transform + */ +template +struct equal_to_i_op { + const IndexType_ i; + + public: + equal_to_i_op(IndexType_ _i) : i(_i) {} + template + __host__ __device__ void operator()(Tuple_ t) + { + thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0; + } +}; +} // namespace + +/// Compute cost function for partition +/** This function determines the edges cut by a partition and a cost + * function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * Graph is assumed to be weighted and undirected. 
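+ * For the 0/1 indicator vector x_i of the ith partition, x_i^T L x_i equals
+ * the total weight of edges with exactly one endpoint in that partition;
+ * summing this over all partitions counts every cut edge twice, which is why
+ * the implementation divides the accumulated edge cut by 2.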
+ * + * @param G Weighted graph in CSR format + * @param nParts Number of partitions. + * @param parts (Input, device memory, n entries) Partition + * assignments. + * @param edgeCut On exit, weight of edges cut by partition. + * @param cost On exit, partition cost function. + * @return NVGRAPH error flag. + */ +template +NVGRAPH_ERROR analyzePartition( + cugraph::experimental::GraphCSRView const &graph, + vertex_t nParts, + const vertex_t *__restrict__ parts, + weight_t &edgeCut, + weight_t &cost) +{ + cudaStream_t stream = 0; + + edge_t i; + edge_t n = graph.number_of_vertices; + + weight_t partEdgesCut, partSize; + + // Device memory + Vector part_i(n, stream); + Vector Lx(n, stream); + + // Initialize cuBLAS + Cublas::set_pointer_mode_host(); + + // Initialize Laplacian + CsrMatrix A(false, + false, + graph.number_of_vertices, + graph.number_of_vertices, + graph.number_of_edges, + 0, + graph.edge_data, + graph.offsets, + graph.indices); + LaplacianMatrix L(A); + + // Initialize output + cost = 0; + edgeCut = 0; + + // Iterate through partitions + for (i = 0; i < nParts; ++i) { + // Construct indicator vector for ith partition + thrust::for_each( + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(i)); + cudaCheckError(); + + // Compute size of ith partition + Cublas::dot(n, part_i.raw(), 1, part_i.raw(), 1, &partSize); + partSize = round(partSize); + if (partSize < 0.5) { + WARNING("empty partition"); + continue; + } + + // Compute number of edges cut by ith partition + L.mv(1, part_i.raw(), 0, Lx.raw()); + Cublas::dot(n, Lx.raw(), 1, part_i.raw(), 1, &partEdgesCut); + + // Record results + cost += partEdgesCut / partSize; + edgeCut += partEdgesCut / 2; + } + + // Clean up and return + return NVGRAPH_OK; } + +// ========================================================= +// Explicit instantiation +// ========================================================= +template NVGRAPH_ERROR partition( + cugraph::experimental::GraphCSRView const &graph, + int nParts, + int nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + float tol_lanczos, + int maxIter_kmeans, + float tol_kmeans, + int *__restrict__ parts, + float *eigVals, + float *eigVecs); + +template NVGRAPH_ERROR partition( + cugraph::experimental::GraphCSRView const &graph, + int nParts, + int nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + double tol_lanczos, + int maxIter_kmeans, + double tol_kmeans, + int *__restrict__ parts, + double *eigVals, + double *eigVecs); + +template NVGRAPH_ERROR analyzePartition( + cugraph::experimental::GraphCSRView const &graph, + int nParts, + const int *__restrict__ parts, + float &edgeCut, + float &cost); +template NVGRAPH_ERROR analyzePartition( + cugraph::experimental::GraphCSRView const &graph, + int nParts, + const int *__restrict__ parts, + double &edgeCut, + double &cost); + +} // namespace nvgraph diff --git a/cpp/include/raft/spectral/spectral_matrix.hpp b/cpp/include/raft/spectral/spectral_matrix.hpp new file mode 100644 index 0000000000..c77bb8e5a0 --- /dev/null +++ b/cpp/include/raft/spectral/spectral_matrix.hpp @@ -0,0 +1,1185 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +// #include +// #include +// #include +// #include +// #include + +#include + +// CUDA block size +#define BLOCK_SIZE 1024 + +// Get index of matrix entry +#define IDX(i, j, lda) ((i) + (j) * (lda)) + +namespace raft { +namespace matrix { + void check_size(size_t sz) + { + if (sz > INT_MAX) FatalError("Vector larger than INT_MAX", ERR_BAD_PARAMETERS); + } + template + void nrm1_raw_vec(ValueType_* vec, size_t n, ValueType_* res, cudaStream_t stream) + { + thrust::device_ptr dev_ptr(vec); + *res = thrust::reduce(dev_ptr, dev_ptr + n); + cudaCheckError(); + } + + template + void fill_raw_vec(ValueType_* vec, size_t n, ValueType_ value, cudaStream_t stream) + { + thrust::device_ptr dev_ptr(vec); + thrust::fill(dev_ptr, dev_ptr + n, value); + cudaCheckError(); + } + + template + void dump_raw_vec(ValueType_* vec, size_t n, int offset, cudaStream_t stream) + { +#ifdef DEBUG + thrust::device_ptr dev_ptr(vec); + COUT().precision(15); + COUT() << "sample size = " << n << ", offset = " << offset << std::endl; + thrust::copy( + dev_ptr + offset, dev_ptr + offset + n, std::ostream_iterator(COUT(), " ")); + cudaCheckError(); + COUT() << std::endl; +#endif + } + + template + __global__ void flag_zeroes_kernel(int num_vertices, ValueType_* vec, int* flags) + { + int tidx = blockDim.x * blockIdx.x + threadIdx.x; + for (int r = tidx; r < num_vertices; r += blockDim.x * gridDim.x) { + if (vec[r] != 0.0) + flags[r] = 1; // NOTE 2 : alpha*0 + (1-alpha)*1 = (1-alpha) + else + flags[r] = 0; + } + } + template + __global__ void dmv0_kernel(const ValueType_* __restrict__ D, + const ValueType_* __restrict__ x, + ValueType_* __restrict__ y, + int n) + { + // y=D*x + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = tidx; i < n; i += blockDim.x * gridDim.x) y[i] = D[i] * x[i]; + } + template + __global__ void dmv1_kernel(const ValueType_* __restrict__ D, + const ValueType_* __restrict__ x, + ValueType_* __restrict__ y, + int n) + { + // y+=D*x + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = tidx; i < n; i += blockDim.x * gridDim.x) y[i] += D[i] * x[i]; + } + template + void copy_vec(ValueType_* vec1, size_t n, ValueType_* res, cudaStream_t stream) + { + thrust::device_ptr dev_ptr(vec1); + thrust::device_ptr res_ptr(res); +#ifdef DEBUG + // COUT() << "copy "<< n << " elements" << std::endl; +#endif + thrust::copy_n(dev_ptr, n, res_ptr); + cudaCheckError(); + // dump_raw_vec (res, n, 0); + } + + template + void flag_zeros_raw_vec(size_t num_vertices, ValueType_* vec, int* flags, cudaStream_t stream) + { + int items_per_thread = 4; + int num_threads = 128; + int max_grid_size = 4096; + check_size(num_vertices); + int n = static_cast(num_vertices); + int num_blocks = std::min(max_grid_size, (n / (items_per_thread * num_threads)) + 1); + flag_zeroes_kernel<<>>(num_vertices, vec, flags); + cudaCheckError(); + } + + template + void dmv(size_t num_vertices, + ValueType_ alpha, + ValueType_* D, + ValueType_* x, + ValueType_ beta, + ValueType_* y, + cudaStream_t stream) + { + int items_per_thread = 4; + int num_threads = 128; + int max_grid_size = 4096; 
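+    // Launch configuration: fixed 128-thread blocks, sized so each thread
+    // covers roughly items_per_thread entries, capped at max_grid_size
+    // blocks; the kernels' grid-stride loops handle any remainder.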
+ check_size(num_vertices); + int n = static_cast(num_vertices); + int num_blocks = std::min(max_grid_size, (n / (items_per_thread * num_threads)) + 1); + if (alpha == 1.0 && beta == 0.0) + dmv0_kernel<<>>(D, x, y, n); + else if (alpha == 1.0 && beta == 1.0) + dmv1_kernel<<>>(D, x, y, n); + else + FatalError("Not implemented case of y = D*x", ERR_BAD_PARAMETERS); + + cudaCheckError(); + } + + template + void set_connectivity(size_t n, + IndexType_ root, + ValueType_ self_loop_val, + ValueType_ unreachable_val, + ValueType_* res, + cudaStream_t stream) + { + fill_raw_vec(res, n, unreachable_val); + cudaMemcpy(&res[root], &self_loop_val, sizeof(self_loop_val), cudaMemcpyHostToDevice); + cudaCheckError(); + } + + + /*! A Vector contains a device vector of size |E| and type T + */ + template + class Vector { + public: + typedef ValueType_ ValueType; + + protected: + rmm::device_vector values; + + public: + /*! Construct an empty \p Vector. + */ + Vector(void) {} + ~Vector(void) {} + /*! Construct a \p Vector of size vertices. + * + * \param vertices The size of the Vector + */ + Vector(size_t vertices, cudaStream_t stream = 0) + : values(vertices) {} + + size_t get_size() const { return values.size(); } + size_t bytes() const { return values.size()*sizeof(ValueType);} + ValueType const *raw() const { return values.data().get(); } + ValueType *raw() { return values.data().get(); } + + void allocate(size_t n, cudaStream_t stream = 0) + { + values.resize(n); + } + + void fill(ValueType val, cudaStream_t stream = 0) + { + fill_raw_vec(this->raw(), this->get_size(), val, stream); + } + + void copy(Vector &vec1, cudaStream_t stream = 0) + { + if (this->get_size() == 0 && vec1.get_size()>0) { + allocate(vec1.get_size(), stream); + copy_vec(vec1.raw(), this->get_size(), this->raw(), stream); + } else if (this->get_size() == vec1.get_size()) + copy_vec(vec1.raw(), this->get_size(), this->raw(), stream); + else if (this->get_size() > vec1.get_size()) { + copy_vec(vec1.raw(), vec1.get_size(), this->raw(), stream); + } else { + FatalError("Cannot copy a vector into a smaller one", ERR_BAD_PARAMETERS); + } + } + + ValueType nrm1(cudaStream_t stream = 0) { + ValueType res = 0; + nrm1_raw_vec(this->raw(), this->get_size(), &res, stream); + return res; + } + }; // class Vector + + /// Abstract matrix class + /** Derived classes must implement matrix-vector products. + */ + template + class Matrix { + public: + /// Number of rows + const IndexType_ m; + /// Number of columns + const IndexType_ n; + /// CUDA stream + cudaStream_t s; + + /// Constructor + /** @param _m Number of rows. + * @param _n Number of columns. + */ + Matrix(IndexType_ _m, IndexType_ _n) : m(_m), n(_n), s(0){} + + /// Destructor + virtual ~Matrix() {} + + + /// Get and Set CUDA stream + virtual void setCUDAStream(cudaStream_t _s) = 0; + virtual void getCUDAStream(cudaStream_t *_s) = 0; + + /// Matrix-vector product + /** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n entries) Vector. + * @param beta Scalar. + * @param y (Input/output, device memory, m entries) Output + * vector. 
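+   *
+   *  For example, mv(1, x, 0, y) overwrites y with A*x, while
+   *  mv(1, x, 1, y) accumulates A*x into y.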
+ */ + virtual void mv(ValueType_ alpha, + const ValueType_ * __restrict__ x, + ValueType_ beta, + ValueType_ * __restrict__ y) const = 0; + + virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const = 0; + /// Color and Reorder + virtual void color(IndexType_ *c, IndexType_ *p) const = 0; + virtual void reorder(IndexType_ *p) const = 0; + + /// Incomplete Cholesky (setup, factor and solve) + virtual void prec_setup(Matrix * _M) = 0; + virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const = 0; + + //Get the sum of all edges + virtual ValueType_ getEdgeSum() const = 0; + }; + + /// Dense matrix class + template + class DenseMatrix : public Matrix { + + private: + /// Whether to transpose matrix + const bool trans; + /// Matrix entries, stored column-major in device memory + const ValueType_ * A; + /// Leading dimension of matrix entry array + const IndexType_ lda; + + public: + /// Constructor + DenseMatrix(bool _trans, + IndexType_ _m, IndexType_ _n, + const ValueType_ * _A, IndexType_ _lda); + + /// Destructor + virtual ~DenseMatrix(); + + /// Get and Set CUDA stream + virtual void setCUDAStream(cudaStream_t _s); + virtual void getCUDAStream(cudaStream_t *_s); + + /// Matrix-vector product + virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const; + /// Matrix-set of k vectors product + virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Color and Reorder + virtual void color(IndexType_ *c, IndexType_ *p) const; + virtual void reorder(IndexType_ *p) const; + + /// Incomplete Cholesky (setup, factor and solve) + virtual void prec_setup(Matrix * _M); + virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; + + //Get the sum of all edges + virtual ValueType_ getEdgeSum() const; + }; + + /// Sparse matrix class in CSR format + template + class CsrMatrix : public Matrix { + + private: + /// Whether to transpose matrix + const bool trans; + /// Whether matrix is stored in symmetric format + const bool sym; + /// Number of non-zero entries + const IndexType_ nnz; + /// Matrix properties + const cusparseMatDescr_t descrA; + /// Matrix entry values (device memory) + /*const*/ ValueType_ * csrValA; + /// Pointer to first entry in each row (device memory) + const IndexType_ * csrRowPtrA; + /// Column index of each matrix entry (device memory) + const IndexType_ * csrColIndA; + /// Analysis info (pointer to opaque CUSPARSE struct) + cusparseSolveAnalysisInfo_t info_l; + cusparseSolveAnalysisInfo_t info_u; + /// factored flag (originally set to false, then reset to true after factorization), + /// notice we only want to factor once + bool factored; + + public: + /// Constructor + CsrMatrix(bool _trans, bool _sym, + IndexType_ _m, IndexType_ _n, IndexType_ _nnz, + const cusparseMatDescr_t _descrA, + /*const*/ ValueType_ * _csrValA, + const IndexType_ * _csrRowPtrA, + const IndexType_ * _csrColIndA); + + /// Destructor + virtual ~CsrMatrix(); + + /// Get and Set CUDA stream + virtual void setCUDAStream(cudaStream_t _s); + virtual void getCUDAStream(cudaStream_t *_s); + + + /// Matrix-vector product + virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const; + /// Matrix-set 
of k vectors product + virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Color and Reorder + virtual void color(IndexType_ *c, IndexType_ *p) const; + virtual void reorder(IndexType_ *p) const; + + /// Incomplete Cholesky (setup, factor and solve) + virtual void prec_setup(Matrix * _M); + virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; + + //Get the sum of all edges + virtual ValueType_ getEdgeSum() const; + }; + + /// Graph Laplacian matrix + template + class LaplacianMatrix + : public Matrix { + + private: + /// Adjacency matrix + /*const*/ Matrix * A; + /// Degree of each vertex + Vector D; + /// Preconditioning matrix + Matrix * M; + + public: + /// Constructor + LaplacianMatrix(/*const*/ Matrix & _A); + + /// Destructor + virtual ~LaplacianMatrix(); + + /// Get and Set CUDA stream + virtual void setCUDAStream(cudaStream_t _s); + virtual void getCUDAStream(cudaStream_t *_s); + + /// Matrix-vector product + virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const; + /// Matrix-set of k vectors product + virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Scale a set of k vectors by a diagonal + virtual void dm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Color and Reorder + virtual void color(IndexType_ *c, IndexType_ *p) const; + virtual void reorder(IndexType_ *p) const; + + /// Solve preconditioned system M x = f for a set of k vectors + virtual void prec_setup(Matrix * _M); + virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; + + //Get the sum of all edges + virtual ValueType_ getEdgeSum() const; + }; + + /// Modularity matrix + template + class ModularityMatrix + : public Matrix { + + private: + /// Adjacency matrix + /*const*/ Matrix * A; + /// Degree of each vertex + Vector D; + IndexType_ nnz; + ValueType_ edge_sum; + + /// Preconditioning matrix + Matrix * M; + + public: + /// Constructor + ModularityMatrix(/*const*/ Matrix & _A, IndexType_ _nnz); + + /// Destructor + virtual ~ModularityMatrix(); + + /// Get and Set CUDA stream + virtual void setCUDAStream(cudaStream_t _s); + virtual void getCUDAStream(cudaStream_t *_s); + + /// Matrix-vector product + virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const; + /// Matrix-set of k vectors product + virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Scale a set of k vectors by a diagonal + virtual void dm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Color and Reorder + virtual void color(IndexType_ *c, IndexType_ *p) const; + virtual void reorder(IndexType_ *p) const; + + /// Solve preconditioned system M x = f for a set of k vectors + virtual void prec_setup(Matrix * _M); + virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; + + //Get the sum of all edges + virtual ValueType_ getEdgeSum() const; + }; + +// ============================================= 
+// CUDA kernels
+// =============================================
+
+namespace {
+
+/// Apply diagonal matrix to vector
+template <typename IndexType_, typename ValueType_>
+static __global__ void diagmv(IndexType_ n,
+                              ValueType_ alpha,
+                              const ValueType_ *__restrict__ D,
+                              const ValueType_ *__restrict__ x,
+                              ValueType_ *__restrict__ y)
+{
+  IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x;
+  while (i < n) {
+    y[i] += alpha * D[i] * x[i];
+    i += blockDim.x * gridDim.x;
+  }
+}
+
+/// Apply diagonal matrix to a set of dense vectors (tall matrix)
+template <typename IndexType_, typename ValueType_, bool beta_is_zero>
+static __global__ void diagmm(IndexType_ n,
+                              IndexType_ k,
+                              ValueType_ alpha,
+                              const ValueType_ *__restrict__ D,
+                              const ValueType_ *__restrict__ x,
+                              ValueType_ beta,
+                              ValueType_ *__restrict__ y)
+{
+  IndexType_ i, j, index;
+
+  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < k; j += blockDim.y * gridDim.y) {
+    for (i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) {
+      index = i + j * n;
+      if (beta_is_zero) {
+        y[index] = alpha * D[i] * x[index];
+      } else {
+        y[index] = alpha * D[i] * x[index] + beta * y[index];
+      }
+    }
+  }
+}
+}  // namespace
+
+// =============================================
+// Dense matrix class
+// =============================================
+
+/// Constructor for dense matrix class
+/** @param _trans Whether to transpose matrix.
+ *  @param _m Number of rows.
+ *  @param _n Number of columns.
+ *  @param _A (Input, device memory, _m*_n entries) Matrix
+ *    entries, stored column-major.
+ *  @param _lda Leading dimension of _A.
+ */
+template <typename IndexType_, typename ValueType_>
+DenseMatrix<IndexType_, ValueType_>::DenseMatrix(
+  bool _trans, IndexType_ _m, IndexType_ _n, const ValueType_ *_A, IndexType_ _lda)
+  : Matrix<IndexType_, ValueType_>(_m, _n), trans(_trans), A(_A), lda(_lda)
+{
+  Cublas::set_pointer_mode_host();
+  if (_lda < _m) FatalError("invalid dense matrix parameter (lda<m)", NVGRAPH_ERR_BAD_PARAMETERS);
+}
+
+/// Destructor for dense matrix class
+template <typename IndexType_, typename ValueType_>
+DenseMatrix<IndexType_, ValueType_>::~DenseMatrix()
+{
+}
+
+/// Get and Set CUDA stream
+template <typename IndexType_, typename ValueType_>
+void DenseMatrix<IndexType_, ValueType_>::setCUDAStream(cudaStream_t _s)
+{
+  this->s = _s;
+  // printf("DenseMatrix setCUDAStream stream=%p\n",this->s);
+  Cublas::setStream(_s);
+}
+template <typename IndexType_, typename ValueType_>
+void DenseMatrix<IndexType_, ValueType_>::getCUDAStream(cudaStream_t *_s)
+{
+  *_s = this->s;
+  // CHECK_CUBLAS(cublasGetStream(cublasHandle, _s));
+}
+
+/// Matrix-vector product for dense matrix class
+/** y is overwritten with alpha*A*x+beta*y.
+ *
+ * @param alpha Scalar.
+ * @param x (Input, device memory, n entries) Vector.
+ * @param beta Scalar.
+ * @param y (Input/output, device memory, m entries) Output vector.
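+ *
+ * A minimal usage sketch (illustrative only; d_A, d_x, d_y are hypothetical
+ * device pointers, not part of the original header):
+ * @code
+ *   // y = 2*A*x for a 4x3 column-major matrix stored with lda = 4
+ *   DenseMatrix<int, double> A(false, 4, 3, d_A, 4);
+ *   A.mv(2.0, d_x, 0.0, d_y);
+ * @endcode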
+ */
+template <typename IndexType_, typename ValueType_>
+void DenseMatrix<IndexType_, ValueType_>::mv(ValueType_ alpha,
+                                             const ValueType_ *__restrict__ x,
+                                             ValueType_ beta,
+                                             ValueType_ *__restrict__ y) const
+{
+  Cublas::gemv(this->trans, this->m, this->n, &alpha, this->A, this->lda, x, 1, &beta, y, 1);
+}
+
+template <typename IndexType_, typename ValueType_>
+void DenseMatrix<IndexType_, ValueType_>::mm(IndexType_ k,
+                                             ValueType_ alpha,
+                                             const ValueType_ *__restrict__ x,
+                                             ValueType_ beta,
+                                             ValueType_ *__restrict__ y) const
+{
+  Cublas::gemm(
+    this->trans, false, this->m, k, this->n, &alpha, A, lda, x, this->m, &beta, y, this->n);
+}
+
+/// Color and Reorder
+template <typename IndexType_, typename ValueType_>
+void DenseMatrix<IndexType_, ValueType_>::color(IndexType_ *c, IndexType_ *p) const
+{
+}
+
+template <typename IndexType_, typename ValueType_>
+void DenseMatrix<IndexType_, ValueType_>::reorder(IndexType_ *p) const
+{
+}
+
+/// Incomplete Cholesky (setup, factor and solve)
+template <typename IndexType_, typename ValueType_>
+void DenseMatrix<IndexType_, ValueType_>::prec_setup(Matrix<IndexType_, ValueType_> *_M)
+{
+  printf("ERROR: DenseMatrix prec_setup dispatched\n");
+  // exit(1);
+}
+
+template <typename IndexType_, typename ValueType_>
+void DenseMatrix<IndexType_, ValueType_>::prec_solve(IndexType_ k,
+                                                     ValueType_ alpha,
+                                                     ValueType_ *__restrict__ fx,
+                                                     ValueType_ *__restrict__ t) const
+{
+  printf("ERROR: DenseMatrix prec_solve dispatched\n");
+  // exit(1);
+}
+
+template <typename IndexType_, typename ValueType_>
+ValueType_ DenseMatrix<IndexType_, ValueType_>::getEdgeSum() const
+{
+  return 0.0;
+}
+
+// =============================================
+// CSR matrix class
+// =============================================
+
+/// Constructor for CSR matrix class
+/** @param _trans Whether to transpose matrix.
+ * @param _sym Whether matrix is stored in symmetric format.
+ * @param _m Number of rows.
+ * @param _n Number of columns.
+ * @param _nnz Number of non-zero entries.
+ * @param _descrA Matrix properties.
+ * @param _csrValA (Input, device memory, _nnz entries) Matrix
+ *   entry values.
+ * @param _csrRowPtrA (Input, device memory, _m+1 entries) Pointer
+ *   to first entry in each row.
+ * @param _csrColIndA (Input, device memory, _nnz entries) Column
+ *   index of each matrix entry.
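+ *
+ * A construction sketch (illustrative only; a 3x3 identity with hypothetical
+ * device pointers, cuSPARSE descriptor setup elided):
+ * @code
+ *   // rowPtr = {0,1,2,3}, colInd = {0,1,2}, val = {1,1,1}
+ *   CsrMatrix<int, float> A(false, false, 3, 3, 3, descrA,
+ *                           d_val, d_rowPtr, d_colInd);
+ * @endcode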
+ */
+template <typename IndexType_, typename ValueType_>
+CsrMatrix<IndexType_, ValueType_>::CsrMatrix(bool _trans,
+                                             bool _sym,
+                                             IndexType_ _m,
+                                             IndexType_ _n,
+                                             IndexType_ _nnz,
+                                             const cusparseMatDescr_t _descrA,
+                                             /*const*/ ValueType_ *_csrValA,
+                                             const IndexType_ *_csrRowPtrA,
+                                             const IndexType_ *_csrColIndA)
+  : Matrix<IndexType_, ValueType_>(_m, _n),
+    trans(_trans),
+    sym(_sym),
+    nnz(_nnz),
+    descrA(_descrA),
+    csrValA(_csrValA),
+    csrRowPtrA(_csrRowPtrA),
+    csrColIndA(_csrColIndA)
+{
+  if (nnz < 0) FatalError("invalid CSR matrix parameter (nnz<0)", NVGRAPH_ERR_BAD_PARAMETERS);
+  Cusparse::set_pointer_mode_host();
+}
+
+/// Destructor for CSR matrix class
+template <typename IndexType_, typename ValueType_>
+CsrMatrix<IndexType_, ValueType_>::~CsrMatrix()
+{
+}
+
+/// Get and Set CUDA stream
+template <typename IndexType_, typename ValueType_>
+void CsrMatrix<IndexType_, ValueType_>::setCUDAStream(cudaStream_t _s)
+{
+  this->s = _s;
+  // printf("CsrMatrix setCUDAStream stream=%p\n",this->s);
+  Cusparse::setStream(_s);
+}
+template <typename IndexType_, typename ValueType_>
+void CsrMatrix<IndexType_, ValueType_>::getCUDAStream(cudaStream_t *_s)
+{
+  *_s = this->s;
+  // CHECK_CUSPARSE(cusparseGetStream(Cusparse::get_handle(), _s));
+}
+template <typename IndexType_, typename ValueType_>
+void CsrMatrix<IndexType_, ValueType_>::mm(IndexType_ k,
+                                           ValueType_ alpha,
+                                           const ValueType_ *__restrict__ x,
+                                           ValueType_ beta,
+                                           ValueType_ *__restrict__ y) const
+{
+  // CHECK_CUSPARSE(cusparseXcsrmm(Cusparse::get_handle(), transA, this->m, k, this->n, nnz, &alpha,
+  // descrA, csrValA, csrRowPtrA, csrColIndA, x, this->n, &beta, y, this->m));
+  Cusparse::csrmm(this->trans,
+                  this->sym,
+                  this->m,
+                  k,
+                  this->n,
+                  this->nnz,
+                  &alpha,
+                  this->csrValA,
+                  this->csrRowPtrA,
+                  this->csrColIndA,
+                  x,
+                  this->n,
+                  &beta,
+                  y,
+                  this->m);
+}
+
+/// Color and Reorder
+template <typename IndexType_, typename ValueType_>
+void CsrMatrix<IndexType_, ValueType_>::color(IndexType_ *c, IndexType_ *p) const
+{
+}
+
+template <typename IndexType_, typename ValueType_>
+void CsrMatrix<IndexType_, ValueType_>::reorder(IndexType_ *p) const
+{
+}
+
+/// Incomplete Cholesky (setup, factor and solve)
+template <typename IndexType_, typename ValueType_>
+void CsrMatrix<IndexType_, ValueType_>::prec_setup(Matrix<IndexType_, ValueType_> *_M)
+{
+  // printf("CsrMatrix prec_setup dispatched\n");
+  if (!factored) {
+    // analyse lower triangular factor
+    CHECK_CUSPARSE(cusparseCreateSolveAnalysisInfo(&info_l));
+    CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER));
+    CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_UNIT));
+    CHECK_CUSPARSE(cusparseXcsrsm_analysis(Cusparse::get_handle(),
+                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                           this->m,
+                                           nnz,
+                                           descrA,
+                                           csrValA,
+                                           csrRowPtrA,
+                                           csrColIndA,
+                                           info_l));
+    // analyse upper triangular factor
+    CHECK_CUSPARSE(cusparseCreateSolveAnalysisInfo(&info_u));
+    CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_UPPER));
+    CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT));
+    CHECK_CUSPARSE(cusparseXcsrsm_analysis(Cusparse::get_handle(),
+                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                           this->m,
+                                           nnz,
+                                           descrA,
+                                           csrValA,
+                                           csrRowPtrA,
+                                           csrColIndA,
+                                           info_u));
+    // perform csrilu0 (should be slightly faster than csric0)
+    CHECK_CUSPARSE(cusparseXcsrilu0(Cusparse::get_handle(),
+                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                    this->m,
+                                    descrA,
+                                    csrValA,
+                                    csrRowPtrA,
+                                    csrColIndA,
+                                    info_l));
+    // set factored flag to true
+    factored = true;
+  }
+}
+
+template <typename IndexType_, typename ValueType_>
+void CsrMatrix<IndexType_, ValueType_>::prec_solve(IndexType_ k,
+                                                   ValueType_ alpha,
+                                                   ValueType_ *__restrict__ fx,
+                                                   ValueType_ *__restrict__ t) const
+{
+  // printf("CsrMatrix prec_solve dispatched (stream %p)\n",this->s);
+
+  // preconditioning Mx=f (where M = L*U, therefore x=U\(L\f))
+  // solve lower triangular factor
+  CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER));
+  CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_UNIT));
+  CHECK_CUSPARSE(cusparseXcsrsm_solve(Cusparse::get_handle(),
+                                      CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                      this->m,
+                                      k,
+                                      alpha,
+                                      descrA,
+                                      csrValA,
+                                      csrRowPtrA,
+                                      csrColIndA,
+                                      info_l,
+                                      fx,
+                                      this->m,
+                                      t,
+                                      this->m));
+  // solve upper triangular factor
+  CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_UPPER));
+  CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT));
+  CHECK_CUSPARSE(cusparseXcsrsm_solve(Cusparse::get_handle(),
+                                      CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                      this->m,
+                                      k,
+                                      alpha,
+                                      descrA,
+                                      csrValA,
+                                      csrRowPtrA,
+                                      csrColIndA,
+                                      info_u,
+                                      t,
+                                      this->m,
+                                      fx,
+                                      this->m));
+}
+
+/// Matrix-vector product for CSR matrix class
+/** y is overwritten with alpha*A*x+beta*y.
+ *
+ * @param alpha Scalar.
+ * @param x (Input, device memory, n entries) Vector.
+ * @param beta Scalar.
+ * @param y (Input/output, device memory, m entries) Output vector.
+ */
+template <typename IndexType_, typename ValueType_>
+void CsrMatrix<IndexType_, ValueType_>::mv(ValueType_ alpha,
+                                           const ValueType_ *__restrict__ x,
+                                           ValueType_ beta,
+                                           ValueType_ *__restrict__ y) const
+{
+  // TODO: consider using merge-path csrmv
+  Cusparse::csrmv(this->trans,
+                  this->sym,
+                  this->m,
+                  this->n,
+                  this->nnz,
+                  &alpha,
+                  this->csrValA,
+                  this->csrRowPtrA,
+                  this->csrColIndA,
+                  x,
+                  &beta,
+                  y);
+}
+
+template <typename IndexType_, typename ValueType_>
+ValueType_ CsrMatrix<IndexType_, ValueType_>::getEdgeSum() const
+{
+  return 0.0;
+}
+
+// =============================================
+// Laplacian matrix class
+// =============================================
+
+/// Constructor for Laplacian matrix class
+/** @param A Adjacency matrix
+ */
+template <typename IndexType_, typename ValueType_>
+LaplacianMatrix<IndexType_, ValueType_>::LaplacianMatrix(
+  /*const*/ Matrix<IndexType_, ValueType_> &_A)
+  : Matrix<IndexType_, ValueType_>(_A.m, _A.n), A(&_A)
+{
+  // Check that adjacency matrix is square
+  if (_A.m != _A.n)
+    FatalError("cannot construct Laplacian matrix from non-square adjacency matrix",
+               NVGRAPH_ERR_BAD_PARAMETERS);
+  // set CUDA stream
+  this->s = NULL;
+  // Construct degree matrix
+  D.allocate(_A.m, this->s);
+  Vector<ValueType_> ones(this->n, this->s);
+  ones.fill(1.0);
+  _A.mv(1, ones.raw(), 0, D.raw());
+
+  // Set preconditioning matrix pointer to NULL
+  M = NULL;
+}
+
+/// Destructor for Laplacian matrix class
+template <typename IndexType_, typename ValueType_>
+LaplacianMatrix<IndexType_, ValueType_>::~LaplacianMatrix()
+{
+}
+
+/// Get and Set CUDA stream
+template <typename IndexType_, typename ValueType_>
+void LaplacianMatrix<IndexType_, ValueType_>::setCUDAStream(cudaStream_t _s)
+{
+  this->s = _s;
+  // printf("LaplacianMatrix setCUDAStream stream=%p\n",this->s);
+  A->setCUDAStream(_s);
+  if (M != NULL) { M->setCUDAStream(_s); }
+}
+template <typename IndexType_, typename ValueType_>
+void LaplacianMatrix<IndexType_, ValueType_>::getCUDAStream(cudaStream_t *_s)
+{
+  *_s = this->s;
+  // A->getCUDAStream(_s);
+}
+
+/// Matrix-vector product for Laplacian matrix class
+/** y is overwritten with alpha*A*x+beta*y.
+ *
+ * @param alpha Scalar.
+ * @param x (Input, device memory, n entries) Vector.
+ * @param beta Scalar.
+ * @param y (Input/output, device memory, m entries) Output vector.
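+ *
+ * Here A denotes the graph Laplacian L = D - W (D the diagonal degree
+ * matrix, W the adjacency matrix): the implementation scales y by beta,
+ * adds alpha*D*x via the diagmv kernel, and then subtracts alpha*W*x
+ * through the adjacency matrix-vector product.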
+ */
+template <typename IndexType_, typename ValueType_>
+void LaplacianMatrix<IndexType_, ValueType_>::mv(ValueType_ alpha,
+                                                 const ValueType_ *__restrict__ x,
+                                                 ValueType_ beta,
+                                                 ValueType_ *__restrict__ y) const
+{
+  // Scale result vector
+  if (beta == 0)
+    CHECK_CUDA(cudaMemset(y, 0, (this->n) * sizeof(ValueType_)))
+  else if (beta != 1)
+    thrust::transform(thrust::device_pointer_cast(y),
+                      thrust::device_pointer_cast(y + this->n),
+                      thrust::make_constant_iterator(beta),
+                      thrust::device_pointer_cast(y),
+                      thrust::multiplies<ValueType_>());
+
+  // Apply diagonal matrix
+  dim3 gridDim, blockDim;
+  gridDim.x  = min(((this->n) + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535);
+  gridDim.y  = 1;
+  gridDim.z  = 1;
+  blockDim.x = BLOCK_SIZE;
+  blockDim.y = 1;
+  blockDim.z = 1;
+  diagmv<<<gridDim, blockDim, 0, this->s>>>(this->n, alpha, D.raw(), x, y);
+  cudaCheckError();
+
+  // Apply adjacency matrix
+  A->mv(-alpha, x, 1, y);
+}
+/// Matrix-set of k vectors product for Laplacian matrix class
+/** y is overwritten with alpha*A*x+beta*y.
+ *
+ * @param alpha Scalar.
+ * @param x (Input, device memory, n*k entries) nxk dense matrix.
+ * @param beta Scalar.
+ * @param y (Input/output, device memory, m*k entries) Output mxk dense matrix.
+ */
+template <typename IndexType_, typename ValueType_>
+void LaplacianMatrix<IndexType_, ValueType_>::mm(IndexType_ k,
+                                                 ValueType_ alpha,
+                                                 const ValueType_ *__restrict__ x,
+                                                 ValueType_ beta,
+                                                 ValueType_ *__restrict__ y) const
+{
+  // Apply diagonal matrix
+  ValueType_ one = (ValueType_)1.0;
+  this->dm(k, alpha, x, beta, y);
+
+  // Apply adjacency matrix
+  A->mm(k, -alpha, x, one, y);
+}
+
+template <typename IndexType_, typename ValueType_>
+void LaplacianMatrix<IndexType_, ValueType_>::dm(IndexType_ k,
+                                                 ValueType_ alpha,
+                                                 const ValueType_ *__restrict__ x,
+                                                 ValueType_ beta,
+                                                 ValueType_ *__restrict__ y) const
+{
+  IndexType_ t = k * (this->n);
+  dim3 gridDim, blockDim;
+
+  // setup launch parameters
+  gridDim.x  = min(((this->n) + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535);
+  gridDim.y  = min(k, 65535);
+  gridDim.z  = 1;
+  blockDim.x = BLOCK_SIZE;
+  blockDim.y = 1;
+  blockDim.z = 1;
+
+  // Apply diagonal matrix
+  if (beta == 0.0) {
+    // set vectors to 0 (WARNING: notice that you need to set, not scale, because of NaNs corner
+    // case)
+    CHECK_CUDA(cudaMemset(y, 0, t * sizeof(ValueType_)));
+    diagmm<IndexType_, ValueType_, true>
+      <<<gridDim, blockDim, 0, this->s>>>(this->n, k, alpha, D.raw(), x, beta, y);
+  } else {
+    diagmm<IndexType_, ValueType_, false>
+      <<<gridDim, blockDim, 0, this->s>>>(this->n, k, alpha, D.raw(), x, beta, y);
+  }
+  cudaCheckError();
+}
+
+/// Color and Reorder
+template <typename IndexType_, typename ValueType_>
+void LaplacianMatrix<IndexType_, ValueType_>::color(IndexType_ *c, IndexType_ *p) const
+{
+}
+
+template <typename IndexType_, typename ValueType_>
+void LaplacianMatrix<IndexType_, ValueType_>::reorder(IndexType_ *p) const
+{
+}
+
+/// Solve preconditioned system M x = f for a set of k vectors
+template <typename IndexType_, typename ValueType_>
+void LaplacianMatrix<IndexType_, ValueType_>::prec_setup(Matrix<IndexType_, ValueType_> *_M)
+{
+  // save the pointer to preconditioner M
+  M = _M;
+  if (M != NULL) {
+    // setup the preconditioning matrix M
+    M->prec_setup(NULL);
+  }
+}
+
+template <typename IndexType_, typename ValueType_>
+void LaplacianMatrix<IndexType_, ValueType_>::prec_solve(IndexType_ k,
+                                                         ValueType_ alpha,
+                                                         ValueType_ *__restrict__ fx,
+                                                         ValueType_ *__restrict__ t) const
+{
+  if (M != NULL) {
+    // preconditioning
+    M->prec_solve(k, alpha, fx, t);
+  }
+}
+
+template <typename IndexType_, typename ValueType_>
+ValueType_ LaplacianMatrix<IndexType_, ValueType_>::getEdgeSum() const
+{
+  return 0.0;
+}
+// =============================================
+// Modularity matrix class
+// =============================================
+
+/// Constructor for Modularity matrix class
+/** @param A Adjacency matrix
+ */
+template <typename IndexType_, typename ValueType_>
+ModularityMatrix<IndexType_, ValueType_>::ModularityMatrix(
+  /*const*/ Matrix<IndexType_, ValueType_> &_A, IndexType_ _nnz)
+  : Matrix<IndexType_, ValueType_>(_A.m, _A.n), A(&_A), nnz(_nnz)
+{
+  // Check that adjacency matrix is square
+  if (_A.m != _A.n)
+    FatalError("cannot construct Modularity matrix from non-square adjacency matrix",
+               NVGRAPH_ERR_BAD_PARAMETERS);
+
+  // set CUDA stream
this->s = NULL; + // Construct degree matrix + D.allocate(_A.m, this->s); + Vector ones(this->n, this->s); + ones.fill(1.0); + _A.mv(1, ones.raw(), 0, D.raw()); + // D.dump(0,this->n); + edge_sum = D.nrm1(); + + // Set preconditioning matrix pointer to NULL + M = NULL; +} + +/// Destructor for Modularity matrix class +template +ModularityMatrix::~ModularityMatrix() +{ +} + +/// Get and Set CUDA stream +template +void ModularityMatrix::setCUDAStream(cudaStream_t _s) +{ + this->s = _s; + // printf("ModularityMatrix setCUDAStream stream=%p\n",this->s); + A->setCUDAStream(_s); + if (M != NULL) { M->setCUDAStream(_s); } +} + +template +void ModularityMatrix::getCUDAStream(cudaStream_t *_s) +{ + *_s = this->s; + // A->getCUDAStream(_s); +} + +/// Matrix-vector product for Modularity matrix class +/** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n entries) Vector. + * @param beta Scalar. + * @param y (Input/output, device memory, m entries) Output vector. + */ +template +void ModularityMatrix::mv(ValueType_ alpha, + const ValueType_ *__restrict__ x, + ValueType_ beta, + ValueType_ *__restrict__ y) const +{ + // Scale result vector + if (alpha != 1 || beta != 0) + FatalError("This isn't implemented for Modularity Matrix currently", + NVGRAPH_ERR_NOT_IMPLEMENTED); + + // CHECK_CUBLAS(cublasXdot(handle, this->n, const double *x, int incx, const double *y, int incy, + // double *result)); + // y = A*x + A->mv(alpha, x, 0, y); + ValueType_ dot_res; + // gamma = d'*x + Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); + // y = y -(gamma/edge_sum)*d + Cublas::axpy(this->n, -(dot_res / this->edge_sum), D.raw(), 1, y, 1); +} +/// Matrix-vector product for Modularity matrix class +/** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n*k entries) nxk dense matrix. + * @param beta Scalar. + * @param y (Input/output, device memory, m*k entries) Output mxk dense matrix. 
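+ *
+ * Note: the single-vector product above realizes B*x = W*x - (d'*x / edge_sum)*d,
+ * with W the adjacency matrix and d the degree vector; this multi-vector
+ * variant is currently not implemented and fails fatally.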
+ */ +template +void ModularityMatrix::mm(IndexType_ k, + ValueType_ alpha, + const ValueType_ *__restrict__ x, + ValueType_ beta, + ValueType_ *__restrict__ y) const +{ + FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); +} + +template +void ModularityMatrix::dm(IndexType_ k, + ValueType_ alpha, + const ValueType_ *__restrict__ x, + ValueType_ beta, + ValueType_ *__restrict__ y) const +{ + FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); +} + +/// Color and Reorder +template +void ModularityMatrix::color(IndexType_ *c, IndexType_ *p) const +{ + FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); +} + +template +void ModularityMatrix::reorder(IndexType_ *p) const +{ + FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); +} + +/// Solve preconditioned system M x = f for a set of k vectors +template +void ModularityMatrix::prec_setup(Matrix *_M) +{ + // save the pointer to preconditioner M + M = _M; + if (M != NULL) { + // setup the preconditioning matrix M + M->prec_setup(NULL); + } +} + +template +void ModularityMatrix::prec_solve(IndexType_ k, + ValueType_ alpha, + ValueType_ *__restrict__ fx, + ValueType_ *__restrict__ t) const +{ + if (M != NULL) { + FatalError("This isn't implemented for Modularity Matrix currently", + NVGRAPH_ERR_NOT_IMPLEMENTED); + } +} + +template +ValueType_ ModularityMatrix::getEdgeSum() const +{ + return edge_sum; +} + +} // namespace matrix +} // namespace raft From 60597f3acf0691536e6d817a01072bd573896f5e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 3 Jun 2020 11:05:27 -0400 Subject: [PATCH 052/189] final updates based on feedback --- cpp/CMakeLists.txt | 2 -- cpp/cmake/comms.cmake | 2 -- 2 files changed, 4 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a45cf8c950..e6ae09cb35 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -67,8 +67,6 @@ find_package(CUDA 10.0 REQUIRED) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) -message("HELLO!") - ############################################################################## # - Compiler Options -------------------------------------------------------- diff --git a/cpp/cmake/comms.cmake b/cpp/cmake/comms.cmake index c8496c7dc6..eab605dca3 100644 --- a/cpp/cmake/comms.cmake +++ b/cpp/cmake/comms.cmake @@ -17,8 +17,6 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR) project(comms LANGUAGES CXX CUDA) -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") - if(NOT NCCL_PATH) find_package(NCCL REQUIRED) else() From a92846874481c947f345b2073b3dc7a58927788d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 3 Jun 2020 11:06:20 -0400 Subject: [PATCH 053/189] copy error.hpp from cuDF, add license statement, and initial update --- cpp/include/raft/error.hpp | 124 +++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 cpp/include/raft/error.hpp diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp new file mode 100644 index 0000000000..a75d98cf4d --- /dev/null +++ b/cpp/include/raft/error.hpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + + +namespace raft { + +/** + * @brief Exception thrown when logical precondition is violated. + * + * This exception should not be thrown directly and is instead thrown by the + * RAFT_EXPECTS macro. + * + */ +struct logic_error : public std::logic_error { + logic_error(char const* const message) : std::logic_error(message) {} + + logic_error(std::string const& message) : std::logic_error(message) {} +}; + +/** + * @brief Exception thrown when a CUDA error is encountered. + */ +struct cuda_error : public std::runtime_error { + cuda_error(std::string const& message) : std::runtime_error(message) {} +}; + +} // namespace raft + +#define STRINGIFY_DETAIL(x) #x +#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) + +/** + * @brief Macro for checking (pre-)conditions that throws an exception when + * a condition is violated. + * + * @param[in] cond Expression that evaluates to true or false + * @param[in] reason String literal description of the reason that cond is + * expected to be true + * @throw raft::logic_error if the condition evaluates to false. + **/ +#define RAFT_EXPECTS(cond, reason) \ + (!!(cond)) ? static_cast(0) \ + : throw raft::logic_error("RAFT failure at: " __FILE__ \ + ":" RAFT_STRINGIFY(__LINE__) ": " reason) + +/** + * @brief Indicates that an erroneous code path has been taken. + * + * In host code, throws a `raft::logic_error`. + * + * @param[in] reason String literal description of the reason + **/ +#define RAFT_FAIL(reason) \ + throw raft::logic_error("RAFT failure at: " __FILE__ ":" RAFT_STRINGIFY(__LINE__) ": " reason) + +namespace raft { +namespace detail { + +inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int line) +{ + throw raft::cuda_error(std::string{"CUDA error encountered at: " + std::string{file} + ":" + + std::to_string(line) + ": " + std::to_string(error) + " " + + cudaGetErrorName(error) + " " + cudaGetErrorString(error)}); +} + +} // namespace detail +} // namespace raft + +/** + * @brief Error checking macro for CUDA runtime API functions. + * + * Invokes a CUDA runtime API function call, if the call does not return + * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an + * exception detailing the CUDA error that occurred + * + **/ +#define CUDA_TRY(call) \ + do { \ + cudaError_t const status = (call); \ + if (cudaSuccess != status) { \ + cudaGetLastError(); \ + raft::detail::throw_cuda_error(status, __FILE__, __LINE__); \ + } \ + } while (0); + +/** + * @brief Debug macro to check for CUDA errors + * + * In a non-release build, this macro will synchronize the specified stream + * before error checking. In both release and non-release builds, this macro + * checks for any pending CUDA errors from previous calls. If an error is + * reported, an exception is thrown detailing the CUDA error that occurred. + * + * The intent of this macro is to provide a mechanism for synchronous and + * deterministic execution for debugging asynchronous CUDA execution. 
It should + * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an + * asynchronous kernel launch. + * + **/ +#ifndef NDEBUG +#define CHECK_CUDA(stream) CUDA_TRY(cudaStreamSynchronize(stream)); +#else +#define CHECK_CUDA(stream) CUDA_TRY(cudaPeekAtLastError()); +#endif From 328462f58eb02098e4803ebf3225a4a5fa45aec4 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 3 Jun 2020 11:25:46 -0400 Subject: [PATCH 054/189] add CUML_EXPECTS, CUML_FAIL, CUGRAPH_EXPECTS, and CUGRAPH_FAIL --- cpp/include/raft/error.hpp | 48 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index a75d98cf4d..1bd74ec3c2 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -73,6 +73,54 @@ struct cuda_error : public std::runtime_error { #define RAFT_FAIL(reason) \ throw raft::logic_error("RAFT failure at: " __FILE__ ":" RAFT_STRINGIFY(__LINE__) ": " reason) +/** + * @brief Macro for checking (pre-)conditions that throws an exception when + * a condition is violated. + * + * @param[in] cond Expression that evaluates to true or false + * @param[in] reason String literal description of the reason that cond is + * expected to be true + * @throw raft::logic_error if the condition evaluates to false. + **/ +#define CUML_EXPECTS(cond, reason) \ + (!!(cond)) ? static_cast(0) \ + : throw raft::logic_error("cuML failure at: " __FILE__ \ + ":" RAFT_STRINGIFY(__LINE__) ": " reason) + +/** + * @brief Indicates that an erroneous code path has been taken. + * + * In host code, throws a `raft::logic_error`. + * + * @param[in] reason String literal description of the reason + **/ +#define CUML_FAIL(reason) \ + throw raft::logic_error("cuML failure at: " __FILE__ ":" RAFT_STRINGIFY(__LINE__) ": " reason) + +/** + * @brief Macro for checking (pre-)conditions that throws an exception when + * a condition is violated. + * + * @param[in] cond Expression that evaluates to true or false + * @param[in] reason String literal description of the reason that cond is + * expected to be true + * @throw raft::logic_error if the condition evaluates to false. + **/ +#define CUGRAPH_EXPECTS(cond, reason) \ + (!!(cond)) ? static_cast(0) \ + : throw raft::logic_error("cuGRAPH failure at: " __FILE__ \ + ":" RAFT_STRINGIFY(__LINE__) ": " reason) + +/** + * @brief Indicates that an erroneous code path has been taken. + * + * In host code, throws a `raft::logic_error`. 
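+ * For example (illustrative only): CUGRAPH_FAIL("unsupported graph type");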
+ * + * @param[in] reason String literal description of the reason + **/ +#define CUGRAPH_FAIL(reason) \ + throw raft::logic_error("cuGRAPH failure at: " __FILE__ ":" RAFT_STRINGIFY(__LINE__) ": " reason) + namespace raft { namespace detail { From 187e12ab3a6b52f5106128afc52715eb3a8affe5 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 3 Jun 2020 11:33:19 -0400 Subject: [PATCH 055/189] add NCCL_TRY --- cpp/include/raft/error.hpp | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 1bd74ec3c2..4821a9761d 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -18,6 +18,8 @@ #include #include +#include + #include #include @@ -33,7 +35,6 @@ namespace raft { */ struct logic_error : public std::logic_error { logic_error(char const* const message) : std::logic_error(message) {} - logic_error(std::string const& message) : std::logic_error(message) {} }; @@ -41,9 +42,18 @@ struct logic_error : public std::logic_error { * @brief Exception thrown when a CUDA error is encountered. */ struct cuda_error : public std::runtime_error { + cuda_error(char const* const message) : std::runtime_error(message) {} cuda_error(std::string const& message) : std::runtime_error(message) {} }; +/** + * @brief Exception thrown when a NCCL error is encountered. + */ +struct nccl_error : public std::runtime_error { + nccl_error(char const* const message) : std::runtime_error(message) {} + nccl_error(std::string const& message) : std::runtime_error(message) {} +}; + } // namespace raft #define STRINGIFY_DETAIL(x) #x @@ -131,6 +141,13 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l cudaGetErrorName(error) + " " + cudaGetErrorString(error)}); } +inline void throw_nccl_error(ncclResult_t error, const char* file, unsigned int line) { + throw cugraph::nccl_error( + std::string{"NCCL error encountered at: " + std::string{file} + ":" + + std::to_string(line) + ": " + std::to_string(error) + " " + + ncclGetErrorString(error)}); +} + } // namespace detail } // namespace raft @@ -170,3 +187,17 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l #else #define CHECK_CUDA(stream) CUDA_TRY(cudaPeekAtLastError()); #endif + +/** + * @brief Error checking macro for NCCL runtime API functions. 
+ * + * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an + * exception detailing the NCCL error that occurred + */ +#define NCCL_TRY(call) \ + do { \ + ncclResult_t const status = (call); \ + if (ncclSuccess != status) { \ + cugraph::detail::throw_nccl_error(status, __FILE__, __LINE__);\ + } \ + } while (0); From 4ce8f372b406d0ba23a9dcb153d06a4fae6d4594 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 3 Jun 2020 12:12:13 -0400 Subject: [PATCH 056/189] fix compile/clang-tidy errors --- cpp/include/raft/error.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 4821a9761d..9d32943fb5 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -34,24 +34,24 @@ namespace raft { * */ struct logic_error : public std::logic_error { - logic_error(char const* const message) : std::logic_error(message) {} - logic_error(std::string const& message) : std::logic_error(message) {} + explicit logic_error(char const* const message) : std::logic_error(message) {} + explicit logic_error(std::string const& message) : std::logic_error(message) {} }; /** * @brief Exception thrown when a CUDA error is encountered. */ struct cuda_error : public std::runtime_error { - cuda_error(char const* const message) : std::runtime_error(message) {} - cuda_error(std::string const& message) : std::runtime_error(message) {} + explicit cuda_error(char const* const message) : std::runtime_error(message) {} + explicit cuda_error(std::string const& message) : std::runtime_error(message) {} }; /** * @brief Exception thrown when a NCCL error is encountered. */ struct nccl_error : public std::runtime_error { - nccl_error(char const* const message) : std::runtime_error(message) {} - nccl_error(std::string const& message) : std::runtime_error(message) {} + explicit nccl_error(char const* const message) : std::runtime_error(message) {} + explicit nccl_error(std::string const& message) : std::runtime_error(message) {} }; } // namespace raft @@ -142,7 +142,7 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l } inline void throw_nccl_error(ncclResult_t error, const char* file, unsigned int line) { - throw cugraph::nccl_error( + throw raft::nccl_error( std::string{"NCCL error encountered at: " + std::string{file} + ":" + std::to_string(line) + ": " + std::to_string(error) + " " + ncclGetErrorString(error)}); @@ -198,6 +198,6 @@ inline void throw_nccl_error(ncclResult_t error, const char* file, unsigned int do { \ ncclResult_t const status = (call); \ if (ncclSuccess != status) { \ - cugraph::detail::throw_nccl_error(status, __FILE__, __LINE__);\ + raft::detail::throw_nccl_error(status, __FILE__, __LINE__);\ } \ } while (0); From 086abd3f80d9dd8e2453b83591a056c0fc3e26fd Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 3 Jun 2020 12:31:38 -0400 Subject: [PATCH 057/189] fix an error in a comment --- cpp/include/raft/error.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 9d32943fb5..5c083b9f6e 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -30,7 +30,7 @@ namespace raft { * @brief Exception thrown when logical precondition is violated. * * This exception should not be thrown directly and is instead thrown by the - * RAFT_EXPECTS macro. + * RAFT_EXPECTS, RAFT_FAIL, CUML_EXPECTS, CUML_FAIL, CUGRAPH_EXPECTS, CUGRAPH_FAIL macros. 
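+ * For instance (illustrative usage, not part of the original comment):
+ * RAFT_EXPECTS(ptr != nullptr, "ptr must not be null") throws this type
+ * when the condition is false.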
* */ struct logic_error : public std::logic_error { From 4f7225773633e83f690b0ba1def7e324e4e54a92 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 3 Jun 2020 13:07:28 -0400 Subject: [PATCH 058/189] add CUSPARSE_TRY --- cpp/include/raft/error.hpp | 82 +++++++++++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 14 deletions(-) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 5c083b9f6e..fd602d57fd 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -46,6 +47,14 @@ struct cuda_error : public std::runtime_error { explicit cuda_error(std::string const& message) : std::runtime_error(message) {} }; +/** + * @brief Exception thrown when a cuSparse error is encountered. + */ +struct cusparse_error : public std::runtime_error { + explicit cusparse_error(char const* const message) : std::runtime_error(message) {} + explicit cusparse_error(std::string const& message) : std::runtime_error(message) {} +}; + /** * @brief Exception thrown when a NCCL error is encountered. */ @@ -67,7 +76,7 @@ struct nccl_error : public std::runtime_error { * @param[in] reason String literal description of the reason that cond is * expected to be true * @throw raft::logic_error if the condition evaluates to false. - **/ + */ #define RAFT_EXPECTS(cond, reason) \ (!!(cond)) ? static_cast(0) \ : throw raft::logic_error("RAFT failure at: " __FILE__ \ @@ -79,7 +88,7 @@ struct nccl_error : public std::runtime_error { * In host code, throws a `raft::logic_error`. * * @param[in] reason String literal description of the reason - **/ + */ #define RAFT_FAIL(reason) \ throw raft::logic_error("RAFT failure at: " __FILE__ ":" RAFT_STRINGIFY(__LINE__) ": " reason) @@ -91,7 +100,7 @@ struct nccl_error : public std::runtime_error { * @param[in] reason String literal description of the reason that cond is * expected to be true * @throw raft::logic_error if the condition evaluates to false. - **/ + */ #define CUML_EXPECTS(cond, reason) \ (!!(cond)) ? static_cast(0) \ : throw raft::logic_error("cuML failure at: " __FILE__ \ @@ -103,7 +112,7 @@ struct nccl_error : public std::runtime_error { * In host code, throws a `raft::logic_error`. * * @param[in] reason String literal description of the reason - **/ + */ #define CUML_FAIL(reason) \ throw raft::logic_error("cuML failure at: " __FILE__ ":" RAFT_STRINGIFY(__LINE__) ": " reason) @@ -115,7 +124,7 @@ struct nccl_error : public std::runtime_error { * @param[in] reason String literal description of the reason that cond is * expected to be true * @throw raft::logic_error if the condition evaluates to false. - **/ + */ #define CUGRAPH_EXPECTS(cond, reason) \ (!!(cond)) ? static_cast(0) \ : throw raft::logic_error("cuGRAPH failure at: " __FILE__ \ @@ -127,7 +136,7 @@ struct nccl_error : public std::runtime_error { * In host code, throws a `raft::logic_error`. 
* * @param[in] reason String literal description of the reason - **/ + */ #define CUGRAPH_FAIL(reason) \ throw raft::logic_error("cuGRAPH failure at: " __FILE__ ":" RAFT_STRINGIFY(__LINE__) ": " reason) @@ -148,6 +157,37 @@ inline void throw_nccl_error(ncclResult_t error, const char* file, unsigned int ncclGetErrorString(error)}); } +// FIXME: unnecessary once CUDA 10.1+ becomes the minimum supported version +#define _CUSPARSE_ERR_TO_STR(err) \ + case err: \ + return #err; +inline const char* cusparse_error_to_string(cusparseStatus_t err) { +#if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 + return cusparseGetErrorString(status); +#else // CUDART_VERSION + switch (err) { + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ALLOC_FAILED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INVALID_VALUE); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ARCH_MISMATCH); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + default: + return "CUSPARSE_STATUS_UNKNOWN"; + }; +#endif // CUDART_VERSION +} +#undef _CUSPARSE_ERR_TO_STR + +inline void throw_cusparse_error(cusparseStatus_t error, const char* file, unsigned int line) { + throw raft::cusparse_error( + std::string{"cuSparse error encountered at: " + std::string{file} + ":" + + std::to_string(line) + ": " + std::to_string(error) + " " + + cusparse_error_to_string(error)}); +} + } // namespace detail } // namespace raft @@ -158,7 +198,7 @@ inline void throw_nccl_error(ncclResult_t error, const char* file, unsigned int * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an * exception detailing the CUDA error that occurred * - **/ + */ #define CUDA_TRY(call) \ do { \ cudaError_t const status = (call); \ @@ -181,23 +221,37 @@ inline void throw_nccl_error(ncclResult_t error, const char* file, unsigned int * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an * asynchronous kernel launch. * - **/ + */ #ifndef NDEBUG #define CHECK_CUDA(stream) CUDA_TRY(cudaStreamSynchronize(stream)); #else #define CHECK_CUDA(stream) CUDA_TRY(cudaPeekAtLastError()); #endif +/** + * @brief Error checking macro for cuSparse runtime API functions. + * + * Invokes a cuSparse runtime API function call, if the call does not return + * CUSPARSE_STATUS_SUCCESS, throws an exception detailing the cuSparse error that occurred + */ +#define CUSPARSE_TRY(call) \ + do { \ + cusparseStatus_t const status = (call); \ + if (CUSPARSE_STATUS_SUCCESS != status) { \ + raft::detail::throw_cusparse_error(status, __FILE__, __LINE__); \ + } \ + } while (0); + /** * @brief Error checking macro for NCCL runtime API functions. 
* * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an * exception detailing the NCCL error that occurred */ -#define NCCL_TRY(call) \ - do { \ - ncclResult_t const status = (call); \ - if (ncclSuccess != status) { \ - raft::detail::throw_nccl_error(status, __FILE__, __LINE__);\ - } \ +#define NCCL_TRY(call) \ + do { \ + ncclResult_t const status = (call); \ + if (ncclSuccess != status) { \ + raft::detail::throw_nccl_error(status, __FILE__, __LINE__); \ + } \ } while (0); From a428c6ec9b690943628b03f051110e702ca32a96 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 3 Jun 2020 13:34:23 -0400 Subject: [PATCH 059/189] add CURAND_TRY --- cpp/include/raft/error.hpp | 54 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index fd602d57fd..7861157507 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -47,6 +48,14 @@ struct cuda_error : public std::runtime_error { explicit cuda_error(std::string const& message) : std::runtime_error(message) {} }; +/** + * @brief Exception thrown when a cuRAND error is encountered. + */ +struct curand_error : public std::runtime_error { + explicit curand_error(char const* const message) : std::runtime_error(message) {} + explicit curand_error(std::string const& message) : std::runtime_error(message) {} +}; + /** * @brief Exception thrown when a cuSparse error is encountered. */ @@ -157,6 +166,37 @@ inline void throw_nccl_error(ncclResult_t error, const char* file, unsigned int ncclGetErrorString(error)}); } +#define _CURAND_ERR_TO_STR(err) \ + case err: \ + return #err; +inline const char* curand_error_to_string(curandStatus_t err) { + switch(err) { + _CURAND_ERR_TO_STR(CURAND_STATUS_SUCCESS); + _CURAND_ERR_TO_STR(CURAND_STATUS_VERSION_MISMATCH); + _CURAND_ERR_TO_STR(CURAND_STATUS_NOT_INITIALIZED); + _CURAND_ERR_TO_STR(CURAND_STATUS_ALLOCATION_FAILED); + _CURAND_ERR_TO_STR(CURAND_STATUS_TYPE_ERROR); + _CURAND_ERR_TO_STR(CURAND_STATUS_OUT_OF_RANGE); + _CURAND_ERR_TO_STR(CURAND_STATUS_LENGTH_NOT_MULTIPLE); + _CURAND_ERR_TO_STR(CURAND_STATUS_DOUBLE_PRECISION_REQUIRED); + _CURAND_ERR_TO_STR(CURAND_STATUS_LAUNCH_FAILURE); + _CURAND_ERR_TO_STR(CURAND_STATUS_PREEXISTING_FAILURE); + _CURAND_ERR_TO_STR(CURAND_STATUS_INITIALIZATION_FAILED); + _CURAND_ERR_TO_STR(CURAND_STATUS_ARCH_MISMATCH); + _CURAND_ERR_TO_STR(CURAND_STATUS_INTERNAL_ERROR); + default: + return "CURAND_STATUS_UNKNOWN"; + }; +} +#undef _CURAND_ERR_TO_STR + +inline void throw_curand_error(curandStatus_t error, const char* file, unsigned int line) { + throw raft::curand_error( + std::string{"cuRAND error encountered at: " + std::string{file} + ":" + + std::to_string(line) + ": " + std::to_string(error) + " " + + curand_error_to_string(error)}); +} + // FIXME: unnecessary once CUDA 10.1+ becomes the minimum supported version #define _CUSPARSE_ERR_TO_STR(err) \ case err: \ @@ -228,6 +268,20 @@ inline void throw_cusparse_error(cusparseStatus_t error, const char* file, unsig #define CHECK_CUDA(stream) CUDA_TRY(cudaPeekAtLastError()); #endif +/** + * @brief Error checking macro for cuRAND runtime API functions. 
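+ *
+ * Illustrative use (not part of the original comment):
+ * CURAND_TRY(curandGenerateUniform(gen, d_out, n));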
+ * + * Invokes a cuRAND runtime API function call, if the call does not return + * CURAND_STATUS_SUCCESS, throws an exception detailing the cuRAND error that occurred + */ +#define CURAND_TRY(call) \ + do { \ + curandStatus_t const status = (call); \ + if (CURAND_STATUS_SUCCESS != status) { \ + raft::detail::throw_curand_error(status, __FILE__, __LINE__); \ + } \ + } while (0); + /** * @brief Error checking macro for cuSparse runtime API functions. * From b9cee2b75617868cd34761add08b1feeec83ed4a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 3 Jun 2020 13:36:21 -0400 Subject: [PATCH 060/189] address clang-tidy warnings --- cpp/include/raft/error.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 7861157507..9430b6efc8 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -169,7 +169,7 @@ inline void throw_nccl_error(ncclResult_t error, const char* file, unsigned int #define _CURAND_ERR_TO_STR(err) \ case err: \ return #err; -inline const char* curand_error_to_string(curandStatus_t err) { +inline auto curand_error_to_string(curandStatus_t err) -> const char* { switch(err) { _CURAND_ERR_TO_STR(CURAND_STATUS_SUCCESS); _CURAND_ERR_TO_STR(CURAND_STATUS_VERSION_MISMATCH); @@ -201,7 +201,7 @@ inline void throw_curand_error(curandStatus_t error, const char* file, unsigned #define _CUSPARSE_ERR_TO_STR(err) \ case err: \ return #err; -inline const char* cusparse_error_to_string(cusparseStatus_t err) { +inline auto cusparse_error_to_string(cusparseStatus_t err) -> const char* { #if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 return cusparseGetErrorString(status); #else // CUDART_VERSION From b373267e304576583a0fcc6fe90cfe4304b55afa Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 3 Jun 2020 13:38:16 -0400 Subject: [PATCH 061/189] update change log --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9490099450..190fdc4e9a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # RAFT 0.15.0 (Date TBD) ## New Features +- PR #15: add exception based error handling macros ## Improvements From e471f1d8f72375f36629010cebbc0fda821a7f6d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 3 Jun 2020 13:48:13 -0400 Subject: [PATCH 062/189] clang-format fixes --- cpp/include/raft/error.hpp | 96 ++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 39 deletions(-) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 9430b6efc8..6952e31f0a 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -25,7 +25,6 @@ #include #include - namespace raft { /** @@ -37,39 +36,48 @@ namespace raft { */ struct logic_error : public std::logic_error { explicit logic_error(char const* const message) : std::logic_error(message) {} - explicit logic_error(std::string const& message) : std::logic_error(message) {} + explicit logic_error(std::string const& message) + : std::logic_error(message) {} }; /** * @brief Exception thrown when a CUDA error is encountered. 
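 *
 * Typically surfaced via the CUDA_TRY macro, e.g. (illustrative):
 * @code
 *   CUDA_TRY(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDefault, stream));
 * @endcode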
*/ struct cuda_error : public std::runtime_error { - explicit cuda_error(char const* const message) : std::runtime_error(message) {} - explicit cuda_error(std::string const& message) : std::runtime_error(message) {} + explicit cuda_error(char const* const message) + : std::runtime_error(message) {} + explicit cuda_error(std::string const& message) + : std::runtime_error(message) {} }; /** * @brief Exception thrown when a cuRAND error is encountered. */ struct curand_error : public std::runtime_error { - explicit curand_error(char const* const message) : std::runtime_error(message) {} - explicit curand_error(std::string const& message) : std::runtime_error(message) {} + explicit curand_error(char const* const message) + : std::runtime_error(message) {} + explicit curand_error(std::string const& message) + : std::runtime_error(message) {} }; /** * @brief Exception thrown when a cuSparse error is encountered. */ struct cusparse_error : public std::runtime_error { - explicit cusparse_error(char const* const message) : std::runtime_error(message) {} - explicit cusparse_error(std::string const& message) : std::runtime_error(message) {} + explicit cusparse_error(char const* const message) + : std::runtime_error(message) {} + explicit cusparse_error(std::string const& message) + : std::runtime_error(message) {} }; /** * @brief Exception thrown when a NCCL error is encountered. */ struct nccl_error : public std::runtime_error { - explicit nccl_error(char const* const message) : std::runtime_error(message) {} - explicit nccl_error(std::string const& message) : std::runtime_error(message) {} + explicit nccl_error(char const* const message) + : std::runtime_error(message) {} + explicit nccl_error(std::string const& message) + : std::runtime_error(message) {} }; } // namespace raft @@ -86,10 +94,11 @@ struct nccl_error : public std::runtime_error { * expected to be true * @throw raft::logic_error if the condition evaluates to false. */ -#define RAFT_EXPECTS(cond, reason) \ - (!!(cond)) ? static_cast(0) \ - : throw raft::logic_error("RAFT failure at: " __FILE__ \ - ":" RAFT_STRINGIFY(__LINE__) ": " reason) +#define RAFT_EXPECTS(cond, reason) \ + (!!(cond)) \ + ? static_cast(0) \ + : throw raft::logic_error("RAFT failure at: " __FILE__ \ + ":" RAFT_STRINGIFY(__LINE__) ": " reason) /** * @brief Indicates that an erroneous code path has been taken. @@ -98,8 +107,9 @@ struct nccl_error : public std::runtime_error { * * @param[in] reason String literal description of the reason */ -#define RAFT_FAIL(reason) \ - throw raft::logic_error("RAFT failure at: " __FILE__ ":" RAFT_STRINGIFY(__LINE__) ": " reason) +#define RAFT_FAIL(reason) \ + throw raft::logic_error("RAFT failure at: " __FILE__ \ + ":" RAFT_STRINGIFY(__LINE__) ": " reason) /** * @brief Macro for checking (pre-)conditions that throws an exception when @@ -110,10 +120,11 @@ struct nccl_error : public std::runtime_error { * expected to be true * @throw raft::logic_error if the condition evaluates to false. */ -#define CUML_EXPECTS(cond, reason) \ - (!!(cond)) ? static_cast(0) \ - : throw raft::logic_error("cuML failure at: " __FILE__ \ - ":" RAFT_STRINGIFY(__LINE__) ": " reason) +#define CUML_EXPECTS(cond, reason) \ + (!!(cond)) \ + ? static_cast(0) \ + : throw raft::logic_error("cuML failure at: " __FILE__ \ + ":" RAFT_STRINGIFY(__LINE__) ": " reason) /** * @brief Indicates that an erroneous code path has been taken. 
@@ -122,8 +133,9 @@ struct nccl_error : public std::runtime_error { * * @param[in] reason String literal description of the reason */ -#define CUML_FAIL(reason) \ - throw raft::logic_error("cuML failure at: " __FILE__ ":" RAFT_STRINGIFY(__LINE__) ": " reason) +#define CUML_FAIL(reason) \ + throw raft::logic_error("cuML failure at: " __FILE__ \ + ":" RAFT_STRINGIFY(__LINE__) ": " reason) /** * @brief Macro for checking (pre-)conditions that throws an exception when @@ -134,10 +146,11 @@ struct nccl_error : public std::runtime_error { * expected to be true * @throw raft::logic_error if the condition evaluates to false. */ -#define CUGRAPH_EXPECTS(cond, reason) \ - (!!(cond)) ? static_cast(0) \ - : throw raft::logic_error("cuGRAPH failure at: " __FILE__ \ - ":" RAFT_STRINGIFY(__LINE__) ": " reason) +#define CUGRAPH_EXPECTS(cond, reason) \ + (!!(cond)) \ + ? static_cast(0) \ + : throw raft::logic_error("cuGRAPH failure at: " __FILE__ \ + ":" RAFT_STRINGIFY(__LINE__) ": " reason) /** * @brief Indicates that an erroneous code path has been taken. @@ -146,20 +159,23 @@ struct nccl_error : public std::runtime_error { * * @param[in] reason String literal description of the reason */ -#define CUGRAPH_FAIL(reason) \ - throw raft::logic_error("cuGRAPH failure at: " __FILE__ ":" RAFT_STRINGIFY(__LINE__) ": " reason) +#define CUGRAPH_FAIL(reason) \ + throw raft::logic_error("cuGRAPH failure at: " __FILE__ \ + ":" RAFT_STRINGIFY(__LINE__) ": " reason) namespace raft { namespace detail { -inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int line) -{ - throw raft::cuda_error(std::string{"CUDA error encountered at: " + std::string{file} + ":" + - std::to_string(line) + ": " + std::to_string(error) + " " + - cudaGetErrorName(error) + " " + cudaGetErrorString(error)}); +inline void throw_cuda_error(cudaError_t error, const char* file, + unsigned int line) { + throw raft::cuda_error( + std::string{"CUDA error encountered at: " + std::string{file} + ":" + + std::to_string(line) + ": " + std::to_string(error) + " " + + cudaGetErrorName(error) + " " + cudaGetErrorString(error)}); } -inline void throw_nccl_error(ncclResult_t error, const char* file, unsigned int line) { +inline void throw_nccl_error(ncclResult_t error, const char* file, + unsigned int line) { throw raft::nccl_error( std::string{"NCCL error encountered at: " + std::string{file} + ":" + std::to_string(line) + ": " + std::to_string(error) + " " + @@ -167,10 +183,10 @@ inline void throw_nccl_error(ncclResult_t error, const char* file, unsigned int } #define _CURAND_ERR_TO_STR(err) \ - case err: \ + case err: \ return #err; inline auto curand_error_to_string(curandStatus_t err) -> const char* { - switch(err) { + switch (err) { _CURAND_ERR_TO_STR(CURAND_STATUS_SUCCESS); _CURAND_ERR_TO_STR(CURAND_STATUS_VERSION_MISMATCH); _CURAND_ERR_TO_STR(CURAND_STATUS_NOT_INITIALIZED); @@ -190,7 +206,8 @@ inline auto curand_error_to_string(curandStatus_t err) -> const char* { } #undef _CURAND_ERR_TO_STR -inline void throw_curand_error(curandStatus_t error, const char* file, unsigned int line) { +inline void throw_curand_error(curandStatus_t error, const char* file, + unsigned int line) { throw raft::curand_error( std::string{"cuRAND error encountered at: " + std::string{file} + ":" + std::to_string(line) + ": " + std::to_string(error) + " " + @@ -204,7 +221,7 @@ inline void throw_curand_error(curandStatus_t error, const char* file, unsigned inline auto cusparse_error_to_string(cusparseStatus_t err) -> const char* { #if defined(CUDART_VERSION) 
&& CUDART_VERSION >= 10100 return cusparseGetErrorString(status); -#else // CUDART_VERSION +#else // CUDART_VERSION switch (err) { _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS); _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED); @@ -221,7 +238,8 @@ inline auto cusparse_error_to_string(cusparseStatus_t err) -> const char* { } #undef _CUSPARSE_ERR_TO_STR -inline void throw_cusparse_error(cusparseStatus_t error, const char* file, unsigned int line) { +inline void throw_cusparse_error(cusparseStatus_t error, const char* file, + unsigned int line) { throw raft::cusparse_error( std::string{"cuSparse error encountered at: " + std::string{file} + ":" + std::to_string(line) + ": " + std::to_string(error) + " " + From 035dc0046734db15d129cf6c7dabad8c0f9769b1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 3 Jun 2020 14:02:14 -0400 Subject: [PATCH 063/189] another try to make clang-format happy --- cpp/include/raft/error.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 6952e31f0a..9424d28001 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -221,7 +221,7 @@ inline void throw_curand_error(curandStatus_t error, const char* file, inline auto cusparse_error_to_string(cusparseStatus_t err) -> const char* { #if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 return cusparseGetErrorString(status); -#else // CUDART_VERSION +#else // CUDART_VERSION switch (err) { _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS); _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED); From cc4be83737c558bcd0e6edadfdac1bdeb9d1f9fc Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 3 Jun 2020 13:15:16 -0500 Subject: [PATCH 064/189] Clean-up of matrix definitions and dummy error handling. 
--- cpp/include/raft/spectral/error_temp.hpp | 15 ++ cpp/include/raft/spectral/spectral_matrix.hpp | 243 ++++-------------- 2 files changed, 65 insertions(+), 193 deletions(-) create mode 100644 cpp/include/raft/spectral/error_temp.hpp diff --git a/cpp/include/raft/spectral/error_temp.hpp b/cpp/include/raft/spectral/error_temp.hpp new file mode 100644 index 0000000000..0cd58b1769 --- /dev/null +++ b/cpp/include/raft/spectral/error_temp.hpp @@ -0,0 +1,15 @@ +#pragma once + +#define STRINGIFY_DETAIL(x) #x +#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) + + +#define RAFT_EXPECT(cond, reason) + +#define RAFT_TRY(error_expression) + +#define RAFT_FAIL(reason) + +#define CUDA_TRY(call) + +#define CUDA_CHECK_LAST() diff --git a/cpp/include/raft/spectral/spectral_matrix.hpp b/cpp/include/raft/spectral/spectral_matrix.hpp index c77bb8e5a0..86ef0ec89c 100644 --- a/cpp/include/raft/spectral/spectral_matrix.hpp +++ b/cpp/include/raft/spectral/spectral_matrix.hpp @@ -22,6 +22,21 @@ // #include #include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef DEBUG +#include +#include +#endif + +#include "error_temp.hpp" // TODO: replace w/ actual error handling to be brought in soon // CUDA block size #define BLOCK_SIZE 1024 @@ -33,7 +48,7 @@ namespace raft { namespace matrix { void check_size(size_t sz) { - if (sz > INT_MAX) FatalError("Vector larger than INT_MAX", ERR_BAD_PARAMETERS); + RAFT_EXPECT(sz <= INT_MAX, "Vector larger than INT_MAX"); } template void nrm1_raw_vec(ValueType_* vec, size_t n, ValueType_* res, cudaStream_t stream) @@ -56,12 +71,12 @@ namespace matrix { { #ifdef DEBUG thrust::device_ptr dev_ptr(vec); - COUT().precision(15); - COUT() << "sample size = " << n << ", offset = " << offset << std::endl; + std::cout.precision(15); + std::cout << "sample size = " << n << ", offset = " << offset << std::endl; thrust::copy( dev_ptr + offset, dev_ptr + offset + n, std::ostream_iterator(std::cout, " ")); cudaCheckError(); - COUT() << std::endl; + std::cout << std::endl; #endif } @@ -131,18 +146,18 @@ namespace matrix { ValueType_* y, cudaStream_t stream) { + RAFT_EXPECT((alpha == 1.0) && ((beta == 0.0) || (beta == 1.0)), "Not implemented case of y = D*x"); + int items_per_thread = 4; int num_threads = 128; int max_grid_size = 4096; check_size(num_vertices); int n = static_cast(num_vertices); int num_blocks = std::min(max_grid_size, (n / (items_per_thread * num_threads)) + 1); - if (alpha == 1.0 && beta == 0.0) + if (beta == 0.0) dmv0_kernel<<>>(D, x, y, n); - else if (alpha == 1.0 && beta == 1.0) + else if (beta == 1.0) dmv1_kernel<<>>(D, x, y, n); - else - FatalError("Not implemented case of y = D*x", ERR_BAD_PARAMETERS); cudaCheckError(); } @@ -190,7 +205,7 @@ namespace matrix { void allocate(size_t n, cudaStream_t stream = 0) { - values.resize(n); + values.resize(n); // TODO: delegate to outer allocator!
} void fill(ValueType val, cudaStream_t stream = 0) @@ -200,16 +215,15 @@ namespace matrix { void copy(Vector &vec1, cudaStream_t stream = 0) { + RAFT_EXPECT((get_size() == 0 && vec1.get_size() > 0) || (get_size() >= vec1.get_size()), "Cannot copy a vector into a smaller one"); if (this->get_size() == 0 && vec1.get_size()>0) { allocate(vec1.get_size(), stream); copy_vec(vec1.raw(), this->get_size(), this->raw(), stream); } else if (this->get_size() == vec1.get_size()) copy_vec(vec1.raw(), this->get_size(), this->raw(), stream); - else if (this->get_size() > vec1.get_size()) { + else // if (this->get_size() > vec1.get_size()) { copy_vec(vec1.raw(), vec1.get_size(), this->raw(), stream); - } else { - FatalError("Cannot copy a vector into a smaller one", ERR_BAD_PARAMETERS); - } + } } ValueType nrm1(cudaStream_t stream = 0) { @@ -273,49 +287,6 @@ namespace matrix { virtual ValueType_ getEdgeSum() const = 0; }; - /// Dense matrix class - template - class DenseMatrix : public Matrix { - - private: - /// Whether to transpose matrix - const bool trans; - /// Matrix entries, stored column-major in device memory - const ValueType_ * A; - /// Leading dimension of matrix entry array - const IndexType_ lda; - - public: - /// Constructor - DenseMatrix(bool _trans, - IndexType_ _m, IndexType_ _n, - const ValueType_ * _A, IndexType_ _lda); - - /// Destructor - virtual ~DenseMatrix(); - - /// Get and Set CUDA stream - virtual void setCUDAStream(cudaStream_t _s); - virtual void getCUDAStream(cudaStream_t *_s); - - /// Matrix-vector product - virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, - ValueType_ beta, ValueType_ * __restrict__ y) const; - /// Matrix-set of k vectors product - virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; - - /// Color and Reorder - virtual void color(IndexType_ *c, IndexType_ *p) const; - virtual void reorder(IndexType_ *p) const; - - /// Incomplete Cholesky (setup, factor and solve) - virtual void prec_setup(Matrix * _M); - virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; - - //Get the sum of all edges - virtual ValueType_ getEdgeSum() const; - }; - /// Sparse matrix class in CSR format template class CsrMatrix : public Matrix { @@ -516,111 +487,6 @@ static __global__ void diagmm(IndexType_ n, } } // namespace -// ============================================= -// Dense matrix class -// ============================================= - -/// Constructor for dense matrix class -/** @param _trans Whether to transpose matrix. - * @param _m Number of rows. - * @param _n Number of columns. - * @param _A (Input, device memory, _m*_n entries) Matrix - * entries, stored column-major. - * @param _lda Leading dimension of _A.
- */ -template -DenseMatrix::DenseMatrix( - bool _trans, IndexType_ _m, IndexType_ _n, const ValueType_ *_A, IndexType_ _lda) - : Matrix(_m, _n), trans(_trans), A(_A), lda(_lda) -{ - Cublas::set_pointer_mode_host(); - if (_lda < _m) FatalError("invalid dense matrix parameter (lda -DenseMatrix::~DenseMatrix() -{ -} - -/// Get and Set CUDA stream -template -void DenseMatrix::setCUDAStream(cudaStream_t _s) -{ - this->s = _s; - // printf("DenseMatrix setCUDAStream stream=%p\n",this->s); - Cublas::setStream(_s); -} -template -void DenseMatrix::getCUDAStream(cudaStream_t *_s) -{ - *_s = this->s; - // CHECK_CUBLAS(cublasGetStream(cublasHandle, _s)); -} - -/// Matrix-vector product for dense matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n entries) Vector. - * @param beta Scalar. - * @param y (Input/output, device memory, m entries) Output vector. - */ -template -void DenseMatrix::mv(ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - Cublas::gemv(this->trans, this->m, this->n, &alpha, this->A, this->lda, x, 1, &beta, y, 1); -} - -template -void DenseMatrix::mm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - Cublas::gemm( - this->trans, false, this->m, k, this->n, &alpha, A, lda, x, this->m, &beta, y, this->n); -} - -/// Color and Reorder -template -void DenseMatrix::color(IndexType_ *c, IndexType_ *p) const -{ -} - -template -void DenseMatrix::reorder(IndexType_ *p) const -{ -} - -/// Incomplete Cholesky (setup, factor and solve) -template -void DenseMatrix::prec_setup(Matrix *_M) -{ - printf("ERROR: DenseMatrix prec_setup dispacthed\n"); - // exit(1); -} - -template -void DenseMatrix::prec_solve(IndexType_ k, - ValueType_ alpha, - ValueType_ *__restrict__ fx, - ValueType_ *__restrict__ t) const -{ - printf("ERROR: DenseMatrix prec_solve dispacthed\n"); - // exit(1); -} - -template -ValueType_ DenseMatrix::getEdgeSum() const -{ - return 0.0; -} - // ============================================= // CSR matrix class // ============================================= @@ -657,7 +523,7 @@ CsrMatrix::CsrMatrix(bool _trans, csrRowPtrA(_csrRowPtrA), csrColIndA(_csrColIndA) { - if (nnz < 0) FatalError("invalid CSR matrix parameter (nnz<0)", NVGRAPH_ERR_BAD_PARAMETERS); + RAFT_EXPECT(nnz >= 0, "invalid CSR matrix parameter (nnz<0)"); Cusparse::set_pointer_mode_host(); } @@ -857,19 +723,17 @@ LaplacianMatrix::LaplacianMatrix( : Matrix(_A.m, _A.n), A(&_A) { // Check that adjacency matrix is square - if (_A.m != _A.n) - FatalError("cannot construct Laplacian matrix from non-square adjacency matrix", - NVGRAPH_ERR_BAD_PARAMETERS); + RAFT_EXPECT(_A.m == _A.n, "cannot construct Laplacian matrix from non-square adjacency matrix"); // set CUDA stream - this->s = NULL; + this->s = nullptr; // Construct degree matrix D.allocate(_A.m, this->s); Vector ones(this->n, this->s); ones.fill(1.0); _A.mv(1, ones.raw(), 0, D.raw()); - // Set preconditioning matrix pointer to NULL - M = NULL; + // Set preconditioning matrix pointer to nullptr + M = nullptr; } /// Destructor for Laplacian matrix class @@ -885,7 +749,7 @@ void LaplacianMatrix::setCUDAStream(cudaStream_t _s) this->s = _s; // printf("LaplacianMatrix setCUDAStream stream=%p\n",this->s); A->setCUDAStream(_s); - if (M != NULL) { M->setCUDAStream(_s); } + if (M != nullptr) { M->setCUDAStream(_s); } } template void 
LaplacianMatrix::getCUDAStream(cudaStream_t *_s) @@ -1004,9 +868,9 @@ void LaplacianMatrix::prec_setup(Matrixprec_setup(NULL); + M->prec_setup(nullptr); } } @@ -1016,7 +880,7 @@ void LaplacianMatrix::prec_solve(IndexType_ k, ValueType_ *__restrict__ fx, ValueType_ *__restrict__ t) const { - if (M != NULL) { + if (M != nullptr) { // preconditioning M->prec_solve(k, alpha, fx, t); } @@ -1040,12 +904,10 @@ ModularityMatrix::ModularityMatrix( : Matrix(_A.m, _A.n), A(&_A), nnz(_nnz) { // Check that adjacency matrix is square - if (_A.m != _A.n) - FatalError("cannot construct Modularity matrix from non-square adjacency matrix", - NVGRAPH_ERR_BAD_PARAMETERS); + RAFT_EXPECT(_A.m == _A.n, "cannot construct Modularity matrix from non-square adjacency matrix"); // set CUDA stream - this->s = NULL; + this->s = nullptr; // Construct degree matrix D.allocate(_A.m, this->s); Vector ones(this->n, this->s); @@ -1054,8 +916,8 @@ ModularityMatrix::ModularityMatrix( // D.dump(0,this->n); edge_sum = D.nrm1(); - // Set preconditioning matrix pointer to NULL - M = NULL; + // Set preconditioning matrix pointer to nullptr + M = nullptr; } /// Destructor for Modularity matrix class @@ -1071,7 +933,7 @@ void ModularityMatrix::setCUDAStream(cudaStream_t _s) { this->s = _s; // printf("ModularityMatrix setCUDAStream stream=%p\n",this->s); A->setCUDAStream(_s); - if (M != NULL) { M->setCUDAStream(_s); } + if (M != nullptr) { M->setCUDAStream(_s); } } template @@ -1096,9 +958,7 @@ void ModularityMatrix::mv(ValueType_ alpha, ValueType_ *__restrict__ y) const { // Scale result vector - if (alpha != 1 || beta != 0) - FatalError("This isn't implemented for Modularity Matrix currently", - NVGRAPH_ERR_NOT_IMPLEMENTED); + RAFT_EXPECT(alpha == 1 && beta == 0, "Functionality not currently supported in Modularity Matrix."); // CHECK_CUBLAS(cublasXdot(handle, this->n, const double *x, int incx, const double *y, int incy, // double *result)); @@ -1125,7 +985,7 @@ void ModularityMatrix::mm(IndexType_ k, ValueType_ beta, ValueType_ *__restrict__ y) const { - FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); + RAFT_FAIL("Functionality not currently supported in Modularity Matrix."); } template @@ -1135,20 +995,20 @@ void ModularityMatrix::dm(IndexType_ k, ValueType_ beta, ValueType_ *__restrict__ y) const { - FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); + RAFT_FAIL("Functionality not currently supported in Modularity Matrix."); } /// Color and Reorder template void ModularityMatrix::color(IndexType_ *c, IndexType_ *p) const { - FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); + RAFT_FAIL("Functionality not currently supported in Modularity Matrix."); } template void ModularityMatrix::reorder(IndexType_ *p) const { - FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); + RAFT_FAIL("Functionality not currently supported in Modularity Matrix."); } /// Solve preconditioned system M x = f for a set of k vectors @@ -1157,9 +1017,9 @@ void ModularityMatrix::prec_setup(Matrixprec_setup(NULL); + M->prec_setup(nullptr); } } @@ -1169,10 +1029,7 @@ void ModularityMatrix::prec_solve(IndexType_ k, ValueType_ *__restrict__ fx, ValueType_ *__restrict__ t) const { - if (M != NULL) { - FatalError("This isn't implemented for Modularity Matrix currently", - NVGRAPH_ERR_NOT_IMPLEMENTED); - } + RAFT_EXPECT(M == nullptr, "Functionality not currently
supported in Modularity Matrix."); } template From f3e8862d30491386bdf2c3f0814ac42584a35a5d Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 3 Jun 2020 13:20:23 -0500 Subject: [PATCH 065/189] Clean-up of some error checking. --- cpp/include/raft/spectral/spectral_matrix.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/include/raft/spectral/spectral_matrix.hpp b/cpp/include/raft/spectral/spectral_matrix.hpp index 86ef0ec89c..eda7af120c 100644 --- a/cpp/include/raft/spectral/spectral_matrix.hpp +++ b/cpp/include/raft/spectral/spectral_matrix.hpp @@ -55,7 +55,7 @@ namespace matrix { { thrust::device_ptr dev_ptr(vec); *res = thrust::reduce(dev_ptr, dev_ptr + n); - cudaCheckError(); + CUDA_CHECK_LAST(); } template @@ -63,7 +63,7 @@ namespace matrix { { thrust::device_ptr dev_ptr(vec); thrust::fill(dev_ptr, dev_ptr + n, value); - cudaCheckError(); + CUDA_CHECK_LAST(); } template @@ -75,7 +75,7 @@ namespace matrix { std::cout << "sample size = " << n << ", offset = " << offset << std::endl; thrust::copy( dev_ptr + offset, dev_ptr + offset + n, std::ostream_iterator(std::cout, " ")); - cudaCheckError(); + CUDA_CHECK_LAST(); std::cout << std::endl; #endif } @@ -120,7 +120,7 @@ namespace matrix { // COUT() << "copy "<< n << " elements" << std::endl; #endif thrust::copy_n(dev_ptr, n, res_ptr); - cudaCheckError(); + CUDA_CHECK_LAST(); // dump_raw_vec (res, n, 0); } @@ -134,7 +134,7 @@ namespace matrix { int n = static_cast(num_vertices); int num_blocks = std::min(max_grid_size, (n / (items_per_thread * num_threads)) + 1); flag_zeroes_kernel<<>>(num_vertices, vec, flags); - cudaCheckError(); + CUDA_CHECK_LAST(); } template @@ -159,7 +159,7 @@ namespace matrix { else if (beta == 1.0) dmv1_kernel<<>>(D, x, y, n); - cudaCheckError(); + CUDA_CHECK_LAST(); } template @@ -172,7 +172,7 @@ namespace matrix { { fill_raw_vec(res, n, unreachable_val); cudaMemcpy(&res[root], &self_loop_val, sizeof(self_loop_val), cudaMemcpyHostToDevice); - cudaCheckError(); + CUDA_CHECK_LAST(); } @@ -791,7 +791,7 @@ void LaplacianMatrix::mv(ValueType_ alpha, blockDim.y = 1; blockDim.z = 1; diagmv<<s>>>(this->n, alpha, D.raw(), x, y); - cudaCheckError(); + CUDA_CHECK_LAST(); // Apply adjacency matrix A->mv(-alpha, x, 1, y); @@ -848,7 +848,7 @@ void LaplacianMatrix::dm(IndexType_ k, diagmm <<s>>>(this->n, k, alpha, D.raw(), x, beta, y); } - cudaCheckError(); + CUDA_CHECK_LAST(); } /// Color and Reorder From 56ddbb5d08c51960a7cbdb3328fcf8f9d1a53699 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 3 Jun 2020 16:39:51 -0500 Subject: [PATCH 066/189] Pulling GraphCSRView from cugraph. --- cpp/include/raft/graph.hpp | 575 ++++++++++++++++++ cpp/include/raft/spectral/spectral_matrix.hpp | 2 + 2 files changed, 577 insertions(+) create mode 100644 cpp/include/raft/graph.hpp diff --git a/cpp/include/raft/graph.hpp b/cpp/include/raft/graph.hpp new file mode 100644 index 0000000000..d7b1a2838a --- /dev/null +++ b/cpp/include/raft/graph.hpp @@ -0,0 +1,575 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#include + +#include +#include + +namespace cugraph { +namespace experimental { + +enum class PropType { PROP_UNDEF, PROP_FALSE, PROP_TRUE }; + +struct GraphProperties { + bool directed{false}; + bool weighted{false}; + bool multigraph{false}; + bool bipartite{false}; + bool tree{false}; + PropType has_negative_edges{PropType::PROP_UNDEF}; + GraphProperties() = default; +}; + +enum class DegreeDirection { + IN_PLUS_OUT = 0, ///> Compute sum of in and out degree + IN, ///> Compute in degree + OUT, ///> Compute out degree + DEGREE_DIRECTION_COUNT +}; + +/** + * @brief Base class graphs, all but vertices and edges + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphViewBase { + public: + WT *edge_data; ///< edge weight + Comm comm; + + GraphProperties prop; + + VT number_of_vertices; + ET number_of_edges; + + /** + * @brief Fill the identifiers array with the vertex identifiers. + * + * @param[out] identifier Pointer to device memory to store the vertex + * identifiers + */ + void get_vertex_identifiers(VT *identifiers) const; + void set_communicator(Comm &comm_) { comm = comm_; } + + GraphViewBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + : edge_data(edge_data_), + comm(), + prop(), + number_of_vertices(number_of_vertices_), + number_of_edges(number_of_edges_) + { + } + bool has_data(void) const { return edge_data != nullptr; } +}; + +/** + * @brief A graph stored in COO (COOrdinate) format. + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCOOView : public GraphViewBase { + public: + VT *src_indices{nullptr}; ///< rowInd + VT *dst_indices{nullptr}; ///< colInd + + /** + * @brief Computes degree(in, out, in+out) of all the nodes of a Graph + * + * @throws cugraph::logic_error when an error occurs. + * + * @param[out] degree Device array of size V (V is number of vertices) initialized + * to zeros. Will contain the computed degree of every vertex. + * @param[in] direction IN_PLUS_OUT, IN or OUT + */ + void degree(ET *degree, DegreeDirection direction) const; + + /** + * @brief Default constructor + */ + GraphCOOView() : GraphViewBase(nullptr, 0, 0) {} + + /** + * @brief Wrap existing arrays representing an edge list in a Graph. + * + * GraphCOOView does not own the memory used to represent this graph. This + * function does not allocate memory. + * + * @param source_indices This array of size E (number of edges) contains the index of the + * source for each edge. Indices must be in the range [0, V-1]. + * @param destination_indices This array of size E (number of edges) contains the index of the + * destination for each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array size E (number of edges) contains the weight for each + * edge. This array can be null in which case the graph is considered unweighted. 
+ * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCOOView( + VT *src_indices_, VT *dst_indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + : GraphViewBase(edge_data_, number_of_vertices_, number_of_edges_), + src_indices(src_indices_), + dst_indices(dst_indices_) + { + } +}; + +/** + * @brief Base class for graph stored in CSR (Compressed Sparse Row) format or CSC (Compressed + * Sparse Column) format + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCompressedSparseBaseView : public GraphViewBase { + public: + ET *offsets{nullptr}; ///< CSR offsets + VT *indices{nullptr}; ///< CSR indices + + /** + * @brief Fill the identifiers in the array with the source vertex + * identifiers + * + * @param[out] src_indices Pointer to device memory to store the + * source vertex identifiers + */ + void get_source_indices(VT *src_indices) const; + + /** + * @brief Computes degree(in, out, in+out) of all the nodes of a Graph + * + * @throws cugraph::logic_error when an error occurs. + * + * @param[out] degree Device array of size V (V is number of vertices) initialized + * to zeros. Will contain the computed degree of every vertex. + * @param[in] x Integer value indicating type of degree calculation + * 0 : in+out degree + * 1 : in-degree + * 2 : out-degree + */ + void degree(ET *degree, DegreeDirection direction) const; + + /** + * @brief Wrap existing arrays representing adjacency lists in a Graph. + * GraphCSRView does not own the memory used to represent this graph. This + * function does not allocate memory. + * + * @param offsets This array of size V+1 (V is number of vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * edges). + * @param indices This array of size E contains the index of the destination for + * each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array of size E (number of edges) contains the weight for + * each edge. This array can be null in which case the graph is considered unweighted. + * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCompressedSparseBaseView( + ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + : GraphViewBase(edge_data_, number_of_vertices_, number_of_edges_), + offsets{offsets_}, + indices{indices_} + { + } +}; + +/** + * @brief A graph stored in CSR (Compressed Sparse Row) format. + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCSRView : public GraphCompressedSparseBaseView { + public: + /** + * @brief Default constructor + */ + GraphCSRView() : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, 0) {} + + /** + * @brief Wrap existing arrays representing adjacency lists in a Graph. + * GraphCSRView does not own the memory used to represent this graph. This + * function does not allocate memory. + * + * @param offsets This array of size V+1 (V is number of vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * edges). + * @param indices This array of size E contains the index of the destination for + * each edge. Indices must be in the range [0, V-1]. 
+ * @param edge_data This array of size E (number of edges) contains the weight for + * each edge. This array can be null in which case the graph is considered unweighted. + * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCSRView( + ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + : GraphCompressedSparseBaseView( + offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) + { + } +}; + +/** + * @brief A graph stored in CSC (Compressed Sparse Column) format. + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCSCView : public GraphCompressedSparseBaseView { + public: + /** + * @brief Default constructor + */ + GraphCSCView() : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, 0) {} + + /** + * @brief Wrap existing arrays representing transposed adjacency lists in a Graph. + * GraphCSCView does not own the memory used to represent this graph. This + * function does not allocate memory. + * + * @param offsets This array of size V+1 (V is number of vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * edges). + * @param indices This array of size E contains the index of the destination for + * each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array of size E (number of edges) contains the weight for + * each edge. This array can be null in which case the graph is considered unweighted. + * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCSCView( + ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + : GraphCompressedSparseBaseView( + offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) + { + } +}; + +/** + * @brief TODO : Change this Take ownership of the provided graph arrays in COO format + * + * @param source_indices This array of size E (number of edges) contains the index of the + * source for each edge. Indices must be in the range [0, V-1]. + * @param destination_indices This array of size E (number of edges) contains the index of the + * destination for each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array size E (number of edges) contains the weight for each + * edge. This array can be null in which case the graph is considered unweighted. + * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ +template +struct GraphCOOContents { + VT number_of_vertices; + ET number_of_edges; + std::unique_ptr src_indices; + std::unique_ptr dst_indices; + std::unique_ptr edge_data; +}; + +/** + * @brief A constructed graph stored in COO (COOrdinate) format. + * + * This class will own src_indices and dst_indices (until moved) + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCOO { + VT number_of_vertices_; + ET number_of_edges_; + rmm::device_buffer src_indices_{}; ///< rowInd + rmm::device_buffer dst_indices_{}; ///< colInd + rmm::device_buffer edge_data_{}; ///< CSR data + + public: + /** + * @brief Take ownership of the provided graph arrays in COO format + * + * @param source_indices This array of size E (number of edges) contains the index of the + * source for each edge.
Indices must be in the range [0, V-1]. + * @param destination_indices This array of size E (number of edges) contains the index of the + * destination for each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array size E (number of edges) contains the weight for each + * edge. This array can be null in which case the graph is considered unweighted. + * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCOO(VT number_of_vertices, + ET number_of_edges, + bool has_data = false, + cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + : number_of_vertices_(number_of_vertices), + number_of_edges_(number_of_edges), + src_indices_(sizeof(VT) * number_of_edges, stream, mr), + dst_indices_(sizeof(VT) * number_of_edges, stream, mr), + edge_data_((has_data ? sizeof(WT) * number_of_edges : 0), stream, mr) + { + } + + GraphCOO(GraphCOOView const &graph, + cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + : number_of_vertices_(graph.number_of_vertices), + number_of_edges_(graph.number_of_edges), + src_indices_(graph.src_indices, graph.number_of_edges * sizeof(VT), stream, mr), + dst_indices_(graph.dst_indices, graph.number_of_edges * sizeof(VT), stream, mr) + { + if (graph.has_data()) { + edge_data_ = + rmm::device_buffer{graph.edge_data, graph.number_of_edges * sizeof(WT), stream, mr}; + } + } + + VT number_of_vertices(void) { return number_of_vertices_; } + ET number_of_edges(void) { return number_of_edges_; } + VT *src_indices(void) { return static_cast(src_indices_.data()); } + VT *dst_indices(void) { return static_cast(dst_indices_.data()); } + WT *edge_data(void) { return static_cast(edge_data_.data()); } + + GraphCOOContents release() noexcept + { + VT number_of_vertices = number_of_vertices_; + ET number_of_edges = number_of_edges_; + number_of_vertices_ = 0; + number_of_edges_ = 0; + return GraphCOOContents{ + number_of_vertices, + number_of_edges, + std::make_unique(std::move(src_indices_)), + std::make_unique(std::move(dst_indices_)), + std::make_unique(std::move(edge_data_))}; + } + + GraphCOOView view(void) noexcept + { + return GraphCOOView( + src_indices(), dst_indices(), edge_data(), number_of_vertices_, number_of_edges_); + } + + bool has_data(void) { return nullptr != edge_data_.data(); } +}; + +template +struct GraphSparseContents { + VT number_of_vertices; + ET number_of_edges; + std::unique_ptr offsets; + std::unique_ptr indices; + std::unique_ptr edge_data; +}; + +/** + * @brief Base class for constructed graphs stored in CSR (Compressed Sparse Row) format or + * CSC (Compressed Sparse Column) format + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCompressedSparseBase { + VT number_of_vertices_{0}; + ET number_of_edges_{0}; + rmm::device_buffer offsets_{}; ///< CSR offsets + rmm::device_buffer indices_{}; ///< CSR indices + rmm::device_buffer edge_data_{}; ///< CSR data + + bool has_data_{false}; + + public: + /** + * @brief Take ownership of the provided graph arrays in CSR/CSC format + * + * @param offsets This array of size V+1 (V is number of vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * edges). + * @param indices This array of size E contains the index of the destination for + * each edge.
Indices must be in the range [0, V-1]. + * @param edge_data This array of size E (number of edges) contains the weight for + * each edge. This array can be null in which case the graph is considered unweighted. + * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCompressedSparseBase(VT number_of_vertices, + ET number_of_edges, + bool has_data, + cudaStream_t stream, + rmm::mr::device_memory_resource *mr) + : number_of_vertices_(number_of_vertices), + number_of_edges_(number_of_edges), + offsets_(sizeof(ET) * (number_of_vertices + 1), stream, mr), + indices_(sizeof(VT) * number_of_edges, stream, mr), + edge_data_((has_data ? sizeof(WT) * number_of_edges : 0), stream, mr) + { + } + + GraphCompressedSparseBase(GraphSparseContents &&contents) + : number_of_vertices_(contents.number_of_vertices), + number_of_edges_(contents.number_of_edges), + offsets_(std::move(*contents.offsets.release())), + indices_(std::move(*contents.indices.release())), + edge_data_(std::move(*contents.edge_data.release())) + { + } + + VT number_of_vertices(void) { return number_of_vertices_; } + ET number_of_edges(void) { return number_of_edges_; } + ET *offsets(void) { return static_cast(offsets_.data()); } + VT *indices(void) { return static_cast(indices_.data()); } + WT *edge_data(void) { return static_cast(edge_data_.data()); } + + GraphSparseContents release() noexcept + { + VT number_of_vertices = number_of_vertices_; + ET number_of_edges = number_of_edges_; + number_of_vertices_ = 0; + number_of_edges_ = 0; + return GraphSparseContents{ + number_of_vertices, + number_of_edges, + std::make_unique(std::move(offsets_)), + std::make_unique(std::move(indices_)), + std::make_unique(std::move(edge_data_))}; + } + + bool has_data(void) { return nullptr != edge_data_.data(); } +}; + +/** + * @brief A constructed graph stored in CSR (Compressed Sparse Row) format. + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCSR : public GraphCompressedSparseBase { + public: + /** + * @brief Default constructor + */ + GraphCSR() : GraphCompressedSparseBase() {} + + /** + * @brief Take ownership of the provided graph arrays in CSR format + * + * @param offsets This array of size V+1 (V is number of vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * edges). + * @param indices This array of size E contains the index of the destination for + * each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array of size E (number of edges) contains the weight for + * each edge. This array can be null in which case the graph is considered unweighted. 
+ * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCSR(VT number_of_vertices_, + ET number_of_edges_, + bool has_data_ = false, + cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + : GraphCompressedSparseBase( + number_of_vertices_, number_of_edges_, has_data_, stream, mr) + { + } + + GraphCSR(GraphSparseContents &&contents) + : GraphCompressedSparseBase(std::move(contents)) + { + } + + GraphCSRView view(void) noexcept + { + return GraphCSRView(GraphCompressedSparseBase::offsets(), + GraphCompressedSparseBase::indices(), + GraphCompressedSparseBase::edge_data(), + GraphCompressedSparseBase::number_of_vertices(), + GraphCompressedSparseBase::number_of_edges()); + } +}; + +/** + * @brief A constructed graph stored in CSC (Compressed Sparse Column) format. + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCSC : public GraphCompressedSparseBase { + public: + /** + * @brief Default constructor + */ + GraphCSC() : GraphCompressedSparseBase() {} + + /** + * @brief Take ownership of the provided graph arrays in CSR format + * + * @param offsets This array of size V+1 (V is number of vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * edges). + * @param indices This array of size E contains the index of the destination for + * each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array of size E (number of edges) contains the weight for + * each edge. This array can be null in which case the graph is considered unweighted. + * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCSC(VT number_of_vertices_, + ET number_of_edges_, + bool has_data_ = false, + cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + : GraphCompressedSparseBase( + number_of_vertices_, number_of_edges_, has_data_, stream, mr) + { + } + + GraphCSC(GraphSparseContents &&contents) + : GraphCompressedSparseBase(contents) + { + } + + GraphCSCView view(void) noexcept + { + return GraphCSCView(GraphCompressedSparseBase::offsets(), + GraphCompressedSparseBase::indices(), + GraphCompressedSparseBase::edge_data(), + GraphCompressedSparseBase::number_of_vertices(), + GraphCompressedSparseBase::number_of_edges()); + } +}; + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/raft/spectral/spectral_matrix.hpp b/cpp/include/raft/spectral/spectral_matrix.hpp index eda7af120c..b9186329d3 100644 --- a/cpp/include/raft/spectral/spectral_matrix.hpp +++ b/cpp/include/raft/spectral/spectral_matrix.hpp @@ -21,6 +21,8 @@ // #include // #include +#include + #include #include #include From aa058ef3aef9ae214445ba24caaf77875ca9fe88 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 4 Jun 2020 13:04:57 -0500 Subject: [PATCH 067/189] Matrix replacements for nvgraph Matrix types. 
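sparse_matrix_t below is a thin, non-owning aggregate over existing CSR arrays (or a GraphCSRView), intended to replace the nvgraph Matrix hierarchy. A hedged usage sketch, assuming device pointers row_offsets/col_indices/weights describing an n x n CSR matrix with nnz entries and device vectors x/y (all hypothetical names; mv is still a stub until the cusparse csrmv call lands):

    // No allocation or copy: the wrapper just aliases the caller's arrays.
    raft::matrix::sparse_matrix_t<int, float> A{row_offsets, col_indices,
                                                weights, nnz, n};

    // Intended semantics once implemented: y = 1.0f * A * x + 0.0f * y,
    // with a thrust execution policy and an optional stream.
    A.mv(1.0f, x, 0.0f, y, thrust::device, stream);

Keeping the aggregate non-owning lets the same wrapper sit on top of a GraphCSRView without touching its memory.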
--- cpp/include/raft/graph.hpp | 8 +- cpp/include/raft/spectral/matrix_wrappers.hpp | 76 +++++++++++++++++++ 2 files changed, 80 insertions(+), 4 deletions(-) create mode 100644 cpp/include/raft/spectral/matrix_wrappers.hpp diff --git a/cpp/include/raft/graph.hpp b/cpp/include/raft/graph.hpp index d7b1a2838a..8e72572764 100644 --- a/cpp/include/raft/graph.hpp +++ b/cpp/include/raft/graph.hpp @@ -22,8 +22,8 @@ #include #include -namespace cugraph { -namespace experimental { +namespace raft { +namespace matrix { enum class PropType { PROP_UNDEF, PROP_FALSE, PROP_TRUE }; @@ -571,5 +571,5 @@ class GraphCSC : public GraphCompressedSparseBase { } }; -} // namespace experimental -} // namespace cugraph +} // namespace matrix +} // namespace raft diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp new file mode 100644 index 0000000000..be6b58a8cf --- /dev/null +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace raft{ +namespace matrix { + +using size_type = int; // for now; TODO: move it in appropriate header + +template +struct sparse_matrix_t { + sparse_matrix_t(index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + index_type const nnz, + index_type const nrows) : + row_offsets_(row_offsets), + col_indices_(col_indices), + values_(values), + nrows_(nrows), + nnz_(nnz) + { + } + + sparse_matrix_t(const GraphCSRView& csr_view): + row_offsets_(csr_view.offsets_), + col_indices_(csr_view.indices_), + values_(csr_view.edge_data_), + nrows_(csr_view.number_of_vertices_), + nnz_(csr_view.number_of_edges_) + { + } + + + virtual ~sparse_matrix_t(void) = default; // virtual because used as base for following matrix types + + // y = alpha*A*x + beta*y + // + template + void mv(value_type alpha, + value_type const* __restrict__ x, + value_type beta, + value_type* __restrict__ y, + exe_policy_t&& policy, + cudaStream_t stream = nullptr) const + { + //TODO: call cusparse::csrmv + } + + //private: // maybe not, keep this ASAP ("as simple as possible"); hence, aggregate + + index_type const* row_offsets_; + index_type const* col_indices_; + value_type const* values_; // TODO: const-ness of this is debatable; cusparse primitives may not accept it... + index_type const nrows_; + index_type const nnz_; +}; + +} // namespace matrix +} // namespace raft From 61c669c2c88a03b276fc6c87bf2a11c5e21af938 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 4 Jun 2020 17:30:29 -0500 Subject: [PATCH 068/189] Refactoring of Lanczos algorithms (except AllocatableVector). More error control placeholders. More matrix wrappers. 
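The Lanczos entry points now take a raft::matrix::sparse_matrix_t and return a plain int status (0 on success) instead of NVGRAPH_ERROR, and the RNG seed becomes a parameter. A hedged call sketch for the high-level overload, with hypothetical device buffers (AllocatableVector and handle-based allocation are still TODO in this patch):

    // Smallest k = 4 eigenpairs of an n x n symmetric CSR matrix.
    raft::matrix::sparse_matrix_t<int, double> L{row_offsets, col_indices,
                                                 values, nnz, n};
    int iters = 0;
    int status = raft::computeSmallestEigenvectors(
      L, /*nEigVecs=*/4, /*maxIter=*/100, /*restartIter=*/40, /*tol=*/1e-6,
      /*reorthogonalize=*/false, iters, eigVals_dev, eigVecs_dev);

    // eigVals_dev holds 4 values; eigVecs_dev is n x 4, column-major.

Per the RAFT_EXPECT checks below, maxIter and restartIter must both be at least nEigVecs, and tol must be positive.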
--- cpp/include/raft/spectral/error_temp.hpp | 14 + cpp/include/raft/spectral/lanczos.hpp | 461 +++++++----------- cpp/include/raft/spectral/matrix_wrappers.hpp | 50 +- 3 files changed, 221 insertions(+), 304 deletions(-) diff --git a/cpp/include/raft/spectral/error_temp.hpp b/cpp/include/raft/spectral/error_temp.hpp index 0cd58b1769..f8dabf994b 100644 --- a/cpp/include/raft/spectral/error_temp.hpp +++ b/cpp/include/raft/spectral/error_temp.hpp @@ -13,3 +13,17 @@ #define CUDA_TRY(call) #define CUDA_CHECK_LAST() + +#ifdef DEBUG +#define COUT() (std::cout) +#define CERR() (std::cerr) +#define WARNING(message) \ + do { \ + std::stringstream ss; \ + ss << "Warning (" << __FILE__ << ":" << __LINE__ << "): " << message; \ + CERR() << ss.str() << std::endl; \ + } while (0) +#else // DEBUG +#define WARNING(message) +#endif + diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index ad49be1c05..2cc9f002d1 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,25 +14,20 @@ * limitations under the License. */ -//#ifdef NVGRAPH_PARTITION +#pragma once +//for cmath: #define _USE_MATH_DEFINES -#include -#include "include/lanczos.hxx" -#include -#include +#include #include #include #include -#include "include/debug_macros.h" -#include "include/nvgraph_cublas.hxx" -#include "include/nvgraph_error.hxx" -#include "include/nvgraph_lapack.hxx" -#include "include/nvgraph_vector.hxx" -#include "include/nvgraph_vector_kernels.hxx" +#include +#include + // ========================================================= // Useful macros // ========================================================= @@ -40,10 +35,12 @@ // Get index of matrix entry #define IDX(i, j, lda) ((i) + (j) * (lda)) -namespace nvgraph { +namespace raft { namespace { +using namespace matrix; + // ========================================================= // Helper functions // ========================================================= @@ -75,16 +72,16 @@ namespace { * @return Zero if successful. Otherwise non-zero. 
*/ template -static int performLanczosIteration(const Matrix *A, - IndexType_ *iter, - IndexType_ maxIter, - ValueType_ shift, - ValueType_ tol, - bool reorthogonalize, - ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev) +int performLanczosIteration(sparse_matrix_t const* A, + IndexType_ *iter, + IndexType_ maxIter, + ValueType_ shift, + ValueType_ tol, + bool reorthogonalize, + ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, + ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev) { // ------------------------------------------------------- // Variable declaration @@ -95,7 +92,7 @@ static int performLanczosIteration(const Matrix *A, const ValueType_ negOne = -1; const ValueType_ zero = 0; - IndexType_ n = A->n; + IndexType_ n = A->nrows; // ------------------------------------------------------- // Compute second Lanczos vector @@ -105,7 +102,7 @@ static int performLanczosIteration(const Matrix *A, // Apply matrix if (shift != 0) - CHECK_CUDA(cudaMemcpyAsync( + CUDA_TRY(cudaMemcpyAsync( lanczosVecs_dev + n, lanczosVecs_dev, n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); @@ -130,7 +127,7 @@ static int performLanczosIteration(const Matrix *A, // Apply matrix if (shift != 0) - CHECK_CUDA(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); @@ -161,7 +158,7 @@ static int performLanczosIteration(const Matrix *A, &one, lanczosVecs_dev + IDX(0, *iter, n), 1); - CHECK_CUDA(cudaMemcpyAsync(alpha_host + (*iter - 1), + CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1), sizeof(ValueType_), cudaMemcpyDeviceToHost)); @@ -220,7 +217,7 @@ static int performLanczosIteration(const Matrix *A, Cublas::scal(n, 1 / beta_host[*iter - 1], lanczosVecs_dev + IDX(0, *iter, n), 1); } - CHECK_CUDA(cudaDeviceSynchronize()); + CUDA_TRY(cudaDeviceSynchronize()); return 0; } @@ -558,7 +555,7 @@ static int lanczosRestart(IndexType_ n, WARNING("error in implicitly shifted QR algorithm"); // Obtain new residual - CHECK_CUDA( + CUDA_TRY( cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(ValueType_), cudaMemcpyHostToDevice)); beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; @@ -578,11 +575,11 @@ static int lanczosRestart(IndexType_ n, Cublas::gemm( false, false, n, iter_new, iter, &one, lanczosVecs_dev, n, V_dev, iter, &zero, work_dev, n); - CHECK_CUDA(cudaMemcpyAsync( + CUDA_TRY(cudaMemcpyAsync( lanczosVecs_dev, work_dev, n * iter_new * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); // Normalize residual to obtain new Lanczos vector - CHECK_CUDA(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); @@ -592,7 +589,7 @@ static int lanczosRestart(IndexType_ n, return 0; } -} // namespace +} // anonym. namespace // ========================================================= // Eigensolver @@ -642,24 +639,25 @@ static int lanczosRestart(IndexType_ n, * Eigenvectors corresponding to smallest eigenvalues of * matrix. Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. - * @return NVGRAPH error flag. + * @return error flag. 
*/ template -NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix *A, - IndexType_ nEigVecs, - IndexType_ maxIter, - IndexType_ restartIter, - ValueType_ tol, - bool reorthogonalize, - IndexType_ *effIter, - IndexType_ *totalIter, - ValueType_ *shift, - ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev, - ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev) +int computeSmallestEigenvectors(sparse_matrix_t const* A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ *effIter, + IndexType_ *totalIter, + ValueType_ *shift, + ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, + ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev, + ValueType_ *__restrict__ eigVals_dev, + ValueType_ *__restrict__ eigVecs_dev, + unsigned long long seed) { // ------------------------------------------------------- // Variable declaration @@ -670,7 +668,7 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * const ValueType_ zero = 0; // Matrix dimension - IndexType_ n = A->n; + IndexType_ n = A->nrows; // Shift for implicit restart ValueType_ shiftUpper; @@ -697,34 +695,12 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - if (A->m != A->n) { - WARNING("invalid parameter (matrix is not square)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs < 1) { - WARNING("invalid parameter (nEigVecs<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (restartIter < 1) { - WARNING("invalid parameter (restartIter<4)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (tol < 0) { - WARNING("invalid parameter (tol<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs > n) { - WARNING("invalid parameters (nEigVecs>n)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (maxIter < nEigVecs) { - WARNING("invalid parameters (maxIter 0 && nEigVecs<=n, "Invalid number of eigenvectors."); + RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECT(tol > 0, "Invalid tolerance."); + RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); + // ------------------------------------------------------- // Variable initialization @@ -750,15 +726,15 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * // Random number generator curandGenerator_t randGen; // Initialize random number generator - CHECK_CURAND(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); + CUDA_TRY(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); // FIXME: This is hard coded, which is good for unit testing... 
// but should really be a parameter so it could be // "random" for real runs and "fixed" for tests - CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, 1234567 /*time(NULL)*/)); - // CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, time(NULL))); + CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, seed /*time(NULL)*/)); + // CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, time(NULL))); // Initialize initial Lanczos vector - CHECK_CURAND(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); + CUDA_TRY(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); ValueType_ normQ1 = Cublas::nrm2(n, lanczosVecs_dev, 1); Cublas::scal(n, 1 / normQ1, lanczosVecs_dev, 1); @@ -877,7 +853,7 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory - CHECK_CUDA(cudaMemcpy(eigVals_dev, + CUDA_TRY(cudaMemcpy(eigVals_dev, work_host + 2 * (*effIter), nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); @@ -885,7 +861,7 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * //{ // std::cout <<*(work_host+(2*(*effIter)+i))<< std::endl; //} - CHECK_CUDA(cudaMemcpy( + CUDA_TRY(cudaMemcpy( work_dev, Z_host, (*effIter) * nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); // Convert eigenvectors from Lanczos basis to standard basis @@ -904,8 +880,8 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * n); // Clean up and exit - CHECK_CURAND(curandDestroyGenerator(randGen)); - return NVGRAPH_OK; + CUDA_TRY(curandDestroyGenerator(randGen)); + return 0; } /// Compute smallest eigenvectors of symmetric matrix @@ -942,55 +918,30 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * * Eigenvectors corresponding to smallest eigenvalues of * matrix. Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. - * @return NVGRAPH error flag. + * @return error flag. 
*/ template -NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix &A, - IndexType_ nEigVecs, - IndexType_ maxIter, - IndexType_ restartIter, - ValueType_ tol, - bool reorthogonalize, - IndexType_ &iter, - ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev) +int computeSmallestEigenvectors(sparse_matrix_t const& A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ &iter, + ValueType_ *__restrict__ eigVals_dev, + ValueType_ *__restrict__ eigVecs_dev, + unsigned long long seed = 1234567, + cudaStream_t stream = 0) { - // CUDA stream - // TODO: handle non-zero streams - cudaStream_t stream = 0; - // Matrix dimension - IndexType_ n = A.n; + IndexType_ n = A.nrows; // Check that parameters are valid - if (A.m != A.n) { - WARNING("invalid parameter (matrix is not square)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs < 1) { - WARNING("invalid parameter (nEigVecs<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (restartIter < 1) { - WARNING("invalid parameter (restartIter<4)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (tol < 0) { - WARNING("invalid parameter (tol<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs > n) { - WARNING("invalid parameters (nEigVecs>n)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (maxIter < nEigVecs) { - WARNING("invalid parameters (maxIter 0 && nEigVecs<=n, "Invalid number of eigenvectors."); + RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECT(tol > 0, "Invalid tolerance."); + RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); // Allocate memory std::vector alpha_host_v(restartIter); @@ -999,27 +950,29 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix & ValueType_ *alpha_host = alpha_host_v.data(); ValueType_ *beta_host = beta_host_v.data(); - Vector lanczosVecs_dev(n * (restartIter + 1), stream); - Vector work_dev((n + restartIter) * restartIter, stream); + //TODO: replace and fix allocation via RAFT handle + AllocatableVector lanczosVecs_dev(n * (restartIter + 1), stream); + AllocatableVector work_dev((n + restartIter) * restartIter, stream); // Perform Lanczos method IndexType_ effIter; ValueType_ shift; - NVGRAPH_ERROR status = computeSmallestEigenvectors(&A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - &effIter, - &iter, - &shift, - alpha_host, - beta_host, - lanczosVecs_dev.raw(), - work_dev.raw(), - eigVals_dev, - eigVecs_dev); + int status = computeSmallestEigenvectors(&A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + &shift, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; @@ -1068,23 +1021,24 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix & * Eigenvectors corresponding to largest eigenvalues of * matrix. Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. - * @return NVGRAPH error flag. + * @return error flag. 
*/ template -NVGRAPH_ERROR computeLargestEigenvectors(const Matrix *A, - IndexType_ nEigVecs, - IndexType_ maxIter, - IndexType_ restartIter, - ValueType_ tol, - bool reorthogonalize, - IndexType_ *effIter, - IndexType_ *totalIter, - ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev, - ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev) +int computeLargestEigenvectors(sparse_matrix_t const* A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ *effIter, + IndexType_ *totalIter, + ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, + ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev, + ValueType_ *__restrict__ eigVals_dev, + ValueType_ *__restrict__ eigVecs_dev, + unsigned long long seed) { // ------------------------------------------------------- // Variable declaration @@ -1095,7 +1049,7 @@ NVGRAPH_ERROR computeLargestEigenvectors(const Matrix *A const ValueType_ zero = 0; // Matrix dimension - IndexType_ n = A->n; + IndexType_ n = A->nrows; // Lanczos iteration counters IndexType_ maxIter_curr = restartIter; // Maximum size of Lanczos system @@ -1118,34 +1072,11 @@ NVGRAPH_ERROR computeLargestEigenvectors(const Matrix *A // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - if (A->m != A->n) { - WARNING("invalid parameter (matrix is not square)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs < 1) { - WARNING("invalid parameter (nEigVecs<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (restartIter < 1) { - WARNING("invalid parameter (restartIter<4)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (tol < 0) { - WARNING("invalid parameter (tol<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs > n) { - WARNING("invalid parameters (nEigVecs>n)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (maxIter < nEigVecs) { - WARNING("invalid parameters (maxIter 0 && nEigVecs<=n, "Invalid number of eigenvectors."); + RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECT(tol > 0, "Invalid tolerance."); + RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); // ------------------------------------------------------- // Variable initialization @@ -1171,10 +1102,10 @@ NVGRAPH_ERROR computeLargestEigenvectors(const Matrix *A // Random number generator curandGenerator_t randGen; // Initialize random number generator - CHECK_CURAND(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); - CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, 123456)); + CUDA_TRY(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); + CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, seed)); // Initialize initial Lanczos vector - CHECK_CURAND(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); + CUDA_TRY(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); ValueType_ normQ1 = Cublas::nrm2(n, lanczosVecs_dev, 1); Cublas::scal(n, 1 / normQ1, lanczosVecs_dev, 1); @@ -1296,13 +1227,13 @@ NVGRAPH_ERROR computeLargestEigenvectors(const Matrix *A // Copy results to device memory // skip smallest eigenvalue if needed - CHECK_CUDA(cudaMemcpy(eigVals_dev, + CUDA_TRY(cudaMemcpy(eigVals_dev, work_host + 2 * 
(*effIter) + top_eigenparis_idx_offset, nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); // skip smallest eigenvector if needed - CHECK_CUDA(cudaMemcpy(work_dev, + CUDA_TRY(cudaMemcpy(work_dev, Z_host + (top_eigenparis_idx_offset * (*effIter)), (*effIter) * nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); @@ -1323,8 +1254,8 @@ NVGRAPH_ERROR computeLargestEigenvectors(const Matrix *A n); // Clean up and exit - CHECK_CURAND(curandDestroyGenerator(randGen)); - return NVGRAPH_OK; + CUDA_TRY(curandDestroyGenerator(randGen)); + return 0; } /// Compute largest eigenvectors of symmetric matrix @@ -1361,55 +1292,30 @@ NVGRAPH_ERROR computeLargestEigenvectors(const Matrix *A * Eigenvectors corresponding to largest eigenvalues of * matrix. Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. - * @return NVGRAPH error flag. + * @return error flag. */ template -NVGRAPH_ERROR computeLargestEigenvectors(const Matrix &A, - IndexType_ nEigVecs, - IndexType_ maxIter, - IndexType_ restartIter, - ValueType_ tol, - bool reorthogonalize, - IndexType_ &iter, - ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev) +int computeLargestEigenvectors(sparse_matrix_t const& A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ &iter, + ValueType_ *__restrict__ eigVals_dev, + ValueType_ *__restrict__ eigVecs_dev, + unsigned long long seed = 123456, + cudaStream_t stream = 0) { - // CUDA stream - // TODO: handle non-zero streams - cudaStream_t stream = 0; - // Matrix dimension - IndexType_ n = A.n; + IndexType_ n = A.nrows; // Check that parameters are valid - if (A.m != A.n) { - WARNING("invalid parameter (matrix is not square)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs < 1) { - WARNING("invalid parameter (nEigVecs<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (restartIter < 1) { - WARNING("invalid parameter (restartIter<4)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (tol < 0) { - WARNING("invalid parameter (tol<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs > n) { - WARNING("invalid parameters (nEigVecs>n)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (maxIter < nEigVecs) { - WARNING("invalid parameters (maxIter 0 && nEigVecs<=n, "Invalid number of eigenvectors."); + RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECT(tol > 0, "Invalid tolerance."); + RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); // Allocate memory std::vector alpha_host_v(restartIter); @@ -1418,70 +1324,31 @@ NVGRAPH_ERROR computeLargestEigenvectors(const Matrix &A ValueType_ *alpha_host = alpha_host_v.data(); ValueType_ *beta_host = beta_host_v.data(); - Vector lanczosVecs_dev(n * (restartIter + 1), stream); - Vector work_dev((n + restartIter) * restartIter, stream); + //TODO: replace and fix allocation via RAFT handle + AllocatableVector lanczosVecs_dev(n * (restartIter + 1), stream); + AllocatableVector work_dev((n + restartIter) * restartIter, stream); // Perform Lanczos method IndexType_ effIter; - NVGRAPH_ERROR status = computeLargestEigenvectors(&A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - &effIter, - &iter, - alpha_host, - beta_host, - lanczosVecs_dev.raw(), - work_dev.raw(), - eigVals_dev, - eigVecs_dev); + int status = computeLargestEigenvectors(&A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + 
alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; } -// ========================================================= -// Explicit instantiation -// ========================================================= -template NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix &A, - int nEigVecs, - int maxIter, - int restartIter, - float tol, - bool reorthogonalize, - int &iter, - float *__restrict__ eigVals_dev, - float *__restrict__ eigVecs_dev); -template NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix &A, - int nEigVecs, - int maxIter, - int restartIter, - double tol, - bool reorthogonalize, - int &iter, - double *__restrict__ eigVals_dev, - double *__restrict__ eigVecs_dev); - -template NVGRAPH_ERROR computeLargestEigenvectors(const Matrix &A, - int nEigVecs, - int maxIter, - int restartIter, - float tol, - bool reorthogonalize, - int &iter, - float *__restrict__ eigVals_dev, - float *__restrict__ eigVecs_dev); -template NVGRAPH_ERROR computeLargestEigenvectors(const Matrix &A, - int nEigVecs, - int maxIter, - int restartIter, - double tol, - bool reorthogonalize, - int &iter, - double *__restrict__ eigVals_dev, - double *__restrict__ eigVecs_dev); - -} // namespace nvgraph +} // namespace raft diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index be6b58a8cf..9dc75fdd77 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -22,6 +22,20 @@ namespace raft{ namespace matrix { using size_type = int; // for now; TODO: move it in appropriate header + +// Vector "view"-like aggregate for linear algebra purposes +// +template +struct vector_t { + value_type* buffer_; + size_type size_; + + vector_t(value_type* buffer, size_type sz): + buffer_(buffer), + size_(sz) + { + } +}; template struct sparse_matrix_t { @@ -38,7 +52,7 @@ struct sparse_matrix_t { { } - sparse_matrix_t(const GraphCSRView& csr_view): + sparse_matrix_t(GraphCSRView const& csr_view): row_offsets_(csr_view.offsets_), col_indices_(csr_view.indices_), values_(csr_view.edge_data_), @@ -52,18 +66,15 @@ struct sparse_matrix_t { // y = alpha*A*x + beta*y // - template - void mv(value_type alpha, + virtual void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, - value_type* __restrict__ y, - exe_policy_t&& policy, - cudaStream_t stream = nullptr) const + value_type* __restrict__ y) const { //TODO: call cusparse::csrmv } - //private: // maybe not, keep this ASAP ("as simple as possible"); hence, aggregate + //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate index_type const* row_offsets_; index_type const* col_indices_; @@ -72,5 +83,30 @@ struct sparse_matrix_t { index_type const nnz_; }; +template +struct laplacian_matrix_t : sparse_matrix_t { + laplacian_matrix_t(index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + vector_t&& diagonal, + index_type const nnz, + index_type const nrows) : + sparse_matrix_t(row_offsets,col_indices,values,nrows,nnz), + diagonal_(diagonal) + { + } + + laplacian_matrix_t(GraphCSRView const& csr_view, vector_t&& diagonal): + sparse_matrix_t(csr_view), + diagonal_(diagonal) + { + } + + vector_t diagonal_; +}; + +template +using modularity_matrix_t = laplacian_matrix_t; // for now; TODO: if it turns out modularity matrix actually behaves differently than Laplacian matrix, this should be 
made a separate class; + } // namespace matrix } // namespace raft From 9d71cfc76dd9c09adfa1516578d721164f95dbc9 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 4 Jun 2020 17:42:21 -0500 Subject: [PATCH 069/189] LAPACK dependencies. --- cpp/include/raft/spectral/lapack.hpp | 57 ++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 cpp/include/raft/spectral/lapack.hpp diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp new file mode 100644 index 0000000000..430a7d5144 --- /dev/null +++ b/cpp/include/raft/spectral/lapack.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { + +template +class Lapack +{ +private: + Lapack(); + ~Lapack(); +public: + static void check_lapack_enabled(); + + static void gemm(bool transa, bool transb, int m, int n, int k, T alpha, const T * A, int lda, const T * B, int ldb, T beta, T * C, int ldc); + + // special QR for lanczos + static void sterf(int n, T * d, T * e); + static void steqr(char compz, int n, T * d, T * e, T * z, int ldz, T * work); + + // QR + // computes the QR factorization of a general matrix + static void geqrf (int m, int n, T *a, int lda, T *tau, T *work, int *lwork); + // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. + //static void orgqr( int m, int n, int k, T* a, int lda, const T* tau, T* work, int* lwork ); + // multiply C by implicit Q + static void ormqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); + //static void unmqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); + //static void qrf (int n, T *H, T *Q, T *R); + + //static void hseqr (T* Q, T* R, T* eigenvalues,T* eigenvectors, int dim, int ldh, int ldq); + static void geev(T* A, T* eigenvalues, int dim, int lda); + static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr); + static void geev(T* A, T* eigenvalues_r, T* eigenvalues_i, T* eigenvectors_r, T* eigenvectors_i, int dim, int lda, int ldvr); + +}; + + + +} // namespace raft From 22c9f49ddba2bb2307ee7919cb5a607bc4288037 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 5 Jun 2020 16:40:57 -0500 Subject: [PATCH 070/189] Added allocation functionality via raft handle. --- cpp/include/raft/spectral/matrix_wrappers.hpp | 126 +++++++++++++++--- 1 file changed, 111 insertions(+), 15 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 9dc75fdd77..5949895178 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -16,7 +16,9 @@ #pragma once #include -#include +#include // ? 
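+// (the allocator-backed vector_t defined below obtains its device memory
+//  through the raft handle's device allocator, which is why the handle
+//  machinery has to be visible in this header)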
+#include + namespace raft{ namespace matrix { @@ -26,15 +28,67 @@ using size_type = int; // for now; TODO: move it in appropriate header // Vector "view"-like aggregate for linear algebra purposes // template -struct vector_t { +struct vector_view_t { value_type* buffer_; size_type size_; - vector_t(value_type* buffer, size_type sz): + vector_view_t(value_type* buffer, size_type sz): buffer_(buffer), size_(sz) { } + + vector_view_t(vector_view_t&& other): + buffer_(other.buffer_), + size_(other.size_) + { + other.buffer_ = nullptr; + other.size_ = 0; + } + + vector_view_t& operator = (vector_view_t&& other) + { + buffer_ = other.buffer_; + size_ = other.size_; + + other.buffer_ = nullptr; + other.size_ = 0; + } +}; + +// allocatable vector, using raft handle allocator +// +template +class vector_t { + handle_t const& handle_; + value_type* buffer_; + size_type size_; + cudaStream_t stream_; +public: + + vector_t(handle_t const& raft_handle, size_type sz, cudaStream_t stream = 0): + handle_(raft_handle), + buffer_(static_cast(raft_handle.get_device_allocator()->allocate(sz*sizeof(value_type), stream))), + size_(sz), + stream_(stream) + { + } + + virtual ~vector_t(void) + { + handle_.get_device_allocator()->deallocate(buffer_, size_, stream_); + } + + size_type size(void) const + { + return size_; + } + +protected: + value_type* buffer(void) + { + return buffer_; + } }; template @@ -84,29 +138,71 @@ struct sparse_matrix_t { }; template -struct laplacian_matrix_t : sparse_matrix_t { - laplacian_matrix_t(index_type const* row_offsets, +struct laplacian_matrix_t : sparse_matrix_t, vector_t { + laplacian_matrix_t(handle_t const& raft_handle, + index_type const* row_offsets, index_type const* col_indices, value_type const* values, - vector_t&& diagonal, + index_type const nrows, index_type const nnz, - index_type const nrows) : - sparse_matrix_t(row_offsets,col_indices,values,nrows,nnz), - diagonal_(diagonal) + cudaStream_t stream = 0) : + sparse_matrix_t(row_offsets,col_indices,values,nrows,nnz), + vector_t(raft_handle, nrows, stream) { + auto* v = vector_t::buffer(); } - laplacian_matrix_t(GraphCSRView const& csr_view, vector_t&& diagonal): - sparse_matrix_t(csr_view), - diagonal_(diagonal) + laplacian_matrix_t(handle_t const& raft_handle, + GraphCSRView const& csr_view, + cudaStream_t stream = 0): + sparse_matrix_t(csr_view), + vector_t(raft_handle, csr_view.number_of_vertices_, stream) { } - - vector_t diagonal_; + + // y = alpha*A*x + beta*y + // + void mv(value_type alpha, + value_type const* __restrict__ x, + value_type beta, + value_type* __restrict__ y) const override + { + //TODO: call cusparse::csrmv + } }; template -using modularity_matrix_t = laplacian_matrix_t; // for now; TODO: if it turns out modularity matrix actually behaves differently than Laplacian matrix, this should be made a separate class; +struct modularity_matrix_t: laplacian_matrix_t +{ + modularity_matrix_t(handle_t const& raft_handle, + index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const nnz, + cudaStream_t stream = 0) : + laplacian_matrix_t(raft_handle, row_offsets, col_indices, values, nrows, nnz, stream) + { + auto* v = vector_t::buffer(); + } + + modularity_matrix_t(handle_t const& raft_handle, + GraphCSRView const& csr_view, + cudaStream_t stream = 0): + laplacian_matrix_t(raft_handle, csr_view, stream) + { + } + + // y = alpha*A*x + beta*y + // + void mv(value_type alpha, + value_type const* __restrict__ x, + value_type beta, 
+ value_type* __restrict__ y) const override + { + //TODO: call cusparse::csrmv + } +}; } // namespace matrix } // namespace raft From 0f3617b976601f4061644a93753f1bbbef21f3c9 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 5 Jun 2020 17:24:10 -0500 Subject: [PATCH 071/189] Fixed allocator dependencies in Lanczos. --- cpp/include/raft/spectral/lanczos.hpp | 15 +++++++++------ cpp/include/raft/spectral/matrix_wrappers.hpp | 19 +++++++++++-------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 2cc9f002d1..f83652c157 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -27,6 +27,7 @@ #include #include +#include // ========================================================= // Useful macros @@ -921,7 +922,8 @@ int computeSmallestEigenvectors(sparse_matrix_t const* A * @return error flag. */ template -int computeSmallestEigenvectors(sparse_matrix_t const& A, +int computeSmallestEigenvectors(handle_t handle, + sparse_matrix_t const& A, IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, @@ -951,8 +953,8 @@ int computeSmallestEigenvectors(sparse_matrix_t const& A ValueType_ *beta_host = beta_host_v.data(); //TODO: replace and fix allocation via RAFT handle - AllocatableVector lanczosVecs_dev(n * (restartIter + 1), stream); - AllocatableVector work_dev((n + restartIter) * restartIter, stream); + vector_t lanczosVecs_dev(handle, n * (restartIter + 1), stream); + vector_t work_dev(handle, (n + restartIter) * restartIter, stream); // Perform Lanczos method IndexType_ effIter; @@ -1295,7 +1297,8 @@ int computeLargestEigenvectors(sparse_matrix_t const* A, * @return error flag. */ template -int computeLargestEigenvectors(sparse_matrix_t const& A, +int computeLargestEigenvectors(handle_t handle, + sparse_matrix_t const& A, IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, @@ -1325,8 +1328,8 @@ int computeLargestEigenvectors(sparse_matrix_t const& A, ValueType_ *beta_host = beta_host_v.data(); //TODO: replace and fix allocation via RAFT handle - AllocatableVector lanczosVecs_dev(n * (restartIter + 1), stream); - AllocatableVector work_dev((n + restartIter) * restartIter, stream); + vector_t lanczosVecs_dev(handle, n * (restartIter + 1), stream); + vector_t work_dev(handle, (n + restartIter) * restartIter, stream); // Perform Lanczos method IndexType_ effIter; diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 5949895178..f3fb509e12 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -74,7 +74,7 @@ class vector_t { { } - virtual ~vector_t(void) + ~vector_t(void) { handle_.get_device_allocator()->deallocate(buffer_, size_, stream_); } @@ -84,8 +84,7 @@ class vector_t { return size_; } -protected: - value_type* buffer(void) + value_type* raw(void) { return buffer_; } @@ -138,7 +137,7 @@ struct sparse_matrix_t { }; template -struct laplacian_matrix_t : sparse_matrix_t, vector_t { +struct laplacian_matrix_t : sparse_matrix_t { laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, index_type const* col_indices, @@ -147,16 +146,17 @@ struct laplacian_matrix_t : sparse_matrix_t, vector_t(row_offsets,col_indices,values,nrows,nnz), - vector_t(raft_handle, nrows, stream) + diagonal_(raft_handle, nrows, stream) { - auto* v = vector_t::buffer(); + auto* v = diagonal_.raw(); + //TODO: more 
work, here... } laplacian_matrix_t(handle_t const& raft_handle, GraphCSRView const& csr_view, cudaStream_t stream = 0): sparse_matrix_t(csr_view), - vector_t(raft_handle, csr_view.number_of_vertices_, stream) + diagonal_(raft_handle, csr_view.number_of_vertices_, stream) { } @@ -169,6 +169,8 @@ struct laplacian_matrix_t : sparse_matrix_t, vector_t diagonal_; }; template @@ -183,7 +185,8 @@ struct modularity_matrix_t: laplacian_matrix_t cudaStream_t stream = 0) : laplacian_matrix_t(raft_handle, row_offsets, col_indices, values, nrows, nnz, stream) { - auto* v = vector_t::buffer(); + auto* v = laplacian_matrix_t::diagonal_.raw(); + //TODO: more work, here... } modularity_matrix_t(handle_t const& raft_handle, From ee3102ba4516ceb4cc42531323800b849713d125 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 5 Jun 2020 20:58:58 -0500 Subject: [PATCH 072/189] Lapack dependencies. --- cpp/include/raft/spectral/error_temp.hpp | 2 + cpp/include/raft/spectral/lapack.hpp | 442 +++++++++++++++++++++++ 2 files changed, 444 insertions(+) diff --git a/cpp/include/raft/spectral/error_temp.hpp b/cpp/include/raft/spectral/error_temp.hpp index f8dabf994b..82beb75640 100644 --- a/cpp/include/raft/spectral/error_temp.hpp +++ b/cpp/include/raft/spectral/error_temp.hpp @@ -8,6 +8,8 @@ #define RAFT_TRY(error_expression) +//assume RAFT_FAIL() can take a std::string `reason` +// #define RAFT_FAIL(reason) #define CUDA_TRY(call) diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index 430a7d5144..d86343990d 100644 --- a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -17,9 +17,87 @@ #pragma once #include +#include +#include + +//for now; TODO: check if/where this `define` should be; +// +#define USE_LAPACK namespace raft { +#define lapackCheckError(status) \ + { \ + if (status < 0) { \ + std::stringstream ss; \ + ss << "Lapack error: argument number " << -status << " had an illegal value."; \ + RAFT_FAIL(ss.str()); \ + } else if (status > 0) \ + RAFT_FAIL("Lapack error: internal error."); \ + } + + +extern "C" void sgeqrf_( + int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info); +extern "C" void dgeqrf_( + int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info); +extern "C" void sormqr_(char *side, + char *trans, + int *m, + int *n, + int *k, + float *a, + int *lda, + const float *tau, + float *c, + int *ldc, + float *work, + int *lwork, + int *info); +extern "C" void dormqr_(char *side, + char *trans, + int *m, + int *n, + int *k, + double *a, + int *lda, + const double *tau, + double *c, + int *ldc, + double *work, + int *lwork, + int *info); +extern "C" int dgeev_(char *jobvl, + char *jobvr, + int *n, + double *a, + int *lda, + double *wr, + double *wi, + double *vl, + int *ldvl, + double *vr, + int *ldvr, + double *work, + int *lwork, + int *info); + +extern "C" int sgeev_(char *jobvl, + char *jobvr, + int *n, + float *a, + int *lda, + float *wr, + float *wi, + float *vl, + int *ldvl, + float *vr, + int *ldvr, + float *work, + int *lwork, + int *info); + + template class Lapack { @@ -50,8 +128,372 @@ class Lapack static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr); static void geev(T* A, T* eigenvalues_r, T* eigenvalues_i, T* eigenvectors_r, T* eigenvectors_i, int dim, int lda, int ldvr); +private: + static void lapack_gemm(const char transa, + const char transb, + int m, + int n, + int k, + float alpha, + const float *a, + int lda, + const 
float *b, + int ldb, + float beta, + float *c, + int ldc) + { + cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnSgemmHost( + cublas_transa, cublas_transb, m, n, k, &alpha, (float *)a, lda, (float *)b, ldb, &beta, c, ldc); + } + + static void lapack_gemm(const signed char transa, + const signed char transb, + int m, + int n, + int k, + double alpha, + const double *a, + int lda, + const double *b, + int ldb, + double beta, + double *c, + int ldc) + { + cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnDgemmHost(cublas_transa, + cublas_transb, + m, + n, + k, + &alpha, + (double *)a, + lda, + (double *)b, + ldb, + &beta, + c, + ldc); + } + + + static void lapack_sterf(int n, float *d, float *e, int *info) + { + cusolverDnSsterfHost(n, d, e, info); + } + + static void lapack_sterf(int n, double *d, double *e, int *info) + { + cusolverDnDsterfHost(n, d, e, info); + } + + static void void lapack_steqr(const signed char compz, int n, float *d, float *e, float *z, int ldz, float *work, int *info) + { + cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info); + } + + static void lapack_steqr(const signed char compz, int n, double *d, double *e, double *z, int ldz, double *work, int *info) + { + cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info); + } + + static void lapack_geqrf(int m, int n, float *a, int lda, float *tau, float *work, int *lwork, int *info) + { + sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); + } + + static void lapack_geqrf(int m, int n, double *a, int lda, double *tau, double *work, int *lwork, int *info) + { + dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); + } + + static void lapack_ormqr(char side, + char trans, + int m, + int n, + int k, + float *a, + int lda, + float *tau, + float *c, + int ldc, + float *work, + int *lwork, + int *info) + { + sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); + } + + static void lapack_ormqr(char side, + char trans, + int m, + int n, + int k, + double *a, + int lda, + double *tau, + double *c, + int ldc, + double *work, + int *lwork, + int *info) + { + dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); + } + + static int lapack_geev_dispatch(char *jobvl, + char *jobvr, + int *n, + double *a, + int *lda, + double *wr, + double *wi, + double *vl, + int *ldvl, + double *vr, + int *ldvr, + double *work, + int *lwork, + int *info) + { + return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); + } + + static int lapack_geev_dispatch(char *jobvl, + char *jobvr, + int *n, + float *a, + int *lda, + float *wr, + float *wi, + float *vl, + int *ldvl, + float *vr, + int *ldvr, + float *work, + int *lwork, + int *info) + { + return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); + } + + // real eigenvalues + static + void lapack_geev(T *A, T *eigenvalues, int dim, int lda) + { + char job = 'N'; + std::vector WI(dim); + int ldv = 1; + T *vl = 0; + int work_size = 6 * dim; + std::vector work(work_size); + int info; + lapack_geev_dispatch(&job, + &job, + &dim, + A, + &lda, + eigenvalues, + WI.data(), + vl, + &ldv, + vl, + &ldv, + work.data(), + &work_size, + &info); + lapackCheckError(info); + } + + // real eigenpairs + static + void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int 
dim, int lda, int ldvr) + { + char jobvl = 'N'; + char jobvr = 'V'; + std::vector WI(dim); + int work_size = 6 * dim; + T *vl = 0; + int ldvl = 1; + std::vector work(work_size); + int info; + lapack_geev_dispatch(&jobvl, + &jobvr, + &dim, + A, + &lda, + eigenvalues, + WI.data(), + vl, + &ldvl, + eigenvectors, + &ldvr, + work.data(), + &work_size, + &info); + lapackCheckError(info); + } + + // complex eigenpairs + static + void lapack_geev(T *A, + T *eigenvalues_r, + T *eigenvalues_i, + T *eigenvectors_r, + T *eigenvectors_i, + int dim, + int lda, + int ldvr) + { + char jobvl = 'N'; + char jobvr = 'V'; + int work_size = 8 * dim; + int ldvl = 1; + std::vector work(work_size); + int info; + lapack_geev_dispatch(&jobvl, + &jobvr, + &dim, + A, + &lda, + eigenvalues_r, + eigenvalues_i, + 0, + &ldvl, + eigenvectors_r, + &ldvr, + work.data(), + &work_size, + &info); + lapackCheckError(info); + } + }; +template +void Lapack::check_lapack_enabled() +{ +#ifndef USE_LAPACK + RAFT_FAIL("Error: LAPACK not enabled."); +#endif +} + +template +void Lapack::gemm(bool transa, + bool transb, + int m, + int n, + int k, + T alpha, + const T *A, + int lda, + const T *B, + int ldb, + T beta, + T *C, + int ldc) +{ + // check_lapack_enabled(); + //#ifdef NVGRAPH_USE_LAPACK + const char transA_char = transa ? 'T' : 'N'; + const char transB_char = transb ? 'T' : 'N'; + lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + //#endif +} + +template +void Lapack::sterf(int n, T *d, T *e) +{ + // check_lapack_enabled(); + //#ifdef NVGRAPH_USE_LAPACK + int info; + lapack_sterf(n, d, e, &info); + lapackCheckError(info); + //#endif +} + +template +void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) +{ + // check_lapack_enabled(); + //#ifdef NVGRAPH_USE_LAPACK + int info; + lapack_steqr(compz, n, d, e, z, ldz, work, &info); + lapackCheckError(info); + //#endif +} +template +void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork) +{ + check_lapack_enabled(); +#ifdef USE_LAPACK + int info; + lapack_geqrf(m, n, a, lda, tau, work, lwork, &info); + lapackCheckError(info); +#endif +} +template +void Lapack::ormqr(bool right_side, + bool transq, + int m, + int n, + int k, + T *a, + int lda, + T *tau, + T *c, + int ldc, + T *work, + int *lwork) +{ + check_lapack_enabled(); +#ifdef USE_LAPACK + char side = right_side ? 'R' : 'L'; + char trans = transq ? 
'T' : 'N'; + int info; + lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); + lapackCheckError(info); +#endif +} + +// real eigenvalues +template +void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) +{ + check_lapack_enabled(); +#ifdef USE_LAPACK + lapack_geev(A, eigenvalues, dim, lda); +#endif +} +// real eigenpairs +template +void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, int ldvr) +{ + check_lapack_enabled(); +#ifdef USE_LAPACK + lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr); +#endif +} +// complex eigenpairs +template +void Lapack::geev(T *A, + T *eigenvalues_r, + T *eigenvalues_i, + T *eigenvectors_r, + T *eigenvectors_i, + int dim, + int lda, + int ldvr) +{ + check_lapack_enabled(); +#ifdef USE_LAPACK + lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, dim, lda, ldvr); +#endif +} } // namespace raft From e00501acb44c0cf2561148e4640cde579c71c832 Mon Sep 17 00:00:00 2001 From: Ishika Roy Date: Mon, 8 Jun 2020 11:16:49 -0500 Subject: [PATCH 073/189] add inline --- CHANGELOG.md | 2 +- cpp/include/raft/comms/comms.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 56f676702c..47c1ee0023 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ - PR #13: Add RMM_INCLUDE and RMM_LIBRARY options to allow linking to non-conda RMM ## Bug Fixes - +- PR #17: Make destructor inline to avoid redeclaration error # RAFT 0.14.0 (Date TBD) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 367b04f240..1c6e88a8e4 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -323,7 +323,7 @@ class comms_t { std::unique_ptr impl_; }; -comms_iface::~comms_iface() {} +inline comms_iface::~comms_iface() {} } // namespace comms } // namespace raft From 77f78f67f992daabeea540791ebe84367f88e343 Mon Sep 17 00:00:00 2001 From: Ishika Roy Date: Mon, 8 Jun 2020 13:35:30 -0500 Subject: [PATCH 074/189] remove print --- cpp/include/raft/comms/std_comms.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 1ba7552f9c..77bf21adda 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -96,7 +96,6 @@ static ncclDataType_t get_nccl_datatype(const datatype_t datatype) { case datatype_t::UINT8: return ncclUint8; case datatype_t::INT32: - std::cout << "Returning int32" << std::endl; return ncclInt; case datatype_t::UINT32: return ncclUint32; From 3ee96546473dd8cbdc7306c6849d5c3c0772dd2d Mon Sep 17 00:00:00 2001 From: Ishika Roy Date: Tue, 9 Jun 2020 12:13:44 -0500 Subject: [PATCH 075/189] remove destructor --- cpp/include/raft/comms/comms.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 1c6e88a8e4..91ee565f89 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -89,7 +89,6 @@ constexpr datatype_t get_type() { class comms_iface { public: - virtual ~comms_iface(); virtual int get_size() const = 0; virtual int get_rank() const = 0; @@ -323,7 +322,5 @@ class comms_t { std::unique_ptr impl_; }; -inline comms_iface::~comms_iface() {} - } // namespace comms } // namespace raft From aa81662860eb37a85ee39f337b5f17c23e79d8c9 Mon Sep 17 00:00:00 2001 From: Ishika Roy Date: Tue, 9 Jun 2020 13:10:21 -0500 Subject: [PATCH 076/189] clang formatting --- cpp/include/raft/comms/comms.hpp | 1 - 
1 file changed, 1 deletion(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 91ee565f89..2770341097 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -89,7 +89,6 @@ constexpr datatype_t get_type() { class comms_iface { public: - virtual int get_size() const = 0; virtual int get_rank() const = 0; From b0c12aaf430b47739566de164d8a0746824c47aa Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 9 Jun 2020 14:50:33 -0500 Subject: [PATCH 077/189] Added missing cusparse API. --- cpp/include/raft/sparse/cusparse_wrappers.h | 168 +++++++++++++++++++- 1 file changed, 166 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 1c63d2348b..1853b82b07 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -146,21 +146,185 @@ inline void cusparsecoosortByRow( // NOLINT * @defgroup Gemmi cusparse gemmi operations * @{ */ +template +cusparseStatus_t cusparsegemmi( // NOLINT + cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha, const T* A, int lda, const T* cscValB, const int* cscColPtrB, const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream); +template <> inline cusparseStatus_t cusparsegemmi( cusparseHandle_t handle, int m, int n, int k, int nnz, const float* alpha, const float* A, int lda, const float* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const float* beta, float* C, int ldc) { + const int* cscRowIndB, const float* beta, float* C, int ldc, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); } +template<> inline cusparseStatus_t cusparsegemmi( cusparseHandle_t handle, int m, int n, int k, int nnz, const double* alpha, const double* A, int lda, const double* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const double* beta, double* C, int ldc) { + const int* cscRowIndB, const double* beta, double* C, int ldc, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); } /** @} */ +/** + * @defgroup Csrmv cusparse csrmv operations + * @{ + */ +template +cusparseStatus_t cusparsecsrmv( // NOLINT + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const T* alpha, + const cusparseMatDescr_t descr, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const T* x, + const T* beta, + T* y, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const float* alpha, + const cusparseMatDescr_t descr, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const float* x, + const float* beta, + float* y, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseScsrmv( + handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); +} +template <> +inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const double* alpha, + const cusparseMatDescr_t descr, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const double* x, + const double* beta, + double* y, + 
cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseDcsrmv( + handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); +} +/** @} */ + +/** + * @defgroup Csrmm cusparse csrmm operations + * @{ + */ +template +cusparseStatus_t cusparsecsrmm( // NOLINT + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const T* alpha, + const cusparseMatDescr_t descr, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const T* x, + const int ldx, + const T* beta, + T* y, + const int ldy, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const float* alpha, + const cusparseMatDescr_t descr, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const float* x, + const int ldx, + const float* beta, + float* y, + const int ldy, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseScsrmm( + handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); +} +template <> +inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const double* alpha, + const cusparseMatDescr_t descr, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const double* x, + const int ldx, + const double* beta, + double* y, + const int ldy, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseDcsrmm( + handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); +} +/** @} */ + +/** + * @defgroup csr2coo cusparse CSR to COO converter methods + * @{ + */ +template +void cusparsecsr2coo( // NOLINT + cusparseHandle_t handle, + const int n, + const int nnz, + const T* csrRowPtr, + T* cooRowInd, + cudaStream_t stream); +template <> +inline void cusparsecsr2coo(cusparseHandle_t handle, + const int n, + const int nnz, + const int* csrRowPtr, + int* cooRowInd, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, CUSPARSE_INDEX_BASE_ZERO)); +} +/** @} */ }; // namespace sparse }; // namespace raft From 001eec838d2ff5adbc49366e168fa765b07d5661 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 9 Jun 2020 15:30:56 -0500 Subject: [PATCH 078/189] Added cusparsesetpointermode. --- cpp/include/raft/sparse/cusparse_wrappers.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 1853b82b07..25e7146316 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -325,6 +325,27 @@ inline void cusparsecsr2coo(cusparseHandle_t handle, CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, CUSPARSE_INDEX_BASE_ZERO)); } /** @} */ + +/** + * @defgroup setpointermode cusparse set pointer mode method + * @{ + */ +// no T dependency... 
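+// (pointer mode is a property of the cusparse handle itself and does not
+//  depend on any value type, so no template parameter is needed; the
+//  templated declaration below is kept only as a commented-out sketch)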
+// template +// cusparseStatus_t cusparsesetpointermode( // NOLINT +// cusparseHandle_t handle, +// cusparsePointerMode_t mode, +// cudaStream_t stream); + +// template<> +inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, + cusparsePointerMode_t mode, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSetPointerMode(handle, mode); +} +/** @} */ }; // namespace sparse }; // namespace raft From 48e9d09fe418862b4e75e0fd5aa5868255771e7f Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 9 Jun 2020 16:04:16 -0500 Subject: [PATCH 079/189] Added setpointer mode to cusparse, cublas and clang-formatted. --- cpp/include/raft/linalg/cublas_wrappers.h | 20 ++ cpp/include/raft/sparse/cusparse_wrappers.h | 203 +++++++------------- 2 files changed, 88 insertions(+), 135 deletions(-) diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index cd8a508a84..5b09a792ef 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -542,5 +542,25 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x, } /** @} */ +/** + * @defgroup setpointermode cublas set pointer mode method + * @{ + */ +// no T dependency... +// template +// cublasStatus_t cublassetpointermode( // NOLINT +// cublasHandle_t handle, +// cublasPointerMode_t mode, +// cudaStream_t stream); + +// template<> +inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, + cublasPointerMode_t mode, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSetPointerMode(handle, mode); +} +/** @} */ + }; // namespace linalg }; // namespace raft diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 25e7146316..865f93843d 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -148,21 +148,29 @@ inline void cusparsecoosortByRow( // NOLINT */ template cusparseStatus_t cusparsegemmi( // NOLINT - cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha, const T* A, int lda, const T* cscValB, const int* cscColPtrB, const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream); + cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha, + const T* A, int lda, const T* cscValB, const int* cscColPtrB, + const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsegemmi( - cusparseHandle_t handle, int m, int n, int k, int nnz, const float* alpha, - const float* A, int lda, const float* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const float* beta, float* C, int ldc, cudaStream_t stream) { +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, + int k, int nnz, const float* alpha, + const float* A, int lda, + const float* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, const float* beta, + float* C, int ldc, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); } -template<> -inline cusparseStatus_t cusparsegemmi( - cusparseHandle_t handle, int m, int n, int k, int nnz, const double* alpha, - const double* A, int lda, const double* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const double* beta, double* C, int ldc, cudaStream_t stream) { +template <> 
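+// double-precision counterpart: bind the user stream to the handle first,
+// then forward to cusparseDgemmi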
+inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, + int k, int nnz, const double* alpha, + const double* A, int lda, + const double* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, const double* beta, + double* C, int ldc, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); @@ -174,60 +182,30 @@ inline cusparseStatus_t cusparsegemmi( */ template cusparseStatus_t cusparsecsrmv( // NOLINT - cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int nnz, - const T* alpha, - const cusparseMatDescr_t descr, - const T* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const T* x, - const T* beta, - T* y, - cudaStream_t stream); + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, + const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, + const int* csrRowPtr, const int* csrColInd, const T* x, const T* beta, T* y, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int nnz, - const float* alpha, - const cusparseMatDescr_t descr, - const float* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const float* x, - const float* beta, - float* y, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsrmv( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, + const float* alpha, const cusparseMatDescr_t descr, const float* csrVal, + const int* csrRowPtr, const int* csrColInd, const float* x, const float* beta, + float* y, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsrmv( - handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); + return cusparseScsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, beta, y); } template <> -inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int nnz, - const double* alpha, - const cusparseMatDescr_t descr, - const double* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const double* x, - const double* beta, - double* y, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsrmv( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, + const double* alpha, const cusparseMatDescr_t descr, const double* csrVal, + const int* csrRowPtr, const int* csrColInd, const double* x, + const double* beta, double* y, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsrmv( - handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); -} + return cusparseDcsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, beta, y); +} /** @} */ /** @@ -236,68 +214,31 @@ inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, */ template cusparseStatus_t cusparsecsrmm( // NOLINT - cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int k, - int nnz, - const T* alpha, - const cusparseMatDescr_t descr, - const T* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const T* x, - const int ldx, - const T* beta, - T* y, - const int ldy, - cudaStream_t stream); + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, + int nnz, const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, + const int* 
csrRowPtr, const int* csrColInd, const T* x, const int ldx, + const T* beta, T* y, const int ldy, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int k, - int nnz, - const float* alpha, - const cusparseMatDescr_t descr, - const float* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const float* x, - const int ldx, - const float* beta, - float* y, - const int ldy, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsrmm( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, + int nnz, const float* alpha, const cusparseMatDescr_t descr, + const float* csrVal, const int* csrRowPtr, const int* csrColInd, + const float* x, const int ldx, const float* beta, float* y, const int ldy, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsrmm( - handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + return cusparseScsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } template <> -inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int k, - int nnz, - const double* alpha, - const cusparseMatDescr_t descr, - const double* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const double* x, - const int ldx, - const double* beta, - double* y, - const int ldy, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsrmm( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, + int nnz, const double* alpha, const cusparseMatDescr_t descr, + const double* csrVal, const int* csrRowPtr, const int* csrColInd, + const double* x, const int ldx, const double* beta, double* y, const int ldy, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsrmm( - handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + return cusparseDcsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } /** @} */ @@ -307,25 +248,18 @@ inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, */ template void cusparsecsr2coo( // NOLINT - cusparseHandle_t handle, - const int n, - const int nnz, - const T* csrRowPtr, - T* cooRowInd, - cudaStream_t stream); + cusparseHandle_t handle, const int n, const int nnz, const T* csrRowPtr, + T* cooRowInd, cudaStream_t stream); template <> -inline void cusparsecsr2coo(cusparseHandle_t handle, - const int n, - const int nnz, - const int* csrRowPtr, - int* cooRowInd, - cudaStream_t stream) -{ +inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz, + const int* csrRowPtr, int* cooRowInd, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, + CUSPARSE_INDEX_BASE_ZERO)); } /** @} */ - + /** * @defgroup setpointermode cusparse set pointer mode method * @{ @@ -340,12 +274,11 @@ inline void cusparsecsr2coo(cusparseHandle_t handle, // template<> inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, cusparsePointerMode_t mode, - cudaStream_t stream) -{ + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSetPointerMode(handle, mode); + return 
cusparseSetPointerMode(handle, mode); } - /** @} */ + }; // namespace sparse }; // namespace raft From e311c3e34f975c1bea632e01a96b30d45b1f35b9 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 9 Jun 2020 18:59:28 -0500 Subject: [PATCH 080/189] Added sscal to cublas wrappers. --- cpp/include/raft/linalg/cublas_wrappers.h | 26 +++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index 5b09a792ef..5d80b62458 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -562,5 +562,31 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, } /** @} */ +/** + * @defgroup scal cublas dot calls + * @{ + */ +template +cublasStatus_t cublasscal(cublasHandle_t handle, int n, const T *alpha, T *x, + int incx, cudaStream_t stream); + +template <> +inline cublasStatus_t cublasscal(cublasHandle_t handle, int n, + const float *alpha, float *x, int incx, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSscal(handle, n, alpha, x, incx); +} + +template <> +inline cublasStatus_t cublasscal(cublasHandle_t handle, int n, + const double *alpha, double *x, int incx, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDscal(handle, n, alpha, x, incx); +} + +/** @} */ + }; // namespace linalg }; // namespace raft From 022948aab6ef9c27613137a11d5c831680215354 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 9 Jun 2020 20:51:55 -0500 Subject: [PATCH 081/189] Updated cublas calls in Lanczos (partially). --- cpp/include/raft/spectral/lanczos.hpp | 246 +++++++++++++++----------- 1 file changed, 144 insertions(+), 102 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index f83652c157..c375ebbb3e 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -25,9 +25,11 @@ #include #include +#include +#include #include #include -#include + // ========================================================= // Useful macros @@ -73,7 +75,8 @@ using namespace matrix; * @return Zero if successful. Otherwise non-zero. 
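+ * @param handle raft handle; the wrapped cuBLAS calls below obtain their
+ *    cublas handle and CUDA stream from it (via get_cublas_handle() and
+ *    get_stream()).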
*/ template -int performLanczosIteration(sparse_matrix_t const* A, +int performLanczosIteration(handle_t handle, + sparse_matrix_t const* A, IndexType_ *iter, IndexType_ maxIter, ValueType_ shift, @@ -93,8 +96,13 @@ int performLanczosIteration(sparse_matrix_t const* A, const ValueType_ negOne = -1; const ValueType_ zero = 0; - IndexType_ n = A->nrows; + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + RAFT_EXPECT( A != nullptr, "Null matrix pointer."); + IndexType_ n = A->nrows; + // ------------------------------------------------------- // Compute second Lanczos vector // ------------------------------------------------------- @@ -103,20 +111,22 @@ int performLanczosIteration(sparse_matrix_t const* A, // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync( - lanczosVecs_dev + n, lanczosVecs_dev, n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); // Orthogonalize Lanczos vector - Cublas::dot(n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host); - Cublas::axpy(n, -alpha_host[0], lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1); - beta_host[0] = Cublas::nrm2(n, lanczosVecs_dev + IDX(0, 1, n), 1); + CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); + + auto alpha = -alpha_host[0]; + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); // Check if Lanczos has converged if (beta_host[0] <= tol) return 0; // Normalize Lanczos vector - Cublas::scal(n, 1 / beta_host[0], lanczosVecs_dev + IDX(0, 1, n), 1); + alpha = 1 / beta_host[0]; + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); } // ------------------------------------------------------- @@ -131,91 +141,115 @@ int performLanczosIteration(sparse_matrix_t const* A, CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, n * sizeof(ValueType_), - cudaMemcpyDeviceToDevice)); + cudaMemcpyDeviceToDevice, stream)); A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); // Full reorthogonalization // "Twice is enough" algorithm per Kahan and Parlett if (reorthogonalize) { - Cublas::gemv(true, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1); - Cublas::gemv(false, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1); + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), - work_dev + (*iter - 1), - sizeof(ValueType_), - cudaMemcpyDeviceToHost)); - Cublas::gemv(true, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1); - Cublas::gemv(false, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, 
n), - 1); + work_dev + (*iter - 1), + sizeof(ValueType_), + cudaMemcpyDeviceToHost, stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Orthogonalization with 3-term recurrence relation else { - Cublas::dot(n, - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - alpha_host + (*iter - 1)); - Cublas::axpy(n, - -alpha_host[*iter - 1], - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1); - Cublas::axpy(n, - -beta_host[*iter - 2], - lanczosVecs_dev + IDX(0, *iter - 2, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1); + CUBLAS_CHECK(cublasdot(cublas_h, + n, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + alpha_host + (*iter - 1), + stream)); + + auto alpha = -alpha_host[*iter - 1]; + CUBLAS_CHECK(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + + alpha = -beta_host[*iter - 2]; + CUBLAS_CHECK(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Compute residual - beta_host[*iter - 1] = Cublas::nrm2(n, lanczosVecs_dev + IDX(0, *iter, n), 1); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); // Check if Lanczos has converged if (beta_host[*iter - 1] <= tol) break; + // Normalize Lanczos vector - Cublas::scal(n, 1 / beta_host[*iter - 1], lanczosVecs_dev + IDX(0, *iter, n), 1); + alpha = 1 / beta_host[*iter - 1]; + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } CUDA_TRY(cudaDeviceSynchronize()); @@ -557,10 +591,10 @@ static int lanczosRestart(IndexType_ n, // Obtain new residual CUDA_TRY( - cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(ValueType_), cudaMemcpyHostToDevice)); + cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(ValueType_), cudaMemcpyHostToDevice, stream)); beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; - Cublas::gemv(false, + cublasgemv(false, n, iter, beta_host + iter_new - 1, @@ -573,19 +607,18 @@ static int lanczosRestart(IndexType_ n, 1); // Obtain new Lanczos vectors - Cublas::gemm( + cublasgemm( false, false, n, iter_new, iter, &one, lanczosVecs_dev, n, V_dev, iter, &zero, work_dev, n); - CUDA_TRY(cudaMemcpyAsync( - lanczosVecs_dev, work_dev, n * iter_new * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, n * iter_new * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); // Normalize residual to obtain new Lanczos vector CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), n * sizeof(ValueType_), - cudaMemcpyDeviceToDevice)); - beta_host[iter_new - 1] = Cublas::nrm2(n, lanczosVecs_dev + IDX(0, iter_new, n), 1); - Cublas::scal(n, 1 / beta_host[iter_new - 1], lanczosVecs_dev + IDX(0, iter_new, n), 1); + cudaMemcpyDeviceToDevice, stream)); + beta_host[iter_new - 1] = cublasnrm2(n, lanczosVecs_dev + IDX(0, iter_new, n), 1); + cublasscal(n, 1 / beta_host[iter_new - 1], lanczosVecs_dev + IDX(0, iter_new, n), 1); 
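+  // Column iter_new of lanczosVecs_dev now holds the unit-norm restart
+  // vector; its residual norm was stored in beta_host[iter_new - 1] just
+  // before the rescaling.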
return 0; } @@ -643,7 +676,8 @@ static int lanczosRestart(IndexType_ n, * @return error flag. */ template -int computeSmallestEigenvectors(sparse_matrix_t const* A, +int computeSmallestEigenvectors(handle_t handle, + sparse_matrix_t const* A, IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, @@ -718,7 +752,7 @@ int computeSmallestEigenvectors(sparse_matrix_t const* A work_host = work_host_v.data(); // Initialize cuBLAS - Cublas::set_pointer_mode_host(); + cublasset_pointer_mode_host(); // ------------------------------------------------------- // Compute largest eigenvalue to determine shift @@ -736,8 +770,8 @@ int computeSmallestEigenvectors(sparse_matrix_t const* A // CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, time(NULL))); // Initialize initial Lanczos vector CUDA_TRY(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); - ValueType_ normQ1 = Cublas::nrm2(n, lanczosVecs_dev, 1); - Cublas::scal(n, 1 / normQ1, lanczosVecs_dev, 1); + ValueType_ normQ1 = cublasnrm2(n, lanczosVecs_dev, 1); + cublasscal(n, 1 / normQ1, lanczosVecs_dev, 1); // Estimate number of Lanczos iterations // See bounds in Kuczynski and Wozniakowski (1992). @@ -749,7 +783,8 @@ int computeSmallestEigenvectors(sparse_matrix_t const* A // Obtain tridiagonal matrix with Lanczos *effIter = 0; *shift = 0; - status = performLanczosIteration(A, + status = performLanczosIteration(handle, + A, effIter, maxIter_curr, *shift, @@ -773,7 +808,8 @@ int computeSmallestEigenvectors(sparse_matrix_t const* A // Obtain tridiagonal matrix with Lanczos *effIter = 0; // maxIter_curr = min(maxIter, restartIter); - status = performLanczosIteration(A, + status = performLanczosIteration(handle, + A, effIter, maxIter_curr, *shift, @@ -819,7 +855,8 @@ int computeSmallestEigenvectors(sparse_matrix_t const* A // Proceed with Lanczos method // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); - status = performLanczosIteration(A, + status = performLanczosIteration(handle, + A, effIter, maxIter_curr, *shift, @@ -866,7 +903,7 @@ int computeSmallestEigenvectors(sparse_matrix_t const* A work_dev, Z_host, (*effIter) * nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); // Convert eigenvectors from Lanczos basis to standard basis - Cublas::gemm(false, + cublasgemm(false, false, n, nEigVecs, @@ -959,7 +996,8 @@ int computeSmallestEigenvectors(handle_t handle, // Perform Lanczos method IndexType_ effIter; ValueType_ shift; - int status = computeSmallestEigenvectors(&A, + int status = computeSmallestEigenvectors(handle, + &A, nEigVecs, maxIter, restartIter, @@ -1026,7 +1064,8 @@ int computeSmallestEigenvectors(handle_t handle, * @return error flag. 
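+ * @note Minimal call sketch (illustrative only; assumes the host and
+ *    device workspaces were allocated with the sizes documented above):
+ *
+ *      IndexType_ effIter, totalIter;
+ *      computeLargestEigenvectors(handle, &A, nEigVecs, maxIter,
+ *                                 restartIter, tol, true, &effIter,
+ *                                 &totalIter, alpha_host, beta_host,
+ *                                 lanczosVecs_dev, work_dev, eigVals_dev,
+ *                                 eigVecs_dev, seed);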
*/ template -int computeLargestEigenvectors(sparse_matrix_t const* A, +int computeLargestEigenvectors(handle_t handle, + sparse_matrix_t const* A, IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, @@ -1095,7 +1134,7 @@ int computeLargestEigenvectors(sparse_matrix_t const* A, work_host = work_host_v.data(); // Initialize cuBLAS - Cublas::set_pointer_mode_host(); + cublasset_pointer_mode_host(); // ------------------------------------------------------- // Compute largest eigenvalue @@ -1108,8 +1147,8 @@ int computeLargestEigenvectors(sparse_matrix_t const* A, CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, seed)); // Initialize initial Lanczos vector CUDA_TRY(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); - ValueType_ normQ1 = Cublas::nrm2(n, lanczosVecs_dev, 1); - Cublas::scal(n, 1 / normQ1, lanczosVecs_dev, 1); + ValueType_ normQ1 = cublasnrm2(n, lanczosVecs_dev, 1); + cublasscal(n, 1 / normQ1, lanczosVecs_dev, 1); // Estimate number of Lanczos iterations // See bounds in Kuczynski and Wozniakowski (1992). @@ -1123,7 +1162,8 @@ int computeLargestEigenvectors(sparse_matrix_t const* A, ValueType_ shift_val = 0.0; ValueType_ *shift = &shift_val; // maxIter_curr = min(maxIter, restartIter); - status = performLanczosIteration(A, + status = performLanczosIteration(handle, + A, effIter, maxIter_curr, *shift, @@ -1169,7 +1209,8 @@ int computeLargestEigenvectors(sparse_matrix_t const* A, // Proceed with Lanczos method // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); - status = performLanczosIteration(A, + status = performLanczosIteration(handle, + A, effIter, maxIter_curr, *shift, @@ -1241,7 +1282,7 @@ int computeLargestEigenvectors(sparse_matrix_t const* A, cudaMemcpyHostToDevice)); // Convert eigenvectors from Lanczos basis to standard basis - Cublas::gemm(false, + cublasgemm(false, false, n, nEigVecs, @@ -1333,7 +1374,8 @@ int computeLargestEigenvectors(handle_t handle, // Perform Lanczos method IndexType_ effIter; - int status = computeLargestEigenvectors(&A, + int status = computeLargestEigenvectors(handle, + &A, nEigVecs, maxIter, restartIter, From 2566b247d30ed829e2f1bb1b33d70edc23c9b08a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 10 Jun 2020 00:13:29 -0400 Subject: [PATCH 082/189] move common error handling utilities from cuda_utils.h to error.hpp --- cpp/include/raft/cudart_utils.h | 79 ++------------------------------- cpp/include/raft/error.hpp | 72 ++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 75 deletions(-) diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 47e76ab916..5ae4bcbac2 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -16,89 +16,18 @@ #pragma once +#include "raft/error.hpp" + #include + #include -#include #include -#include -#include -#include -#include -#include + ///@todo: enable once logging has been enabled in raft //#include "logger.hpp" namespace raft { -/** base exception class for the whole of raft */ -class exception : public std::exception { - public: - /** default ctor */ - explicit exception() noexcept : std::exception(), msg_() {} - - /** copy ctor */ - exception(const exception& src) noexcept - : std::exception(), msg_(src.what()) { - collect_call_stack(); - } - - /** ctor from an input message */ - explicit exception(const std::string _msg) noexcept - : std::exception(), msg_(std::move(_msg)) { - collect_call_stack(); - } - - /** get the message associated with this exception */ - const char* 
what() const noexcept override { return msg_.c_str(); } - - private: - /** message associated with this exception */ - std::string msg_; - - /** append call stack info to this exception's message for ease of debug */ - // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collect_call_stack() noexcept { -#ifdef __GNUC__ - constexpr int kMaxStackDepth = 64; - void* stack[kMaxStackDepth]; // NOLINT - auto depth = backtrace(stack, kMaxStackDepth); - std::ostringstream oss; - oss << std::endl << "Obtained " << depth << " stack frames" << std::endl; - char** strings = backtrace_symbols(stack, depth); - if (strings == nullptr) { - oss << "But no stack trace could be found!" << std::endl; - msg_ += oss.str(); - return; - } - ///@todo: support for demangling of C++ symbol names - for (int i = 0; i < depth; ++i) { - oss << "#" << i << " in " << strings[i] << std::endl; - } - free(strings); - msg_ += oss.str(); -#endif // __GNUC__ - } -}; - -/** macro to throw a runtime error */ -#define THROW(fmt, ...) \ - do { \ - std::string msg; \ - char errMsg[2048]; /* NOLINT */ \ - std::snprintf(errMsg, sizeof(errMsg), \ - "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ - msg += errMsg; \ - std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ - msg += errMsg; \ - throw raft::exception(msg); \ - } while (0) - -/** macro to check for a conditional and assert on failure */ -#define ASSERT(check, fmt, ...) \ - do { \ - if (!(check)) THROW(fmt, ##__VA_ARGS__); \ - } while (0) - /** check for cuda runtime API errors and assert accordingly */ #define CUDA_CHECK(call) \ do { \ diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 9424d28001..42fdbc9897 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -22,11 +22,83 @@ #include #include +#include +#include +#include #include #include namespace raft { +/** base exception class for the whole of raft */ +class exception : public std::exception { + public: + /** default ctor */ + explicit exception() noexcept : std::exception(), msg_() {} + + /** copy ctor */ + exception(const exception& src) noexcept + : std::exception(), msg_(src.what()) { + collect_call_stack(); + } + + /** ctor from an input message */ + explicit exception(const std::string _msg) noexcept + : std::exception(), msg_(std::move(_msg)) { + collect_call_stack(); + } + + /** get the message associated with this exception */ + const char* what() const noexcept override { return msg_.c_str(); } + + private: + /** message associated with this exception */ + std::string msg_; + + /** append call stack info to this exception's message for ease of debug */ + // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html + void collect_call_stack() noexcept { +#ifdef __GNUC__ + constexpr int kMaxStackDepth = 64; + void* stack[kMaxStackDepth]; // NOLINT + auto depth = backtrace(stack, kMaxStackDepth); + std::ostringstream oss; + oss << std::endl << "Obtained " << depth << " stack frames" << std::endl; + char** strings = backtrace_symbols(stack, depth); + if (strings == nullptr) { + oss << "But no stack trace could be found!" << std::endl; + msg_ += oss.str(); + return; + } + ///@todo: support for demangling of C++ symbol names + for (int i = 0; i < depth; ++i) { + oss << "#" << i << " in " << strings[i] << std::endl; + } + free(strings); + msg_ += oss.str(); +#endif // __GNUC__ + } +}; + +/** macro to throw a runtime error */ +#define THROW(fmt, ...) 
\ + do { \ + std::string msg; \ + char errMsg[2048]; /* NOLINT */ \ + std::snprintf(errMsg, sizeof(errMsg), \ + "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ + msg += errMsg; \ + std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ + msg += errMsg; \ + throw raft::exception(msg); \ + } while (0) + +/** macro to check for a conditional and assert on failure */ +#define ASSERT(check, fmt, ...) \ + do { \ + if (!(check)) THROW(fmt, ##__VA_ARGS__); \ + } while (0) + /** * @brief Exception thrown when logical precondition is violated. * From 43d3f78f9200cd88a94242b6729b991885ef84af Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 10 Jun 2020 12:31:26 -0500 Subject: [PATCH 083/189] Updated cublas depends. in Lanczos. --- cpp/include/raft/spectral/lanczos.hpp | 698 ++++++++++---------------- 1 file changed, 260 insertions(+), 438 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index c375ebbb3e..938c4421ab 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -25,11 +25,10 @@ #include #include -#include #include -#include +#include #include - +#include // ========================================================= // Useful macros @@ -43,7 +42,7 @@ namespace raft { namespace { using namespace matrix; - + // ========================================================= // Helper functions // ========================================================= @@ -75,34 +74,28 @@ using namespace matrix; * @return Zero if successful. Otherwise non-zero. */ template -int performLanczosIteration(handle_t handle, - sparse_matrix_t const* A, - IndexType_ *iter, - IndexType_ maxIter, - ValueType_ shift, - ValueType_ tol, - bool reorthogonalize, - ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev) -{ +int performLanczosIteration( + handle_t handle, sparse_matrix_t const *A, + IndexType_ *iter, IndexType_ maxIter, ValueType_ shift, ValueType_ tol, + bool reorthogonalize, ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful variables - const ValueType_ one = 1; + const ValueType_ one = 1; const ValueType_ negOne = -1; - const ValueType_ zero = 0; + const ValueType_ zero = 0; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); - RAFT_EXPECT( A != nullptr, "Null matrix pointer."); + RAFT_EXPECT(A != nullptr, "Null matrix pointer."); IndexType_ n = A->nrows; - + // ------------------------------------------------------- // Compute second Lanczos vector // ------------------------------------------------------- @@ -111,22 +104,29 @@ int performLanczosIteration(handle_t handle, // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, + n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, + stream)); A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); // Orthogonalize Lanczos vector - CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); + CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1, + 
lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, + stream)); auto alpha = -alpha_host[0]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, + lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, + beta_host, stream)); // Check if Lanczos has converged if (beta_host[0] <= tol) return 0; // Normalize Lanczos vector alpha = 1 / beta_host[0]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), + 1, stream)); } // ------------------------------------------------------- @@ -138,118 +138,65 @@ int performLanczosIteration(handle_t handle, // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, - lanczosVecs_dev + (*iter - 1) * n, - n * sizeof(ValueType_), - cudaMemcpyDeviceToDevice, stream)); - A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); + CUDA_TRY(cudaMemcpyAsync( + lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, + lanczosVecs_dev + IDX(0, *iter, n)); // Full reorthogonalization // "Twice is enough" algorithm per Kahan and Parlett if (reorthogonalize) { - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_T, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1, - stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); - - CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), - work_dev + (*iter - 1), - sizeof(ValueType_), - cudaMemcpyDeviceToHost, stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_T, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1, - stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + CUBLAS_CHECK(cublasgemv( + cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, + lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, + lanczosVecs_dev, n, work_dev, 1, &one, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + + CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1), + sizeof(ValueType_), cudaMemcpyDeviceToHost, + stream)); + + CUBLAS_CHECK(cublasgemv( + cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, + lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, + lanczosVecs_dev, n, work_dev, 1, &one, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } // Orthogonalization with 3-term recurrence relation else { - CUBLAS_CHECK(cublasdot(cublas_h, - n, - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - alpha_host + (*iter - 1), - stream)); + CUBLAS_CHECK(cublasdot(cublas_h, n, + lanczosVecs_dev + IDX(0, *iter - 1, n), 1, + 
lanczosVecs_dev + IDX(0, *iter, n), 1, + alpha_host + (*iter - 1), stream)); auto alpha = -alpha_host[*iter - 1]; - CUBLAS_CHECK(cublasaxpy(cublas_h, - n, - &alpha, - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), 1, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); alpha = -beta_host[*iter - 2]; - CUBLAS_CHECK(cublasaxpy(cublas_h, - n, - &alpha, - lanczosVecs_dev + IDX(0, *iter - 2, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), 1, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } // Compute residual - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, + beta_host + *iter - 1, stream)); // Check if Lanczos has converged if (beta_host[*iter - 1] <= tol) break; - + // Normalize Lanczos vector alpha = 1 / beta_host[*iter - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } CUDA_TRY(cudaDeviceSynchronize()); @@ -273,8 +220,7 @@ int performLanczosIteration(handle_t handle, * matrix. Matrix dimensions are 3 x 3. */ template -static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) -{ +static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) { // Compute norm of vector *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); @@ -284,7 +230,8 @@ static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) v[0] -= *Pv; // Normalize Householder vector - ValueType_ normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + ValueType_ normHouseholder = + std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); if (normHouseholder != 0) { v[0] /= normHouseholder; v[1] /= normHouseholder; @@ -312,8 +259,7 @@ static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. */ template -static void applyHouseholder3(const ValueType_ *v, ValueType_ *A) -{ +static void applyHouseholder3(const ValueType_ *v, ValueType_ *A) { // Loop indices IndexType_ i, j; // Dot product between Householder vector and matrix row/column @@ -353,14 +299,10 @@ static void applyHouseholder3(const ValueType_ *v, ValueType_ *A) * @return Zero if successful. Otherwise non-zero. 
*/ template -static int francisQRIteration(IndexType_ n, - ValueType_ shift1, - ValueType_ shift2, - ValueType_ *alpha, - ValueType_ *beta, - ValueType_ *V, - ValueType_ *work) -{ +static int francisQRIteration(IndexType_ n, ValueType_ shift1, + ValueType_ shift2, ValueType_ *alpha, + ValueType_ *beta, ValueType_ *V, + ValueType_ *work) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- @@ -390,7 +332,8 @@ static int francisQRIteration(IndexType_ n, householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; householder[1] = beta[0] * (alpha[0] + alpha[1] + b); householder[2] = beta[0] * beta[1]; - findHouseholder3(householder, &temp, householderMatrix); + findHouseholder3(householder, &temp, + householderMatrix); // Apply initial Householder transform to create bulge memset(bulge, 0, 16 * sizeof(ValueType_)); @@ -400,13 +343,14 @@ static int francisQRIteration(IndexType_ n, bulge[IDX(i, i + 1, 4)] = beta[i]; } applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, + 0, work, n); memcpy(V, work, 3 * n * sizeof(ValueType_)); // Chase bulge to bottom-right of matrix with Householder transforms for (pos = 0; pos < n - 4; ++pos) { // Move to next position - alpha[pos] = bulge[IDX(0, 0, 4)]; + alpha[pos] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; @@ -421,16 +365,17 @@ static int francisQRIteration(IndexType_ n, bulge[IDX(3, 3, 4)] = alpha[pos + 4]; // Apply Householder transform - findHouseholder3(householder, beta + pos, householderMatrix); + findHouseholder3(householder, beta + pos, + householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); + Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), + n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(ValueType_)); } // Apply penultimate Householder transform // Values in the last row and column are zero - alpha[n - 4] = bulge[IDX(0, 0, 4)]; + alpha[n - 4] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; @@ -443,30 +388,32 @@ static int francisQRIteration(IndexType_ n, bulge[IDX(1, 3, 4)] = 0; bulge[IDX(2, 3, 4)] = 0; bulge[IDX(3, 3, 4)] = 0; - findHouseholder3(householder, beta + n - 4, householderMatrix); + findHouseholder3(householder, beta + n - 4, + householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); + Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, + householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(ValueType_)); // Apply final Householder transform // Values in the last two rows and columns are zero - alpha[n - 3] = bulge[IDX(0, 0, 4)]; + alpha[n - 3] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = 0; for (j = 0; j < 3; ++j) for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - findHouseholder3(householder, beta + n - 3, householderMatrix); + findHouseholder3(householder, beta + n - 3, + householderMatrix); 
applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); + Lapack::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, + householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(ValueType_)); // Bulge has been eliminated alpha[n - 2] = bulge[IDX(0, 0, 4)]; alpha[n - 1] = bulge[IDX(1, 1, 4)]; - beta[n - 2] = bulge[IDX(1, 0, 4)]; + beta[n - 2] = bulge[IDX(1, 0, 4)]; return 0; } @@ -501,26 +448,23 @@ static int francisQRIteration(IndexType_ n, * Workspace. */ template -static int lanczosRestart(IndexType_ n, - IndexType_ iter, - IndexType_ iter_new, - ValueType_ *shiftUpper, - ValueType_ *shiftLower, - ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ V_host, - ValueType_ *__restrict__ work_host, - ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev, - bool smallest_eig) -{ +static int lanczosRestart( + handle_t handle, IndexType_ n, IndexType_ iter, IndexType_ iter_new, + ValueType_ *shiftUpper, ValueType_ *shiftLower, + ValueType_ *__restrict__ alpha_host, ValueType_ *__restrict__ beta_host, + ValueType_ *__restrict__ V_host, ValueType_ *__restrict__ work_host, + ValueType_ *__restrict__ lanczosVecs_dev, ValueType_ *__restrict__ work_dev, + bool smallest_eig) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful constants const ValueType_ zero = 0; - const ValueType_ one = 1; + const ValueType_ one = 1; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); // Loop index IndexType_ i; @@ -578,52 +522,54 @@ static int lanczosRestart(IndexType_ n, // Calculate Chebyshev nodes as shifts shifts_host = ritzVals_host; for (i = 0; i < restartSteps; ++i) { - shifts_host[i] = cos((i + 0.5) * static_cast(M_PI) / restartSteps); + shifts_host[i] = + cos((i + 0.5) * static_cast(M_PI) / restartSteps); shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); } // Apply Francis QR algorithm to implicitly restart Lanczos for (i = 0; i < restartSteps; i += 2) - if (francisQRIteration( - iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host)) + if (francisQRIteration(iter, shifts_host[i], shifts_host[i + 1], alpha_host, + beta_host, V_host, work_host)) WARNING("error in implicitly shifted QR algorithm"); // Obtain new residual - CUDA_TRY( - cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(ValueType_), cudaMemcpyHostToDevice, stream)); - - beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; - cublasgemv(false, - n, - iter, - beta_host + iter_new - 1, - lanczosVecs_dev, - n, - V_dev + IDX(0, iter_new, iter), - 1, - beta_host + iter - 1, - lanczosVecs_dev + IDX(0, iter, n), - 1); + CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(ValueType_), + cudaMemcpyHostToDevice, stream)); + + beta_host[iter - 1] = + beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; + CUBLAS_CHECK(cublasgemv( + cublas_h, CUBLAS_OP_N, n, iter, beta_host + iter_new - 1, lanczosVecs_dev, + n, V_dev + IDX(0, iter_new, iter), 1, beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), 1, stream)); // Obtain new Lanczos vectors - cublasgemm( - false, false, n, iter_new, iter, &one, lanczosVecs_dev, n, V_dev, iter, &zero, work_dev, n); + CUBLAS_CHECK(cublasgemm(cublas_h, 
CUBLAS_OP_N, CUBLAS_OP_N, n, iter_new, iter, + &one, lanczosVecs_dev, n, V_dev, iter, &zero, + work_dev, n, stream)); - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, n * iter_new * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, + n * iter_new * sizeof(ValueType_), + cudaMemcpyDeviceToDevice, stream)); // Normalize residual to obtain new Lanczos vector - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), - lanczosVecs_dev + IDX(0, iter, n), - n * sizeof(ValueType_), - cudaMemcpyDeviceToDevice, stream)); - beta_host[iter_new - 1] = cublasnrm2(n, lanczosVecs_dev + IDX(0, iter_new, n), 1); - cublasscal(n, 1 / beta_host[iter_new - 1], lanczosVecs_dev + IDX(0, iter_new, n), 1); + CUDA_TRY(cudaMemcpyAsync( + lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), + n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, + beta_host + iter_new - 1, stream)); + + auto h_beta = 1 / beta_host[iter_new - 1]; + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, + lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); return 0; } -} // anonym. namespace +} // namespace // ========================================================= // Eigensolver @@ -676,30 +622,20 @@ static int lanczosRestart(IndexType_ n, * @return error flag. */ template -int computeSmallestEigenvectors(handle_t handle, - sparse_matrix_t const* A, - IndexType_ nEigVecs, - IndexType_ maxIter, - IndexType_ restartIter, - ValueType_ tol, - bool reorthogonalize, - IndexType_ *effIter, - IndexType_ *totalIter, - ValueType_ *shift, - ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev, - ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev, - unsigned long long seed) -{ +int computeSmallestEigenvectors( + handle_t handle, sparse_matrix_t const *A, + IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, + ValueType_ tol, bool reorthogonalize, IndexType_ *effIter, + IndexType_ *totalIter, ValueType_ *shift, ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev, ValueType_ *__restrict__ eigVals_dev, + ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful constants - const ValueType_ one = 1; + const ValueType_ one = 1; const ValueType_ zero = 0; // Matrix dimension @@ -730,12 +666,14 @@ int computeSmallestEigenvectors(handle_t handle, // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECT(nEigVecs > 0 && nEigVecs<=n, "Invalid number of eigenvectors."); + RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); RAFT_EXPECT(tol > 0, "Invalid tolerance."); RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); - + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); // ------------------------------------------------------- // Variable initialization @@ -748,11 +686,12 @@ int 
computeSmallestEigenvectors(handle_t handle, std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - cublasset_pointer_mode_host(); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, + stream)); // ????? TODO: check / remove // ------------------------------------------------------- // Compute largest eigenvalue to determine shift @@ -769,9 +708,13 @@ int computeSmallestEigenvectors(handle_t handle, CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, seed /*time(NULL)*/)); // CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, time(NULL))); // Initialize initial Lanczos vector - CUDA_TRY(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); - ValueType_ normQ1 = cublasnrm2(n, lanczosVecs_dev, 1); - cublasscal(n, 1 / normQ1, lanczosVecs_dev, 1); + CUDA_TRY( + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); + ValueType_ normQ1; + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + + auto h_val = 1 / normQ1; + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); // Estimate number of Lanczos iterations // See bounds in Kuczynski and Wozniakowski (1992). @@ -782,18 +725,10 @@ int computeSmallestEigenvectors(handle_t handle, // Obtain tridiagonal matrix with Lanczos *effIter = 0; - *shift = 0; - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - 0.0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); + *shift = 0; + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host, + beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); // Determine largest eigenvalue @@ -808,17 +743,9 @@ int computeSmallestEigenvectors(handle_t handle, // Obtain tridiagonal matrix with Lanczos *effIter = 0; // maxIter_curr = min(maxIter, restartIter); - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - 0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, + beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -835,18 +762,9 @@ int computeSmallestEigenvectors(handle_t handle, if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart(n, - *effIter, - iter_new, - &shiftUpper, - &shiftLower, - alpha_host, - beta_host, - Z_host, - work_host, - lanczosVecs_dev, - work_dev, - true); + status = lanczosRestart( + handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, + beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -855,17 +773,9 @@ int computeSmallestEigenvectors(handle_t handle, // Proceed with Lanczos method // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - tol * fabs(shiftLower), - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), + reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) 
WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -876,14 +786,12 @@ int computeSmallestEigenvectors(handle_t handle, } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(ValueType_)); - memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(ValueType_)); - Lapack::steqr('I', - *effIter, - work_host + 2 * (*effIter), - work_host + 3 * (*effIter), - Z_host, - *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, + (*effIter) * sizeof(ValueType_)); + memcpy(work_host + 3 * (*effIter), beta_host, + (*effIter - 1) * sizeof(ValueType_)); + Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), + work_host + 3 * (*effIter), Z_host, *effIter, work_host); // Obtain desired eigenvalues by applying shift @@ -891,31 +799,20 @@ int computeSmallestEigenvectors(handle_t handle, for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory - CUDA_TRY(cudaMemcpy(eigVals_dev, - work_host + 2 * (*effIter), - nEigVecs * sizeof(ValueType_), - cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpy(eigVals_dev, work_host + 2 * (*effIter), + nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); // for (int i = 0; i < nEigVecs; ++i) //{ // std::cout <<*(work_host+(2*(*effIter)+i))<< std::endl; //} - CUDA_TRY(cudaMemcpy( - work_dev, Z_host, (*effIter) * nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpy(work_dev, Z_host, + (*effIter) * nEigVecs * sizeof(ValueType_), + cudaMemcpyHostToDevice)); // Convert eigenvectors from Lanczos basis to standard basis - cublasgemm(false, - false, - n, - nEigVecs, - *effIter, - &one, - lanczosVecs_dev, - n, - work_dev, - *effIter, - &zero, - eigVecs_dev, - n); + CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, + *effIter, &one, lanczosVecs_dev, n, work_dev, + *effIter, &zero, eigVecs_dev, n, stream)); // Clean up and exit CUDA_TRY(curandDestroyGenerator(randGen)); @@ -959,24 +856,17 @@ int computeSmallestEigenvectors(handle_t handle, * @return error flag. 
*/ template -int computeSmallestEigenvectors(handle_t handle, - sparse_matrix_t const& A, - IndexType_ nEigVecs, - IndexType_ maxIter, - IndexType_ restartIter, - ValueType_ tol, - bool reorthogonalize, - IndexType_ &iter, - ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev, - unsigned long long seed = 1234567, - cudaStream_t stream = 0) -{ +int computeSmallestEigenvectors( + handle_t handle, sparse_matrix_t const &A, + IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, + ValueType_ tol, bool reorthogonalize, IndexType_ &iter, + ValueType_ *__restrict__ eigVals_dev, ValueType_ *__restrict__ eigVecs_dev, + unsigned long long seed = 1234567) { // Matrix dimension IndexType_ n = A.nrows; // Check that parameters are valid - RAFT_EXPECT(nEigVecs > 0 && nEigVecs<=n, "Invalid number of eigenvectors."); + RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); RAFT_EXPECT(tol > 0, "Invalid tolerance."); RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); @@ -987,32 +877,20 @@ int computeSmallestEigenvectors(handle_t handle, std::vector beta_host_v(restartIter); ValueType_ *alpha_host = alpha_host_v.data(); - ValueType_ *beta_host = beta_host_v.data(); + ValueType_ *beta_host = beta_host_v.data(); //TODO: replace and fix allocation via RAFT handle vector_t lanczosVecs_dev(handle, n * (restartIter + 1), stream); - vector_t work_dev(handle, (n + restartIter) * restartIter, stream); + vector_t work_dev(handle, (n + restartIter) * restartIter, + stream); // Perform Lanczos method IndexType_ effIter; ValueType_ shift; - int status = computeSmallestEigenvectors(handle, - &A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - &effIter, - &iter, - &shift, - alpha_host, - beta_host, - lanczosVecs_dev.raw(), - work_dev.raw(), - eigVals_dev, - eigVecs_dev, - seed); + int status = computeSmallestEigenvectors( + handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, + &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), + eigVals_dev, eigVecs_dev, seed); // Clean up and return return status; @@ -1064,29 +942,20 @@ int computeSmallestEigenvectors(handle_t handle, * @return error flag. 
*/ template -int computeLargestEigenvectors(handle_t handle, - sparse_matrix_t const* A, - IndexType_ nEigVecs, - IndexType_ maxIter, - IndexType_ restartIter, - ValueType_ tol, - bool reorthogonalize, - IndexType_ *effIter, - IndexType_ *totalIter, - ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev, - ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev, - unsigned long long seed) -{ +int computeLargestEigenvectors( + handle_t handle, sparse_matrix_t const *A, + IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, + ValueType_ tol, bool reorthogonalize, IndexType_ *effIter, + IndexType_ *totalIter, ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev, ValueType_ *__restrict__ eigVals_dev, + ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful constants - const ValueType_ one = 1; + const ValueType_ one = 1; const ValueType_ zero = 0; // Matrix dimension @@ -1113,12 +982,15 @@ int computeLargestEigenvectors(handle_t handle, // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECT(nEigVecs > 0 && nEigVecs<=n, "Invalid number of eigenvectors."); + RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); RAFT_EXPECT(tol > 0, "Invalid tolerance."); RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + // ------------------------------------------------------- // Variable initialization // ------------------------------------------------------- @@ -1130,11 +1002,12 @@ int computeLargestEigenvectors(handle_t handle, std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - cublasset_pointer_mode_host(); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, + stream)); // ????? TODO: check / remove // ------------------------------------------------------- // Compute largest eigenvalue @@ -1146,9 +1019,13 @@ int computeLargestEigenvectors(handle_t handle, CUDA_TRY(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, seed)); // Initialize initial Lanczos vector - CUDA_TRY(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); - ValueType_ normQ1 = cublasnrm2(n, lanczosVecs_dev, 1); - cublasscal(n, 1 / normQ1, lanczosVecs_dev, 1); + CUDA_TRY( + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); + ValueType_ normQ1; + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + + auto h_val = 1 / normQ1; + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); // Estimate number of Lanczos iterations // See bounds in Kuczynski and Wozniakowski (1992). 
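The hunk above shows the pattern that recurs throughout this patch: the removed Cublas:: statics become explicit wrapper calls from raft/linalg/cublas_wrappers.h, with the cuBLAS handle and stream taken from the handle_t once per function and scalar results returned through host pointers. In isolation the idiom looks roughly like this (a minimal sketch for orientation only, not part of the patch; it assumes the cublasnrm2/cublasscal wrapper signatures used in this file and a nonzero input vector):

    // Sketch only (hypothetical helper, not part of this patch):
    // normalize a device vector x of length n with the handle-based
    // wrappers adopted here. Assumes CUBLAS_POINTER_MODE_HOST, which
    // each solver sets up front, and a nonzero x.
    template <typename ValueType_>
    void normalize_inplace(handle_t const &handle, int n, ValueType_ *x) {
      auto cublas_h = handle.get_cublas_handle();
      auto stream = handle.get_stream();
      ValueType_ norm{};
      CUBLAS_CHECK(cublasnrm2(cublas_h, n, x, 1, &norm, stream));
      ValueType_ inv = 1 / norm;  // host scalar, hence host pointer mode
      CUBLAS_CHECK(cublasscal(cublas_h, n, &inv, x, 1, stream));
    }

Because norm and inv live on the host, the cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream) call near the top of each solver is what makes these calls well-defined.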
@@ -1158,21 +1035,13 @@ int computeLargestEigenvectors(handle_t handle, // maxIter_curr = min(maxIter_curr, restartIter); // Obtain tridiagonal matrix with Lanczos - *effIter = 0; + *effIter = 0; ValueType_ shift_val = 0.0; - ValueType_ *shift = &shift_val; + ValueType_ *shift = &shift_val; // maxIter_curr = min(maxIter, restartIter); - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - 0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, + beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -1189,18 +1058,9 @@ int computeLargestEigenvectors(handle_t handle, if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart(n, - *effIter, - iter_new, - &shiftUpper, - &shiftLower, - alpha_host, - beta_host, - Z_host, - work_host, - lanczosVecs_dev, - work_dev, - false); + status = lanczosRestart( + handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, + beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -1209,17 +1069,9 @@ int computeLargestEigenvectors(handle_t handle, // Proceed with Lanczos method // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - tol * fabs(shiftLower), - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), + reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -1232,14 +1084,12 @@ int computeLargestEigenvectors(handle_t handle, for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0; } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(ValueType_)); - memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(ValueType_)); - Lapack::steqr('I', - *effIter, - work_host + 2 * (*effIter), - work_host + 3 * (*effIter), - Z_host, - *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, + (*effIter) * sizeof(ValueType_)); + memcpy(work_host + 3 * (*effIter), beta_host, + (*effIter - 1) * sizeof(ValueType_)); + Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), + work_host + 3 * (*effIter), Z_host, *effIter, work_host); // note: We need to pick the top nEigVecs eigenvalues @@ -1266,35 +1116,24 @@ int computeLargestEigenvectors(handle_t handle, // Obtain desired eigenvalues by applying shift for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; - for (i = 0; i < top_eigenparis_idx_offset; ++i) work_host[i + 2 * (*effIter)] = 0; + for (i = 0; i < top_eigenparis_idx_offset; ++i) + work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory // skip smallest eigenvalue if needed CUDA_TRY(cudaMemcpy(eigVals_dev, - work_host + 2 * (*effIter) + top_eigenparis_idx_offset, - nEigVecs * sizeof(ValueType_), - cudaMemcpyHostToDevice)); + work_host + 2 * (*effIter) + top_eigenparis_idx_offset, + nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); // skip smallest eigenvector if needed - CUDA_TRY(cudaMemcpy(work_dev, - Z_host + (top_eigenparis_idx_offset * (*effIter)), - (*effIter) 
* nEigVecs * sizeof(ValueType_), - cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpy( + work_dev, Z_host + (top_eigenparis_idx_offset * (*effIter)), + (*effIter) * nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); // Convert eigenvectors from Lanczos basis to standard basis - cublasgemm(false, - false, - n, - nEigVecs, - *effIter, - &one, - lanczosVecs_dev, - n, - work_dev, - *effIter, - &zero, - eigVecs_dev, - n); + CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, + *effIter, &one, lanczosVecs_dev, n, work_dev, + *effIter, &zero, eigVecs_dev, n, stream)); // Clean up and exit CUDA_TRY(curandDestroyGenerator(randGen)); @@ -1339,23 +1178,18 @@ int computeLargestEigenvectors(handle_t handle, */ template int computeLargestEigenvectors(handle_t handle, - sparse_matrix_t const& A, - IndexType_ nEigVecs, - IndexType_ maxIter, - IndexType_ restartIter, - ValueType_ tol, - bool reorthogonalize, - IndexType_ &iter, + sparse_matrix_t const &A, + IndexType_ nEigVecs, IndexType_ maxIter, + IndexType_ restartIter, ValueType_ tol, + bool reorthogonalize, IndexType_ &iter, ValueType_ *__restrict__ eigVals_dev, ValueType_ *__restrict__ eigVecs_dev, - unsigned long long seed = 123456, - cudaStream_t stream = 0) -{ + unsigned long long seed = 123456) { // Matrix dimension IndexType_ n = A.nrows; // Check that parameters are valid - RAFT_EXPECT(nEigVecs > 0 && nEigVecs<=n, "Invalid number of eigenvectors."); + RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); RAFT_EXPECT(tol > 0, "Invalid tolerance."); RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); @@ -1366,34 +1200,22 @@ int computeLargestEigenvectors(handle_t handle, std::vector beta_host_v(restartIter); ValueType_ *alpha_host = alpha_host_v.data(); - ValueType_ *beta_host = beta_host_v.data(); + ValueType_ *beta_host = beta_host_v.data(); //TODO: replace and fix allocation via RAFT handle vector_t lanczosVecs_dev(handle, n * (restartIter + 1), stream); - vector_t work_dev(handle, (n + restartIter) * restartIter, stream); + vector_t work_dev(handle, (n + restartIter) * restartIter, + stream); // Perform Lanczos method IndexType_ effIter; - int status = computeLargestEigenvectors(handle, - &A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - &effIter, - &iter, - alpha_host, - beta_host, - lanczosVecs_dev.raw(), - work_dev.raw(), - eigVals_dev, - eigVecs_dev, - seed); + int status = computeLargestEigenvectors( + handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, + &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), + eigVals_dev, eigVecs_dev, seed); // Clean up and return return status; } - } // namespace raft From e0d6cf674d375bf1ff097bc0d9c5c33ecac7177c Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 10 Jun 2020 15:24:13 -0500 Subject: [PATCH 084/189] Kmeans updates. --- cpp/include/raft/spectral/kmeans.hpp | 198 ++++++++++++++------------- 1 file changed, 100 insertions(+), 98 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 691df3e5ce..69ebbada91 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
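The kmeans.hpp changes below follow the same conventions as the Lanczos patch before it: nvgraph headers and the nvgraph namespace give way to raft, each helper gains a leading handle_t parameter, thrust calls are fully qualified, the RNG seed becomes a parameter (defaulting to 123456) rather than a hard-coded literal, and every asynchronous memset, copy, and kernel launch is ordered on the handle's stream. The recurring stream-explicit launch idiom, shown in isolation (a minimal sketch, not part of the patch; computeDistances and the dim3 launch dimensions are the ones declared later in this file, and the buffers stand in for real device arguments):

    // Sketch only, not part of the patch: the stream-ordered distance
    // computation pattern adopted throughout this file. computeDistances
    // and the dim3 launch dimensions are the ones declared below;
    // dists/obs/centroids stand in for real device buffers.
    auto cublas_h = handle.get_cublas_handle();
    auto stream = handle.get_stream();
    CHECK_CUDA(cudaMemsetAsync(dists, 0, n * k * sizeof(ValueType_), stream));
    computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, k, obs, centroids, dists);
    cudaCheckError();

Passing the stream explicitly, rather than launching on the default stream as the nvgraph version did, keeps these helpers consistent with the rest of the handle-based API.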
@@ -14,14 +14,9 @@ * limitations under the License. */ -//#ifdef NVGRAPH_PARTITION -//#ifdef DEBUG - -#include "include/kmeans.hxx" - -#include -#include -#include +#include +#include +#include #include #include @@ -32,13 +27,10 @@ #include #include -#include "include/atomics.hxx" -#include "include/debug_macros.h" -#include "include/nvgraph_cublas.hxx" -#include "include/nvgraph_vector.hxx" -#include "include/sm_utils.h" - -using namespace nvgraph; +#include +#include +#include +#include // ========================================================= // Useful macros @@ -342,7 +334,8 @@ static __global__ void divideCentroids(IndexType_ d, * @return Zero if successful. Otherwise non-zero. */ template -static int chooseNewCentroid(IndexType_ n, +static int chooseNewCentroid(handle_t handle, + IndexType_ n, IndexType_ d, IndexType_ k, ValueType_ rand, @@ -359,9 +352,12 @@ static int chooseNewCentroid(IndexType_ n, // Observation vector that is chosen as new centroid IndexType_ obsIndex; + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + // Compute cumulative sum of distances - inclusive_scan( - device_pointer_cast(dists), device_pointer_cast(dists + n), device_pointer_cast(distsCumSum)); + thrust::inclusive_scan( + thrust::device_pointer_cast(dists),thrust::device_pointer_cast(dists + n),thrust::device_pointer_cast(distsCumSum)); cudaCheckError(); CHECK_CUDA( cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(ValueType_), cudaMemcpyDeviceToHost)); @@ -370,16 +366,16 @@ static int chooseNewCentroid(IndexType_ n, // Probabilities are proportional to square of distance to closest // centroid (see k-means++ algorithm) obsIndex = - (lower_bound( - device_pointer_cast(distsCumSum), device_pointer_cast(distsCumSum + n), distsSum * rand) - - device_pointer_cast(distsCumSum)); + (thrust::lower_bound( + thrust::device_pointer_cast(distsCumSum),thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - + thrust::device_pointer_cast(distsCumSum)); cudaCheckError(); obsIndex = max(obsIndex, 0); obsIndex = min(obsIndex, n - 1); // Record new centroid position CHECK_CUDA(cudaMemcpyAsync( - centroid, obs + IDX(0, obsIndex, d), d * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + centroid, obs + IDX(0, obsIndex, d), d * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); return 0; } @@ -406,14 +402,16 @@ static int chooseNewCentroid(IndexType_ n, * @return Zero if successful. Otherwise non-zero. 
*/ template -static int initializeCentroids(IndexType_ n, +static int initializeCentroids(handle_t handle, + IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, ValueType_* __restrict__ centroids, IndexType_* __restrict__ codes, IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ dists) + ValueType_* __restrict__ dists, + unsigned long long seed = 123456) { // ------------------------------------------------------- // Variable declarations @@ -426,9 +424,12 @@ static int initializeCentroids(IndexType_ n, dim3 blockDim_warp, gridDim_warp, gridDim_block; // Random number generator - thrust::default_random_engine rng(123456); + thrust::default_random_engine rng(seed); thrust::uniform_real_distribution uniformDist(0, 1); + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + // ------------------------------------------------------- // Implementation // ------------------------------------------------------- @@ -445,40 +446,40 @@ static int initializeCentroids(IndexType_ n, gridDim_block.z = 1; // Assign observation vectors to code 0 - CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_))); + CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); // Choose first centroid thrust::fill(thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); cudaCheckError(); - if (chooseNewCentroid(n, d, k, uniformDist(rng), obs, dists, centroids)) + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid - CHECK_CUDA(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_))); - computeDistances<<>>(n, d, 1, obs, centroids, dists); + CHECK_CUDA(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_), stream)); + computeDistances<<>>(n, d, 1, obs, centroids, dists); cudaCheckError() // Choose remaining centroids for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(n, d, k, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid - CHECK_CUDA(cudaMemsetAsync(dists + n, 0, n * sizeof(ValueType_))); - computeDistances<<>>( + CHECK_CUDA(cudaMemsetAsync(dists + n, 0, n * sizeof(ValueType_), stream)); + computeDistances<<>>( n, d, 1, obs, centroids + IDX(0, i, d), dists + n); cudaCheckError(); // Recompute minimum distances - minDistances2<<>>(n, dists, dists + n, codes, i); + minDistances2<<>>(n, dists, dists + n, codes, i); cudaCheckError(); } // Compute cluster sizes - CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_))); - computeClusterSizes<<>>(n, k, codes, clusterSizes); + CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); + computeClusterSizes<<>>(n, k, codes, clusterSizes); cudaCheckError(); return 0; @@ -508,7 +509,8 @@ static int initializeCentroids(IndexType_ n, * @return Zero if successful. Otherwise non-zero. 
*/ template -static int assignCentroids(IndexType_ n, +static int assignCentroids(handle_t handle, + IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, @@ -521,26 +523,29 @@ static int assignCentroids(IndexType_ n, // CUDA grid dimensions dim3 blockDim, gridDim; + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + // Compute distance between centroids and observation vectors - CHECK_CUDA(cudaMemsetAsync(dists, 0, n * k * sizeof(ValueType_))); + CHECK_CUDA(cudaMemsetAsync(dists, 0, n * k * sizeof(ValueType_), stream)); blockDim.x = WARP_SIZE; blockDim.y = 1; blockDim.z = BLOCK_SIZE / WARP_SIZE; gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); gridDim.y = min(k, 65535); gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); - computeDistances<<>>(n, d, k, obs, centroids, dists); + computeDistances<<>>(n, d, k, obs, centroids, dists); cudaCheckError(); // Find centroid closest to each observation vector - CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_))); + CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); blockDim.x = BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); gridDim.y = 1; gridDim.z = 1; - minDistances<<>>(n, k, dists, codes, clusterSizes); + minDistances<<>>(n, k, dists, codes, clusterSizes); cudaCheckError(); // Compute residual sum of squares @@ -572,7 +577,8 @@ static int assignCentroids(IndexType_ n, * @return Zero if successful. Otherwise non-zero. */ template -static int updateCentroids(IndexType_ n, +static int updateCentroids(handle_t handle, + IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, @@ -582,7 +588,6 @@ static int updateCentroids(IndexType_ n, ValueType_* __restrict__ work, IndexType_* __restrict__ work_int) { - using namespace thrust; // ------------------------------------------------------- // Variable declarations @@ -592,40 +597,55 @@ static int updateCentroids(IndexType_ n, const ValueType_ one = 1; const ValueType_ zero = 0; + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + // CUDA grid dimensions dim3 blockDim, gridDim; // Device memory - device_ptr obs_copy(work); - device_ptr codes_copy(work_int); - device_ptr rows(work_int + d * n); + thrust::device_ptr obs_copy(work); + thrust::device_ptr codes_copy(work_int); + thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix - Cublas::geam( - true, false, n, d, &one, obs, d, &zero, (ValueType_*)NULL, n, raw_pointer_cast(obs_copy), n); + CUBLAS_CHECK(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + n, + d, + &one, + obs, + d, + &zero, + (ValueType_*)NULL, + n, + thrust::raw_pointer_cast(obs_copy), + n, + stream)); // Cluster assigned to each observation matrix entry - sequence(rows, rows + d * n); + thrust::sequence(rows, rows + d * n); cudaCheckError(); - transform(rows, rows + d * n, make_constant_iterator(n), rows, modulus()); + thrust::transform(rows, rows + d * n, make_constant_iterator(n), rows, modulus()); cudaCheckError(); - gather(rows, rows + d * n, device_pointer_cast(codes), codes_copy); + thrust::gather(rows, rows + d * n,thrust::device_pointer_cast(codes), codes_copy); cudaCheckError(); // Row associated with each observation matrix entry - sequence(rows, rows + d * n); + thrust::sequence(rows, rows + d * n); cudaCheckError(); - transform(rows, rows + d * n, make_constant_iterator(n), rows, divides()); + thrust::transform(rows, 
rows + d * n, make_constant_iterator(n), rows, divides()); cudaCheckError(); // Sort and reduce to add observation vectors in same cluster - stable_sort_by_key(codes_copy, codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); + thrust::stable_sort_by_key(codes_copy, codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); cudaCheckError(); - reduce_by_key(rows, + thrust::reduce_by_key(rows, rows + d * n, obs_copy, codes_copy, // Output to codes_copy is ignored - device_pointer_cast(centroids)); + thrust::device_pointer_cast(centroids)); cudaCheckError(); // Divide sums by cluster size to get centroid matrix @@ -635,7 +655,7 @@ static int updateCentroids(IndexType_ n, gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); gridDim.y = min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); gridDim.z = 1; - divideCentroids<<>>(d, k, clusterSizes, centroids); + divideCentroids<<>>(d, k, clusterSizes, centroids); cudaCheckError(); return 0; @@ -643,7 +663,7 @@ static int updateCentroids(IndexType_ n, } // namespace -namespace nvgraph { +namespace raft { // ========================================================= // k-means algorithm @@ -682,7 +702,8 @@ namespace nvgraph { * @return NVGRAPH error flag. */ template -NVGRAPH_ERROR kmeans(IndexType_ n, +NVGRAPH_ERROR kmeans(handle_t handle, + IndexType_ n, IndexType_ d, IndexType_ k, ValueType_ tol, @@ -694,7 +715,8 @@ NVGRAPH_ERROR kmeans(IndexType_ n, ValueType_* __restrict__ work, IndexType_* __restrict__ work_int, ValueType_* residual_host, - IndexType_* iters_host) + IndexType_* iters_host, + unsigned long long seed = 123456) { // ------------------------------------------------------- // Variable declarations @@ -707,7 +729,7 @@ NVGRAPH_ERROR kmeans(IndexType_ n, ValueType_ residualPrev = 0; // Random number generator - thrust::default_random_engine rng(123456); + thrust::default_random_engine rng(seed); thrust::uniform_real_distribution uniformDist(0, 1); // ------------------------------------------------------- @@ -736,11 +758,14 @@ NVGRAPH_ERROR kmeans(IndexType_ n, return NVGRAPH_ERR_BAD_PARAMETERS; } + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + // Trivial cases if (k == 1) { - CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_))); - CHECK_CUDA(cudaMemcpyAsync(clusterSizes, &n, sizeof(IndexType_), cudaMemcpyHostToDevice)); - if (updateCentroids(n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); + CHECK_CUDA(cudaMemcpyAsync(clusterSizes, &n, sizeof(IndexType_), cudaMemcpyHostToDevice, stream)); + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not compute k-means centroids"); dim3 blockDim, gridDim; blockDim.x = WARP_SIZE; @@ -749,8 +774,8 @@ NVGRAPH_ERROR kmeans(IndexType_ n, gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); gridDim.y = 1; gridDim.z = min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535); - CHECK_CUDA(cudaMemsetAsync(work, 0, n * k * sizeof(ValueType_))); - computeDistances<<>>(n, d, 1, obs, centroids, work); + CHECK_CUDA(cudaMemsetAsync(work, 0, n * k * sizeof(ValueType_), stream)); + computeDistances<<>>(n, d, 1, obs, centroids, work); cudaCheckError(); *residual_host = thrust::reduce(thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); @@ -763,15 +788,16 @@ NVGRAPH_ERROR kmeans(IndexType_ n, thrust::fill_n(thrust::device_pointer_cast(clusterSizes), n, 1); 
cudaCheckError(); - if (n < k) CHECK_CUDA(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(IndexType_))); + if (n < k) CHECK_CUDA(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(IndexType_), stream)); CHECK_CUDA( - cudaMemcpyAsync(centroids, obs, d * n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + cudaMemcpyAsync(centroids, obs, d * n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); *residual_host = 0; return NVGRAPH_OK; } // Initialize cuBLAS - Cublas::set_pointer_mode_host(); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, + stream)); // ????? TODO: check / remove // ------------------------------------------------------- // k-means++ algorithm @@ -784,7 +810,7 @@ NVGRAPH_ERROR kmeans(IndexType_ n, // Apply k-means iteration until convergence for (iter = 0; iter < maxiter; ++iter) { // Update cluster centroids - if (updateCentroids(n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not update k-means centroids"); // Determine centroid closest to each observation @@ -815,17 +841,17 @@ NVGRAPH_ERROR kmeans(IndexType_ n, } // Check for convergence - if (fabs(residualPrev - (*residual_host)) / n < tol) { + if (std::fabs(residualPrev - (*residual_host)) / n < tol) { ++iter; break; } } // Warning if k-means has failed to converge - if (fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); + if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); *iters_host = iter; - return NVGRAPH_OK; + return 0; } /// Find clusters with k-means algorithm @@ -908,28 +934,4 @@ NVGRAPH_ERROR kmeans(IndexType_ n, &iters); } -// ========================================================= -// Explicit instantiations -// ========================================================= - -template NVGRAPH_ERROR kmeans(int n, - int d, - int k, - float tol, - int maxiter, - const float* __restrict__ obs, - int* __restrict__ codes, - float& residual, - int& iters); -template NVGRAPH_ERROR kmeans(int n, - int d, - int k, - double tol, - int maxiter, - const double* __restrict__ obs, - int* __restrict__ codes, - double& residual, - int& iters); -} // namespace nvgraph -//#endif //NVGRAPH_PARTITION -//#endif //debug +} // namespace raft From 36561255fbcc3270e511cddcb6bc35b60021165d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 10 Jun 2020 16:45:35 -0400 Subject: [PATCH 085/189] update raft error classes to inherit raft::exception (instead of std::exception) --- cpp/include/raft/error.hpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 42fdbc9897..89c836f2c8 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -106,50 +106,50 @@ class exception : public std::exception { * RAFT_EXPECTS, RAFT_FAIL, CUML_EXPECTS, CUML_FAIL, CUGRAPH_EXPECTS, CUGRAPH_FAIL macros. * */ -struct logic_error : public std::logic_error { - explicit logic_error(char const* const message) : std::logic_error(message) {} +struct logic_error : public raft::exception { + explicit logic_error(char const* const message) : raft::exception(message) {} explicit logic_error(std::string const& message) - : std::logic_error(message) {} + : raft::exception(message) {} }; /** * @brief Exception thrown when a CUDA error is encountered. 
*/ -struct cuda_error : public std::runtime_error { +struct cuda_error : public raft::exception { explicit cuda_error(char const* const message) - : std::runtime_error(message) {} + : raft::exception(message) {} explicit cuda_error(std::string const& message) - : std::runtime_error(message) {} + : raft::exception(message) {} }; /** * @brief Exception thrown when a cuRAND error is encountered. */ -struct curand_error : public std::runtime_error { +struct curand_error : public raft::exception { explicit curand_error(char const* const message) - : std::runtime_error(message) {} + : raft::exception(message) {} explicit curand_error(std::string const& message) - : std::runtime_error(message) {} + : raft::exception(message) {} }; /** * @brief Exception thrown when a cuSparse error is encountered. */ -struct cusparse_error : public std::runtime_error { +struct cusparse_error : public raft::exception { explicit cusparse_error(char const* const message) - : std::runtime_error(message) {} + : raft::exception(message) {} explicit cusparse_error(std::string const& message) - : std::runtime_error(message) {} + : raft::exception(message) {} }; /** * @brief Exception thrown when a NCCL error is encountered. */ -struct nccl_error : public std::runtime_error { +struct nccl_error : public raft::exception { explicit nccl_error(char const* const message) - : std::runtime_error(message) {} + : raft::exception(message) {} explicit nccl_error(std::string const& message) - : std::runtime_error(message) {} + : raft::exception(message) {} }; } // namespace raft From 0e62ceab09105258897ce2d5562fe2b7d992d9bb Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 10 Jun 2020 16:55:14 -0400 Subject: [PATCH 086/189] move macros out from the raft namespace --- cpp/include/raft/cudart_utils.h | 4 +-- cpp/include/raft/error.hpp | 38 ++++++++++----------- cpp/include/raft/linalg/cublas_wrappers.h | 6 ++-- cpp/include/raft/linalg/cusolver_wrappers.h | 6 ++-- cpp/include/raft/sparse/cusparse_wrappers.h | 6 ++-- 5 files changed, 30 insertions(+), 30 deletions(-) diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 5ae4bcbac2..2eea710897 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -26,8 +26,6 @@ ///@todo: enable once logging has been enabled in raft //#include "logger.hpp" -namespace raft { - /** check for cuda runtime API errors and assert accordingly */ #define CUDA_CHECK(call) \ do { \ @@ -50,6 +48,8 @@ namespace raft { } \ } while (0) +namespace raft { + /** helper method to get max usable shared mem per block parameter */ inline int get_shared_memory_per_block() { int dev_id; diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 89c836f2c8..2a10854918 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -80,25 +80,6 @@ class exception : public std::exception { } }; -/** macro to throw a runtime error */ -#define THROW(fmt, ...) \ - do { \ - std::string msg; \ - char errMsg[2048]; /* NOLINT */ \ - std::snprintf(errMsg, sizeof(errMsg), \ - "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ - msg += errMsg; \ - std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ - msg += errMsg; \ - throw raft::exception(msg); \ - } while (0) - -/** macro to check for a conditional and assert on failure */ -#define ASSERT(check, fmt, ...) \ - do { \ - if (!(check)) THROW(fmt, ##__VA_ARGS__); \ - } while (0) - /** * @brief Exception thrown when logical precondition is violated. 
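
The macros being relocated in this commit behave identically wherever they are defined, since the preprocessor expands them with no notion of namespaces; what matters is that their bodies spell out fully qualified names such as raft::exception. A small sketch of why the move is safe (hypothetical user code, assuming the THROW/ASSERT definitions shown in this patch):

    namespace user_code {
    void foo(bool ok) {
      // Expands to a `throw raft::exception(msg);` statement. The explicit
      // raft:: qualification inside the macro body is what makes this valid
      // at any call site; the macro's own lexical position never mattered.
      ASSERT(ok, "foo() precondition failed");
    }
    }  // namespace user_code
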
* @@ -154,6 +135,25 @@ struct nccl_error : public raft::exception { } // namespace raft +/** macro to throw a runtime error */ +#define THROW(fmt, ...) \ + do { \ + std::string msg; \ + char errMsg[2048]; /* NOLINT */ \ + std::snprintf(errMsg, sizeof(errMsg), \ + "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ + msg += errMsg; \ + std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ + msg += errMsg; \ + throw raft::exception(msg); \ + } while (0) + +/** macro to check for a conditional and assert on failure */ +#define ASSERT(check, fmt, ...) \ + do { \ + if (!(check)) THROW(fmt, ##__VA_ARGS__); \ + } while (0) + #define STRINGIFY_DETAIL(x) #x #define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index cd8a508a84..170221a844 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -22,9 +22,6 @@ #include #include -namespace raft { -namespace linalg { - #define _CUBLAS_ERR_TO_STR(err) \ case err: \ return #err @@ -66,6 +63,9 @@ inline const char *cublas_error_to_string(cublasStatus_t err) { // } \ // } while (0) +namespace raft { +namespace linalg { + /** * @defgroup Axpy cublas ax+y operations * @{ diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h index 92ba1a2194..e5705ada5d 100644 --- a/cpp/include/raft/linalg/cusolver_wrappers.h +++ b/cpp/include/raft/linalg/cusolver_wrappers.h @@ -22,9 +22,6 @@ //#include #include -namespace raft { -namespace linalg { - #define _CUSOLVER_ERR_TO_STR(err) \ case err: \ return #err; @@ -66,6 +63,9 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) { // } \ // } while (0) +namespace raft { +namespace linalg { + /** * @defgroup Getrf cusolver getrf operations * @{ diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 1c63d2348b..ccb6622d5b 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -21,9 +21,6 @@ //#include #include -namespace raft { -namespace sparse { - #define _CUSPARSE_ERR_TO_STR(err) \ case err: \ return #err; @@ -67,6 +64,9 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { // } \ // } while (0) +namespace raft { +namespace sparse { + /** * @defgroup gthr cusparse gather methods * @{ From cca40eab5927d5662dda06f00f70281c0d88580a Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 10 Jun 2020 17:18:40 -0500 Subject: [PATCH 087/189] Update on kmeans and cleanup. --- cpp/include/raft/spectral/kmeans.hpp | 478 ++++---- cpp/include/raft/spectral/lanczos.hpp | 10 +- cpp/include/raft/spectral/matrix_wrappers.hpp | 191 ++- cpp/include/raft/spectral/spectral_matrix.hpp | 1044 ----------------- 4 files changed, 287 insertions(+), 1436 deletions(-) delete mode 100644 cpp/include/raft/spectral/spectral_matrix.hpp diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 69ebbada91..a9b1c1f049 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -71,13 +71,9 @@ namespace { * initialized to zero. 
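
The kernel below accumulates the full n x k matrix of squared Euclidean distances. A serial host reference using the same column-major IDX layout (editorial sketch, not part of this patch) may make the indexing easier to follow:

    // dists[IDX(i, j, n)] = squared distance between obs(:, i) and centroids(:, j)
    // obs is d x n, centroids is d x k, both column-major; dists is n x k.
    for (int j = 0; j < k; ++j)
      for (int i = 0; i < n; ++i) {
        ValueType_ sq = 0;
        for (int r = 0; r < d; ++r) {
          ValueType_ diff = obs[IDX(r, i, d)] - centroids[IDX(r, j, d)];
          sq += diff * diff;
        }
        // the kernel reaches the same value by adding warp-level partial
        // sums into dists with atomicFPAdd, hence the zero-init requirement
        dists[IDX(i, j, n)] = sq;
      }
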
*/ template -static __global__ void computeDistances(IndexType_ n, - IndexType_ d, - IndexType_ k, - const ValueType_* __restrict__ obs, - const ValueType_* __restrict__ centroids, - ValueType_* __restrict__ dists) -{ +static __global__ void computeDistances( + IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, + const ValueType_* __restrict__ centroids, ValueType_* __restrict__ dists) { // Loop index IndexType_ i; @@ -115,7 +111,8 @@ static __global__ void computeDistances(IndexType_ n, dist_private += utils::shfl_down(dist_private, i, 2 * i); // Write result to global memory - if (threadIdx.x == 0) atomicFPAdd(dists + IDX(gidz, gidy, n), dist_private); + if (threadIdx.x == 0) + atomicFPAdd(dists + IDX(gidz, gidy, n), dist_private); // Move to another observation vector gidz += blockDim.z * gridDim.z; @@ -150,12 +147,10 @@ static __global__ void computeDistances(IndexType_ n, * cluster. Entries must be initialized to zero. */ template -static __global__ void minDistances(IndexType_ n, - IndexType_ k, +static __global__ void minDistances(IndexType_ n, IndexType_ k, ValueType_* __restrict__ dists, IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes) -{ + IndexType_* __restrict__ clusterSizes) { // Loop index IndexType_ i, j; @@ -174,8 +169,8 @@ static __global__ void minDistances(IndexType_ n, dist_min = dists[IDX(i, 0, n)]; for (j = 1; j < k; ++j) { dist_curr = dists[IDX(i, j, n)]; - code_min = (dist_curr < dist_min) ? j : code_min; - dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; + code_min = (dist_curr < dist_min) ? j : code_min; + dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; } // Transfer result to global memory @@ -212,8 +207,7 @@ static __global__ void minDistances2(IndexType_ n, ValueType_* __restrict__ dists_old, const ValueType_* __restrict__ dists_new, IndexType_* __restrict__ codes_old, - IndexType_ code_new) -{ + IndexType_ code_new) { // Loop index IndexType_ i; @@ -250,11 +244,9 @@ static __global__ void minDistances2(IndexType_ n, * cluster. Entries must be initialized to zero. */ template -static __global__ void computeClusterSizes(IndexType_ n, - IndexType_ k, - const IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes) -{ +static __global__ void computeClusterSizes( + IndexType_ n, IndexType_ k, const IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes) { IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { atomicAdd(clusterSizes + codes[i], 1); @@ -282,11 +274,9 @@ static __global__ void computeClusterSizes(IndexType_ n, * column is the mean position of a cluster). */ template -static __global__ void divideCentroids(IndexType_ d, - IndexType_ k, - const IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ centroids) -{ +static __global__ void divideCentroids( + IndexType_ d, IndexType_ k, const IndexType_* __restrict__ clusterSizes, + ValueType_* __restrict__ centroids) { // Global indices IndexType_ gidx, gidy; @@ -300,7 +290,7 @@ static __global__ void divideCentroids(IndexType_ d, clusterSize_private = clusterSizes[gidy]; // Add vector entries to centroid matrix - // Vector entris are determined by global x-index + // vector entris are determined by global x-index gidx = threadIdx.x + blockIdx.x * blockDim.x; while (gidx < d) { centroids[IDX(gidx, gidy, d)] /= clusterSize_private; @@ -333,16 +323,13 @@ static __global__ void divideCentroids(IndexType_ d, * coordinates. * @return Zero if successful. Otherwise non-zero. 
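
The selection rule documented above is inverse-CDF sampling: take a running sum of the per-point weights, then binary-search for rand times the total, which is exactly what the inclusive_scan / lower_bound pair in the body below implements. A host-side sketch of the same idea (editorial, std:: in place of thrust::):

    #include <algorithm>
    #include <numeric>
    #include <vector>

    // Pick index i with probability dists[i] / sum(dists), for rand in [0, 1).
    int sample_proportional(std::vector<double> const& dists, double rand) {
      std::vector<double> cumsum(dists.size());
      std::partial_sum(dists.begin(), dists.end(), cumsum.begin());
      auto it = std::lower_bound(cumsum.begin(), cumsum.end(),
                                 rand * cumsum.back());
      return static_cast<int>(it - cumsum.begin());  // in [0, n-1] for rand < 1
    }

Points far from every existing centroid carry large squared distances and are proportionally more likely to seed the next cluster, which is the whole point of k-means++.
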
*/ -template +template static int chooseNewCentroid(handle_t handle, - IndexType_ n, - IndexType_ d, - IndexType_ k, - ValueType_ rand, + ThrustExePolicy thrust_exec_policy, IndexType_ n, + IndexType_ d, IndexType_ k, ValueType_ rand, const ValueType_* __restrict__ obs, ValueType_* __restrict__ dists, - ValueType_* __restrict__ centroid) -{ + ValueType_* __restrict__ centroid) { using namespace thrust; // Cumulative sum of distances @@ -356,26 +343,28 @@ static int chooseNewCentroid(handle_t handle, auto stream = handle.get_stream(); // Compute cumulative sum of distances - thrust::inclusive_scan( - thrust::device_pointer_cast(dists),thrust::device_pointer_cast(dists + n),thrust::device_pointer_cast(distsCumSum)); - cudaCheckError(); - CHECK_CUDA( - cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(ValueType_), cudaMemcpyDeviceToHost)); + thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), + thrust::device_pointer_cast(distsCumSum)); + CUDA_CHECK_LAST(); + CUDA_TRY(cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(ValueType_), + cudaMemcpyDeviceToHost)); // Randomly choose observation vector // Probabilities are proportional to square of distance to closest // centroid (see k-means++ algorithm) - obsIndex = - (thrust::lower_bound( - thrust::device_pointer_cast(distsCumSum),thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - - thrust::device_pointer_cast(distsCumSum)); - cudaCheckError(); + obsIndex = (thrust::lower_bound( + thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), + thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - + thrust::device_pointer_cast(distsCumSum)); + CUDA_CHECK_LAST(); obsIndex = max(obsIndex, 0); obsIndex = min(obsIndex, n - 1); // Record new centroid position - CHECK_CUDA(cudaMemcpyAsync( - centroid, obs + IDX(0, obsIndex, d), d * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(centroid, obs + IDX(0, obsIndex, d), + d * sizeof(ValueType_), cudaMemcpyDeviceToDevice, + stream)); return 0; } @@ -401,18 +390,13 @@ static int chooseNewCentroid(handle_t handle, * distance between observation vectors and the closest centroid. * @return Zero if successful. Otherwise non-zero. 
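
Note that every helper in this file now threads a ThrustExePolicy argument instead of relying on thrust's default stream. A hypothetical call site, assuming the standard thrust::cuda::par.on() idiom for binding thrust algorithms to the handle's stream:

    #include <thrust/execution_policy.h>

    cudaStream_t stream = handle.get_stream();
    auto policy = thrust::cuda::par.on(stream);  // stream-bound execution policy
    initializeCentroids(handle, policy, n, d, k, obs, centroids, codes,
                        clusterSizes, dists, seed);
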
*/ -template -static int initializeCentroids(handle_t handle, - IndexType_ n, - IndexType_ d, - IndexType_ k, - const ValueType_* __restrict__ obs, - ValueType_* __restrict__ centroids, - IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ dists, - unsigned long long seed = 123456) -{ +template +static int initializeCentroids( + handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, + IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, + ValueType_* __restrict__ centroids, IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes, ValueType_* __restrict__ dists, + unsigned long long seed) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -438,49 +422,54 @@ static int initializeCentroids(handle_t handle, blockDim_warp.x = WARP_SIZE; blockDim_warp.y = 1; blockDim_warp.z = BSIZE_DIV_WSIZE; - gridDim_warp.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim_warp.y = 1; - gridDim_warp.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); + gridDim_warp.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim_warp.y = 1; + gridDim_warp.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); gridDim_block.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); gridDim_block.y = 1; gridDim_block.z = 1; // Assign observation vectors to code 0 - CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); + CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); // Choose first centroid - thrust::fill(thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); - cudaCheckError(); - if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, centroids)) + thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), 1); + CUDA_CHECK_LAST(); + if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), + obs, dists, centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid - CHECK_CUDA(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, dists); + CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_), stream)); + computeDistances<<>>( + n, d, 1, obs, centroids, dists); cudaCheckError() // Choose remaining centroids - for (i = 1; i < k; ++i) - { + for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) + if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), + obs, dists, centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid - CHECK_CUDA(cudaMemsetAsync(dists + n, 0, n * sizeof(ValueType_), stream)); + CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(ValueType_), stream)); computeDistances<<>>( n, d, 1, obs, centroids + IDX(0, i, d), dists + n); - cudaCheckError(); + CUDA_CHECK_LAST(); // Recompute minimum distances - minDistances2<<>>(n, dists, dists + n, codes, i); - cudaCheckError(); + minDistances2<<>>(n, dists, dists + n, + codes, i); + CUDA_CHECK_LAST(); } // Compute cluster sizes - CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); - computeClusterSizes<<>>(n, k, codes, clusterSizes); - cudaCheckError(); + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * 
sizeof(IndexType_), stream)); + computeClusterSizes<<>>(n, k, codes, + clusterSizes); + CUDA_CHECK_LAST(); return 0; } @@ -508,18 +497,15 @@ static int initializeCentroids(handle_t handle, * of squares of assignment. * @return Zero if successful. Otherwise non-zero. */ -template -static int assignCentroids(handle_t handle, - IndexType_ n, - IndexType_ d, - IndexType_ k, +template +static int assignCentroids(handle_t handle, ThrustExePolicy thrust_exec_policy, + IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, const ValueType_* __restrict__ centroids, ValueType_* __restrict__ dists, IndexType_* __restrict__ codes, IndexType_* __restrict__ clusterSizes, - ValueType_* residual_host) -{ + ValueType_* residual_host) { // CUDA grid dimensions dim3 blockDim, gridDim; @@ -527,30 +513,33 @@ static int assignCentroids(handle_t handle, auto stream = handle.get_stream(); // Compute distance between centroids and observation vectors - CHECK_CUDA(cudaMemsetAsync(dists, 0, n * k * sizeof(ValueType_), stream)); + CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(ValueType_), stream)); blockDim.x = WARP_SIZE; blockDim.y = 1; blockDim.z = BLOCK_SIZE / WARP_SIZE; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim.y = min(k, 65535); - gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); - computeDistances<<>>(n, d, k, obs, centroids, dists); - cudaCheckError(); + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim.y = min(k, 65535); + gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); + computeDistances<<>>(n, d, k, obs, centroids, + dists); + CUDA_CHECK_LAST(); // Find centroid closest to each observation vector - CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); blockDim.x = BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; - gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - gridDim.y = 1; - gridDim.z = 1; - minDistances<<>>(n, k, dists, codes, clusterSizes); - cudaCheckError(); + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); + gridDim.y = 1; + gridDim.z = 1; + minDistances<<>>(n, k, dists, codes, + clusterSizes); + CUDA_CHECK_LAST(); // Compute residual sum of squares *residual_host = - thrust::reduce(thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); + thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n)); return 0; } @@ -576,25 +565,21 @@ static int assignCentroids(handle_t handle, * Workspace. * @return Zero if successful. Otherwise non-zero. 
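
The update below is the classic sort-then-segmented-reduce pattern: tag every matrix entry with its cluster code, stable-sort by code, and let reduce_by_key sum each run. A stripped-down, self-contained sketch of the pattern (editorial; one value per observation instead of d):

    #include <thrust/device_vector.h>
    #include <thrust/sort.h>
    #include <thrust/reduce.h>

    int main() {
      int   h_keys[] = {1, 0, 1, 0, 2};            // cluster of each observation
      float h_vals[] = {2.f, 1.f, 4.f, 3.f, 5.f};  // its contribution
      thrust::device_vector<int>   keys(h_keys, h_keys + 5);
      thrust::device_vector<float> vals(h_vals, h_vals + 5);
      thrust::device_vector<int>   out_keys(3);
      thrust::device_vector<float> sums(3);

      thrust::stable_sort_by_key(keys.begin(), keys.end(), vals.begin());
      thrust::reduce_by_key(keys.begin(), keys.end(), vals.begin(),
                            out_keys.begin(), sums.begin());
      // sums = {4, 6, 5}; dividing by the cluster sizes yields the centroids,
      // which is what divideCentroids then does per coordinate.
      return 0;
    }
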
*/ -template -static int updateCentroids(handle_t handle, - IndexType_ n, - IndexType_ d, - IndexType_ k, +template +static int updateCentroids(handle_t handle, ThrustExePolicy thrust_exec_policy, + IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, const IndexType_* __restrict__ codes, const IndexType_* __restrict__ clusterSizes, ValueType_* __restrict__ centroids, ValueType_* __restrict__ work, - IndexType_* __restrict__ work_int) -{ - + IndexType_* __restrict__ work_int) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- // Useful constants - const ValueType_ one = 1; + const ValueType_ one = 1; const ValueType_ zero = 0; auto cublas_h = handle.get_cublas_handle(); @@ -609,54 +594,48 @@ static int updateCentroids(handle_t handle, thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix - CUBLAS_CHECK(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - d, - &one, - obs, - d, - &zero, - (ValueType_*)NULL, - n, - thrust::raw_pointer_cast(obs_copy), - n, - stream)); + CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, n, d, &one, obs, + d, &zero, (ValueType_*)NULL, n, + thrust::raw_pointer_cast(obs_copy), n, stream)); // Cluster assigned to each observation matrix entry - thrust::sequence(rows, rows + d * n); - cudaCheckError(); - thrust::transform(rows, rows + d * n, make_constant_iterator(n), rows, modulus()); - cudaCheckError(); - thrust::gather(rows, rows + d * n,thrust::device_pointer_cast(codes), codes_copy); - cudaCheckError(); + thrust::sequence(thrust_exec_policy, rows, rows + d * n); + CUDA_CHECK_LAST(); + thrust::transform(thrust_exec_policy, rows, rows + d * n, + make_constant_iterator(n), rows, + modulus()); + CUDA_CHECK_LAST(); + thrust::gather(thrust_exec_policy, rows, rows + d * n, + thrust::device_pointer_cast(codes), codes_copy); + CUDA_CHECK_LAST(); // Row associated with each observation matrix entry - thrust::sequence(rows, rows + d * n); - cudaCheckError(); - thrust::transform(rows, rows + d * n, make_constant_iterator(n), rows, divides()); - cudaCheckError(); + thrust::sequence(thrust_exec_policy, rows, rows + d * n); + CUDA_CHECK_LAST(); + thrust::transform(thrust_exec_policy, rows, rows + d * n, + make_constant_iterator(n), rows, + divides()); + CUDA_CHECK_LAST(); // Sort and reduce to add observation vectors in same cluster - thrust::stable_sort_by_key(codes_copy, codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); - cudaCheckError(); - thrust::reduce_by_key(rows, - rows + d * n, - obs_copy, - codes_copy, // Output to codes_copy is ignored - thrust::device_pointer_cast(centroids)); - cudaCheckError(); + thrust::stable_sort_by_key(thrust_exec_policy, codes_copy, codes_copy + d * n, + make_zip_iterator(make_tuple(obs_copy, rows))); + CUDA_CHECK_LAST(); + thrust::reduce_by_key(thrust_exec_policy, rows, rows + d * n, obs_copy, + codes_copy, // Output to codes_copy is ignored + thrust::device_pointer_cast(centroids)); + CUDA_CHECK_LAST(); // Divide sums by cluster size to get centroid matrix blockDim.x = WARP_SIZE; blockDim.y = BLOCK_SIZE / WARP_SIZE; blockDim.z = 1; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim.y = min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); - gridDim.z = 1; - divideCentroids<<>>(d, k, clusterSizes, centroids); - cudaCheckError(); + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim.y = min((k + BSIZE_DIV_WSIZE - 1) / 
BSIZE_DIV_WSIZE, 65535); + gridDim.z = 1; + divideCentroids<<>>(d, k, clusterSizes, + centroids); + CUDA_CHECK_LAST(); return 0; } @@ -699,25 +678,16 @@ namespace raft { * vectors and centroids). * @param iters_host (Output, host memory, 1 entry) Number of * k-means iterations. - * @return NVGRAPH error flag. + * @return error flag. */ -template -NVGRAPH_ERROR kmeans(handle_t handle, - IndexType_ n, - IndexType_ d, - IndexType_ k, - ValueType_ tol, - IndexType_ maxiter, - const ValueType_* __restrict__ obs, - IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ centroids, - ValueType_* __restrict__ work, - IndexType_* __restrict__ work_int, - ValueType_* residual_host, - IndexType_* iters_host, - unsigned long long seed = 123456) -{ +template +int kmeans(handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, + IndexType_ d, IndexType_ k, ValueType_ tol, IndexType_ maxiter, + const ValueType_* __restrict__ obs, IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes, + ValueType_* __restrict__ centroids, ValueType_* __restrict__ work, + IndexType_* __restrict__ work_int, ValueType_* residual_host, + IndexType_* iters_host, unsigned long long seed) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -736,63 +706,50 @@ NVGRAPH_ERROR kmeans(handle_t handle, // Initialization // ------------------------------------------------------- - // Check that parameters are valid - if (n < 1) { - WARNING("invalid parameter (n<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (d < 1) { - WARNING("invalid parameter (d<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (k < 1) { - WARNING("invalid parameter (k<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (tol < 0) { - WARNING("invalid parameter (tol<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (maxiter < 0) { - WARNING("invalid parameter (maxiter<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); // Trivial cases if (k == 1) { - CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); - CHECK_CUDA(cudaMemcpyAsync(clusterSizes, &n, sizeof(IndexType_), cudaMemcpyHostToDevice, stream)); - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); + CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(IndexType_), + cudaMemcpyHostToDevice, stream)); + if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, + clusterSizes, centroids, work, work_int)) WARNING("could not compute k-means centroids"); dim3 blockDim, gridDim; blockDim.x = WARP_SIZE; blockDim.y = 1; blockDim.z = BLOCK_SIZE / WARP_SIZE; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim.y = 1; - gridDim.z = min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535); - CHECK_CUDA(cudaMemsetAsync(work, 0, n * k * sizeof(ValueType_), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, work); - cudaCheckError(); + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim.y = 1; + gridDim.z = + min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535); + CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(ValueType_), stream)); + computeDistances<<>>(n, d, 1, obs, centroids, + work); + CUDA_CHECK_LAST(); *residual_host = - 
thrust::reduce(thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); - cudaCheckError(); - return NVGRAPH_OK; + thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(work), + thrust::device_pointer_cast(work + n)); + CUDA_CHECK_LAST(); + return 0; } if (n <= k) { - thrust::sequence(thrust::device_pointer_cast(codes), thrust::device_pointer_cast(codes + n)); - cudaCheckError(); - thrust::fill_n(thrust::device_pointer_cast(clusterSizes), n, 1); - cudaCheckError(); - - if (n < k) CHECK_CUDA(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(IndexType_), stream)); - CHECK_CUDA( - cudaMemcpyAsync(centroids, obs, d * n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + thrust::sequence(thrust_exec_policy, thrust::device_pointer_cast(codes), + thrust::device_pointer_cast(codes + n)); + CUDA_CHECK_LAST(); + thrust::fill_n(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), n, 1); + CUDA_CHECK_LAST(); + + if (n < k) + CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, + (k - n) * sizeof(IndexType_), stream)); + CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(ValueType_), + cudaMemcpyDeviceToDevice, stream)); *residual_host = 0; - return NVGRAPH_OK; + return 0; } // Initialize cuBLAS @@ -804,40 +761,47 @@ NVGRAPH_ERROR kmeans(handle_t handle, // ------------------------------------------------------- // Choose initial cluster centroids - if (initializeCentroids(n, d, k, obs, centroids, codes, clusterSizes, work)) + if (initializeCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, + codes, clusterSizes, work, seed)) WARNING("could not initialize k-means centroids"); // Apply k-means iteration until convergence for (iter = 0; iter < maxiter; ++iter) { // Update cluster centroids - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, + clusterSizes, centroids, work, work_int)) WARNING("could not update k-means centroids"); // Determine centroid closest to each observation residualPrev = *residual_host; - if (assignCentroids(n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, + work, codes, clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); // Reinitialize empty clusters with new centroids - IndexType_ emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); + IndexType_ emptyCentroid = + (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), 0) - + thrust::device_pointer_cast(clusterSizes)); // FIXME: emptyCentroid never reaches k (infinite loop) under certain // conditions, such as if obs is corrupt (as seen as a result of a // DataFrame column of NULL edge vals used to create the Graph) while (emptyCentroid < k) { - if (chooseNewCentroid( - n, d, k, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) + if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, + uniformDist(rng), obs, work, + centroids + IDX(0, emptyCentroid, d))) WARNING("could not replace empty centroid"); - if (assignCentroids(n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, + work, codes, clusterSizes, residual_host)) 
WARNING("could not assign observation vectors to k-means clusters"); - emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); - cudaCheckError(); + emptyCentroid = + (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), 0) - + thrust::device_pointer_cast(clusterSizes)); + CUDA_CHECK_LAST(); } // Check for convergence @@ -848,7 +812,8 @@ NVGRAPH_ERROR kmeans(handle_t handle, } // Warning if k-means has failed to converge - if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); + if (std::fabs(residualPrev - (*residual_host)) / n >= tol) + WARNING("k-means failed to converge"); *iters_host = iter; return 0; @@ -875,63 +840,34 @@ NVGRAPH_ERROR kmeans(handle_t handle, * @param residual On exit, residual sum of squares (sum of squares * of distances between observation vectors and centroids). * @param On exit, number of k-means iterations. - * @return NVGRAPH error flag + * @return error flag */ -template -NVGRAPH_ERROR kmeans(IndexType_ n, - IndexType_ d, - IndexType_ k, - ValueType_ tol, - IndexType_ maxiter, - const ValueType_* __restrict__ obs, - IndexType_* __restrict__ codes, - ValueType_& residual, - IndexType_& iters) -{ +template +int kmeans(handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, + IndexType_ d, IndexType_ k, ValueType_ tol, IndexType_ maxiter, + const ValueType_* __restrict__ obs, IndexType_* __restrict__ codes, + ValueType_& residual, IndexType_& iters, + unsigned long long seed = 123456) { + using namespace matrix; + // Check that parameters are valid - if (n < 1) { - WARNING("invalid parameter (n<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (d < 1) { - WARNING("invalid parameter (d<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (k < 1) { - WARNING("invalid parameter (k<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (tol < 0) { - WARNING("invalid parameter (tol<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (maxiter < 0) { - WARNING("invalid parameter (maxiter<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } + RAFT_EXPECT(n > 0, "invalid parameter (n<1)"); + RAFT_EXPECT(d > 0, "invalid parameter (d<1)"); + RAFT_EXPECT(k > 0, "invalid parameter (k<1)"); + RAFT_EXPECT(tol > 0, "invalid parameter (tol<=0)"); + RAFT_EXPECT(maxiter >= 0, "invalid parameter (maxiter<0)"); // Allocate memory - // TODO: handle non-zero CUDA streams - cudaStream_t stream = 0; - Vector clusterSizes(k, stream); - Vector centroids(d * k, stream); - Vector work(n * max(k, d), stream); - Vector work_int(2 * d * n, stream); + vector_t clusterSizes(handle, k); + vector_t centroids(handle, d * k); + vector_t work(handle, n * max(k, d)); + vector_t work_int(handle, 2 * d * n); // Perform k-means - return kmeans(n, - d, - k, - tol, - maxiter, - obs, - codes, - clusterSizes.raw(), - centroids.raw(), - work.raw(), - work_int.raw(), - &residual, - &iters); + return kmeans( + handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes, + clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual, + &iters, seed); } } // namespace raft diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 938c4421ab..5a334c2c1a 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -880,9 +880,8 @@ int computeSmallestEigenvectors( ValueType_ *beta_host = 
beta_host_v.data(); //TODO: replace and fix allocation via RAFT handle - vector_t lanczosVecs_dev(handle, n * (restartIter + 1), stream); - vector_t work_dev(handle, (n + restartIter) * restartIter, - stream); + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); // Perform Lanczos method IndexType_ effIter; @@ -1203,9 +1202,8 @@ int computeLargestEigenvectors(handle_t handle, ValueType_ *beta_host = beta_host_v.data(); //TODO: replace and fix allocation via RAFT handle - vector_t lanczosVecs_dev(handle, n * (restartIter + 1), stream); - vector_t work_dev(handle, (n + restartIter) * restartIter, - stream); + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); // Perform Lanczos method IndexType_ effIter; diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index f3fb509e12..d8e497fe0f 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -15,15 +15,14 @@ */ #pragma once +#include // ? #include -#include // ? #include - -namespace raft{ +namespace raft { namespace matrix { -using size_type = int; // for now; TODO: move it in appropriate header +using size_type = int; // for now; TODO: move it in appropriate header // Vector "view"-like aggregate for linear algebra purposes // @@ -32,22 +31,16 @@ struct vector_view_t { value_type* buffer_; size_type size_; - vector_view_t(value_type* buffer, size_type sz): - buffer_(buffer), - size_(sz) - { - } + vector_view_t(value_type* buffer, size_type sz) + : buffer_(buffer), size_(sz) {} - vector_view_t(vector_view_t&& other): - buffer_(other.buffer_), - size_(other.size_) - { + vector_view_t(vector_view_t&& other) + : buffer_(other.buffer_), size_(other.size_) { other.buffer_ = nullptr; other.size_ = 0; } - vector_view_t& operator = (vector_view_t&& other) - { + vector_view_t& operator=(vector_view_t&& other) { buffer_ = other.buffer_; size_ = other.size_; @@ -64,109 +57,86 @@ class vector_t { value_type* buffer_; size_type size_; cudaStream_t stream_; -public: - - vector_t(handle_t const& raft_handle, size_type sz, cudaStream_t stream = 0): - handle_(raft_handle), - buffer_(static_cast(raft_handle.get_device_allocator()->allocate(sz*sizeof(value_type), stream))), - size_(sz), - stream_(stream) - { - } - ~vector_t(void) - { + public: + vector_t(handle_t const& raft_handle, size_type sz) + : handle_(raft_handle), + buffer_( + static_cast(raft_handle.get_device_allocator()->allocate( + sz * sizeof(value_type), raft_handle.get_stream()))), + size_(sz), + stream_(raft_handle.get_stream()) {} + + ~vector_t(void) { handle_.get_device_allocator()->deallocate(buffer_, size_, stream_); } - size_type size(void) const - { - return size_; - } - - value_type* raw(void) - { - return buffer_; - } + size_type size(void) const { return size_; } + + value_type* raw(void) { return buffer_; } }; - + template struct sparse_matrix_t { - sparse_matrix_t(index_type const* row_offsets, - index_type const* col_indices, - value_type const* values, - index_type const nnz, - index_type const nrows) : - row_offsets_(row_offsets), - col_indices_(col_indices), - values_(values), - nrows_(nrows), - nnz_(nnz) - { - } + sparse_matrix_t(index_type const* row_offsets, index_type const* col_indices, + value_type const* values, index_type const nnz, + index_type const nrows) + : row_offsets_(row_offsets), + col_indices_(col_indices), + 
values_(values), + nrows_(nrows), + nnz_(nnz) {} + + sparse_matrix_t( + GraphCSRView const& csr_view) + : row_offsets_(csr_view.offsets_), + col_indices_(csr_view.indices_), + values_(csr_view.edge_data_), + nrows_(csr_view.number_of_vertices_), + nnz_(csr_view.number_of_edges_) {} + + virtual ~sparse_matrix_t(void) = + default; // virtual because used as base for following matrix types - sparse_matrix_t(GraphCSRView const& csr_view): - row_offsets_(csr_view.offsets_), - col_indices_(csr_view.indices_), - values_(csr_view.edge_data_), - nrows_(csr_view.number_of_vertices_), - nnz_(csr_view.number_of_edges_) - { - } - - - virtual ~sparse_matrix_t(void) = default; // virtual because used as base for following matrix types - // y = alpha*A*x + beta*y // - virtual void mv(value_type alpha, - value_type const* __restrict__ x, - value_type beta, - value_type* __restrict__ y) const - { + virtual void mv(value_type alpha, value_type const* __restrict__ x, + value_type beta, value_type* __restrict__ y) const { //TODO: call cusparse::csrmv } - + //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate - + index_type const* row_offsets_; index_type const* col_indices_; - value_type const* values_; // TODO: const-ness of this is debatable; cusparse primitives may not accept it... + value_type const* + values_; // TODO: const-ness of this is debatable; cusparse primitives may not accept it... index_type const nrows_; index_type const nnz_; }; template struct laplacian_matrix_t : sparse_matrix_t { - laplacian_matrix_t(handle_t const& raft_handle, - index_type const* row_offsets, - index_type const* col_indices, - value_type const* values, - index_type const nrows, - index_type const nnz, - cudaStream_t stream = 0) : - sparse_matrix_t(row_offsets,col_indices,values,nrows,nnz), - diagonal_(raft_handle, nrows, stream) - { + laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const nnz) + : sparse_matrix_t(row_offsets, col_indices, values, + nrows, nnz), + diagonal_(raft_handle, nrows) { auto* v = diagonal_.raw(); //TODO: more work, here... 
} - laplacian_matrix_t(handle_t const& raft_handle, - GraphCSRView const& csr_view, - cudaStream_t stream = 0): - sparse_matrix_t(csr_view), - diagonal_(raft_handle, csr_view.number_of_vertices_, stream) - { - } + laplacian_matrix_t( + handle_t const& raft_handle, + GraphCSRView const& csr_view) + : sparse_matrix_t(csr_view), + diagonal_(raft_handle, csr_view.number_of_vertices_) {} // y = alpha*A*x + beta*y // - void mv(value_type alpha, - value_type const* __restrict__ x, - value_type beta, - value_type* __restrict__ y) const override - { + void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, + value_type* __restrict__ y) const override { //TODO: call cusparse::csrmv } @@ -174,38 +144,29 @@ struct laplacian_matrix_t : sparse_matrix_t { }; template -struct modularity_matrix_t: laplacian_matrix_t -{ +struct modularity_matrix_t : laplacian_matrix_t { modularity_matrix_t(handle_t const& raft_handle, - index_type const* row_offsets, - index_type const* col_indices, - value_type const* values, - index_type const nrows, - index_type const nnz, - cudaStream_t stream = 0) : - laplacian_matrix_t(raft_handle, row_offsets, col_indices, values, nrows, nnz, stream) - { + index_type const* row_offsets, + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const nnz) + : laplacian_matrix_t( + raft_handle, row_offsets, col_indices, values, nrows, nnz) { auto* v = laplacian_matrix_t::diagonal_.raw(); //TODO: more work, here... } - modularity_matrix_t(handle_t const& raft_handle, - GraphCSRView const& csr_view, - cudaStream_t stream = 0): - laplacian_matrix_t(raft_handle, csr_view, stream) - { - } - + modularity_matrix_t( + handle_t const& raft_handle, + GraphCSRView const& csr_view) + : laplacian_matrix_t(raft_handle, csr_view) {} + // y = alpha*A*x + beta*y // - void mv(value_type alpha, - value_type const* __restrict__ x, - value_type beta, - value_type* __restrict__ y) const override - { + void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, + value_type* __restrict__ y) const override { //TODO: call cusparse::csrmv } }; - -} // namespace matrix -} // namespace raft + +} // namespace matrix +} // namespace raft diff --git a/cpp/include/raft/spectral/spectral_matrix.hpp b/cpp/include/raft/spectral/spectral_matrix.hpp deleted file mode 100644 index b9186329d3..0000000000 --- a/cpp/include/raft/spectral/spectral_matrix.hpp +++ /dev/null @@ -1,1044 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
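
All three mv() overloads introduced in matrix_wrappers.hpp above are left as TODOs pointing at cusparse::csrmv; the contract each must satisfy is the usual CSR y = alpha*A*x + beta*y. A serial reference in terms of the aggregate's own fields (editorial sketch, not part of this patch):

    // Reference semantics for sparse_matrix_t<index_type, value_type>::mv():
    for (index_type i = 0; i < nrows_; ++i) {
      value_type acc = 0;
      for (index_type e = row_offsets_[i]; e < row_offsets_[i + 1]; ++e)
        acc += values_[e] * x[col_indices_[e]];
      y[i] = alpha * acc + beta * y[i];
    }
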
- */ -#pragma once - -// #include -// #include -// #include -// #include -// #include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef DEBUG -#include -#include -#endif - -#include "error_temp.hpp" // TODO: replace w/ actual error handling to be brought in soon - -// CUDA block size -#define BLOCK_SIZE 1024 - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) - -namespace raft { -namespace matrix { - void check_size(size_t sz) - { - RAFT_EXPECT( sz <= INT_MAX, "Vector larger than INT_MAX"); - } - template - void nrm1_raw_vec(ValueType_* vec, size_t n, ValueType_* res, cudaStream_t stream) - { - thrust::device_ptr dev_ptr(vec); - *res = thrust::reduce(dev_ptr, dev_ptr + n); - CUDA_CHECK_LAST(); - } - - template - void fill_raw_vec(ValueType_* vec, size_t n, ValueType_ value, cudaStream_t stream) - { - thrust::device_ptr dev_ptr(vec); - thrust::fill(dev_ptr, dev_ptr + n, value); - CUDA_CHECK_LAST(); - } - - template - void dump_raw_vec(ValueType_* vec, size_t n, int offset, cudaStream_t stream) - { -#ifdef DEBUG - thrust::device_ptr dev_ptr(vec); - std::cout< - __global__ void flag_zeroes_kernel(int num_vertices, ValueType_* vec, int* flags) - { - int tidx = blockDim.x * blockIdx.x + threadIdx.x; - for (int r = tidx; r < num_vertices; r += blockDim.x * gridDim.x) { - if (vec[r] != 0.0) - flags[r] = 1; // NOTE 2 : alpha*0 + (1-alpha)*1 = (1-alpha) - else - flags[r] = 0; - } - } - template - __global__ void dmv0_kernel(const ValueType_* __restrict__ D, - const ValueType_* __restrict__ x, - ValueType_* __restrict__ y, - int n) - { - // y=D*x - int tidx = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = tidx; i < n; i += blockDim.x * gridDim.x) y[i] = D[i] * x[i]; - } - template - __global__ void dmv1_kernel(const ValueType_* __restrict__ D, - const ValueType_* __restrict__ x, - ValueType_* __restrict__ y, - int n) - { - // y+=D*x - int tidx = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = tidx; i < n; i += blockDim.x * gridDim.x) y[i] += D[i] * x[i]; - } - template - void copy_vec(ValueType_* vec1, size_t n, ValueType_* res, cudaStream_t stream) - { - thrust::device_ptr dev_ptr(vec1); - thrust::device_ptr res_ptr(res); -#ifdef DEBUG - // COUT() << "copy "<< n << " elements" << std::endl; -#endif - thrust::copy_n(dev_ptr, n, res_ptr); - CUDA_CHECK_LAST(); - // dump_raw_vec (res, n, 0); - } - - template - void flag_zeros_raw_vec(size_t num_vertices, ValueType_* vec, int* flags, cudaStream_t stream) - { - int items_per_thread = 4; - int num_threads = 128; - int max_grid_size = 4096; - check_size(num_vertices); - int n = static_cast(num_vertices); - int num_blocks = std::min(max_grid_size, (n / (items_per_thread * num_threads)) + 1); - flag_zeroes_kernel<<>>(num_vertices, vec, flags); - CUDA_CHECK_LAST(); - } - - template - void dmv(size_t num_vertices, - ValueType_ alpha, - ValueType_* D, - ValueType_* x, - ValueType_ beta, - ValueType_* y, - cudaStream_t stream) - { - RAFT_EXPECT((alpha == 1.0) && ((beta == 0.0) || (beta == 1.0)), "Not implemented case of y = D*x"); - - int items_per_thread = 4; - int num_threads = 128; - int max_grid_size = 4096; - check_size(num_vertices); - int n = static_cast(num_vertices); - int num_blocks = std::min(max_grid_size, (n / (items_per_thread * num_threads)) + 1); - if (beta == 0.0) - dmv0_kernel<<>>(D, x, y, n); - else if (beta == 1.0) - dmv1_kernel<<>>(D, x, y, n); - - CUDA_CHECK_LAST(); - } - - template - void set_connectivity(size_t n, - IndexType_ root, - ValueType_ 
self_loop_val, - ValueType_ unreachable_val, - ValueType_* res, - cudaStream_t stream) - { - fill_raw_vec(res, n, unreachable_val); - cudaMemcpy(&res[root], &self_loop_val, sizeof(self_loop_val), cudaMemcpyHostToDevice); - CUDA_CHECK_LAST(); - } - - - /*! A Vector contains a device vector of size |E| and type T - */ - template - class Vector { - public: - typedef ValueType_ ValueType; - - protected: - rmm::device_vector values; - - public: - /*! Construct an empty \p Vector. - */ - Vector(void) {} - ~Vector(void) {} - /*! Construct a \p Vector of size vertices. - * - * \param vertices The size of the Vector - */ - Vector(size_t vertices, cudaStream_t stream = 0) - : values(vertices) {} - - size_t get_size() const { return values.size(); } - size_t bytes() const { return values.size()*sizeof(ValueType);} - ValueType const *raw() const { return values.data().get(); } - ValueType *raw() { return values.data().get(); } - - void allocate(size_t n, cudaStream_t stream = 0) - { - values.resize(n);//TODO: delegate to outer alocator! - } - - void fill(ValueType val, cudaStream_t stream = 0) - { - fill_raw_vec(this->raw(), this->get_size(), val, stream); - } - - void copy(Vector &vec1, cudaStream_t stream = 0) - { - RAFT_EXPECT( (get_size() == 0 && vec1.get_size()>0) || (get_size() >= vec1.get_size()) ); - if (this->get_size() == 0 && vec1.get_size()>0) { - allocate(vec1.get_size(), stream); - copy_vec(vec1.raw(), this->get_size(), this->raw(), stream); - } else if (this->get_size() == vec1.get_size()) - copy_vec(vec1.raw(), this->get_size(), this->raw(), stream); - else // if (this->get_size() > vec1.get_size()) { - copy_vec(vec1.raw(), vec1.get_size(), this->raw(), stream); - } - } - - ValueType nrm1(cudaStream_t stream = 0) { - ValueType res = 0; - nrm1_raw_vec(this->raw(), this->get_size(), &res, stream); - return res; - } - }; // class Vector - - /// Abstract matrix class - /** Derived classes must implement matrix-vector products. - */ - template - class Matrix { - public: - /// Number of rows - const IndexType_ m; - /// Number of columns - const IndexType_ n; - /// CUDA stream - cudaStream_t s; - - /// Constructor - /** @param _m Number of rows. - * @param _n Number of columns. - */ - Matrix(IndexType_ _m, IndexType_ _n) : m(_m), n(_n), s(0){} - - /// Destructor - virtual ~Matrix() {} - - - /// Get and Set CUDA stream - virtual void setCUDAStream(cudaStream_t _s) = 0; - virtual void getCUDAStream(cudaStream_t *_s) = 0; - - /// Matrix-vector product - /** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n entries) Vector. - * @param beta Scalar. - * @param y (Input/output, device memory, m entries) Output - * vector. 
- */ - virtual void mv(ValueType_ alpha, - const ValueType_ * __restrict__ x, - ValueType_ beta, - ValueType_ * __restrict__ y) const = 0; - - virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const = 0; - /// Color and Reorder - virtual void color(IndexType_ *c, IndexType_ *p) const = 0; - virtual void reorder(IndexType_ *p) const = 0; - - /// Incomplete Cholesky (setup, factor and solve) - virtual void prec_setup(Matrix * _M) = 0; - virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const = 0; - - //Get the sum of all edges - virtual ValueType_ getEdgeSum() const = 0; - }; - - /// Sparse matrix class in CSR format - template - class CsrMatrix : public Matrix { - - private: - /// Whether to transpose matrix - const bool trans; - /// Whether matrix is stored in symmetric format - const bool sym; - /// Number of non-zero entries - const IndexType_ nnz; - /// Matrix properties - const cusparseMatDescr_t descrA; - /// Matrix entry values (device memory) - /*const*/ ValueType_ * csrValA; - /// Pointer to first entry in each row (device memory) - const IndexType_ * csrRowPtrA; - /// Column index of each matrix entry (device memory) - const IndexType_ * csrColIndA; - /// Analysis info (pointer to opaque CUSPARSE struct) - cusparseSolveAnalysisInfo_t info_l; - cusparseSolveAnalysisInfo_t info_u; - /// factored flag (originally set to false, then reset to true after factorization), - /// notice we only want to factor once - bool factored; - - public: - /// Constructor - CsrMatrix(bool _trans, bool _sym, - IndexType_ _m, IndexType_ _n, IndexType_ _nnz, - const cusparseMatDescr_t _descrA, - /*const*/ ValueType_ * _csrValA, - const IndexType_ * _csrRowPtrA, - const IndexType_ * _csrColIndA); - - /// Destructor - virtual ~CsrMatrix(); - - /// Get and Set CUDA stream - virtual void setCUDAStream(cudaStream_t _s); - virtual void getCUDAStream(cudaStream_t *_s); - - - /// Matrix-vector product - virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, - ValueType_ beta, ValueType_ * __restrict__ y) const; - /// Matrix-set of k vectors product - virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; - - /// Color and Reorder - virtual void color(IndexType_ *c, IndexType_ *p) const; - virtual void reorder(IndexType_ *p) const; - - /// Incomplete Cholesky (setup, factor and solve) - virtual void prec_setup(Matrix * _M); - virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; - - //Get the sum of all edges - virtual ValueType_ getEdgeSum() const; - }; - - /// Graph Laplacian matrix - template - class LaplacianMatrix - : public Matrix { - - private: - /// Adjacency matrix - /*const*/ Matrix * A; - /// Degree of each vertex - Vector D; - /// Preconditioning matrix - Matrix * M; - - public: - /// Constructor - LaplacianMatrix(/*const*/ Matrix & _A); - - /// Destructor - virtual ~LaplacianMatrix(); - - /// Get and Set CUDA stream - virtual void setCUDAStream(cudaStream_t _s); - virtual void getCUDAStream(cudaStream_t *_s); - - /// Matrix-vector product - virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, - ValueType_ beta, ValueType_ * __restrict__ y) const; - /// Matrix-set of k vectors product - virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, 
ValueType_ * __restrict__ y) const; - - /// Scale a set of k vectors by a diagonal - virtual void dm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; - - /// Color and Reorder - virtual void color(IndexType_ *c, IndexType_ *p) const; - virtual void reorder(IndexType_ *p) const; - - /// Solve preconditioned system M x = f for a set of k vectors - virtual void prec_setup(Matrix * _M); - virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; - - //Get the sum of all edges - virtual ValueType_ getEdgeSum() const; - }; - - /// Modularity matrix - template - class ModularityMatrix - : public Matrix { - - private: - /// Adjacency matrix - /*const*/ Matrix * A; - /// Degree of each vertex - Vector D; - IndexType_ nnz; - ValueType_ edge_sum; - - /// Preconditioning matrix - Matrix * M; - - public: - /// Constructor - ModularityMatrix(/*const*/ Matrix & _A, IndexType_ _nnz); - - /// Destructor - virtual ~ModularityMatrix(); - - /// Get and Set CUDA stream - virtual void setCUDAStream(cudaStream_t _s); - virtual void getCUDAStream(cudaStream_t *_s); - - /// Matrix-vector product - virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, - ValueType_ beta, ValueType_ * __restrict__ y) const; - /// Matrix-set of k vectors product - virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; - - /// Scale a set of k vectors by a diagonal - virtual void dm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; - - /// Color and Reorder - virtual void color(IndexType_ *c, IndexType_ *p) const; - virtual void reorder(IndexType_ *p) const; - - /// Solve preconditioned system M x = f for a set of k vectors - virtual void prec_setup(Matrix * _M); - virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; - - //Get the sum of all edges - virtual ValueType_ getEdgeSum() const; - }; - -// ============================================= -// CUDA kernels -// ============================================= - -namespace { - -/// Apply diagonal matrix to vector -template -static __global__ void diagmv(IndexType_ n, - ValueType_ alpha, - const ValueType_ *__restrict__ D, - const ValueType_ *__restrict__ x, - ValueType_ *__restrict__ y) -{ - IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; - while (i < n) { - y[i] += alpha * D[i] * x[i]; - i += blockDim.x * gridDim.x; - } -} - -/// Apply diagonal matrix to a set of dense vectors (tall matrix) -template -static __global__ void diagmm(IndexType_ n, - IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ D, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) -{ - IndexType_ i, j, index; - - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < k; j += blockDim.y * gridDim.y) { - for (i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) { - index = i + j * n; - if (beta_is_zero) { - y[index] = alpha * D[i] * x[index]; - } else { - y[index] = alpha * D[i] * x[index] + beta * y[index]; - } - } - } -} -} // namespace - -// ============================================= -// CSR matrix class -// ============================================= - -/// Constructor for CSR matrix class -/** @param _transA Whether to transpose matrix. - * @param _m Number of rows. 
- * @param _n Number of columns. - * @param _nnz Number of non-zero entries. - * @param _descrA Matrix properties. - * @param _csrValA (Input, device memory, _nnz entries) Matrix - * entry values. - * @param _csrRowPtrA (Input, device memory, _m+1 entries) Pointer - * to first entry in each row. - * @param _csrColIndA (Input, device memory, _nnz entries) Column - * index of each matrix entry. - */ -template -CsrMatrix::CsrMatrix(bool _trans, - bool _sym, - IndexType_ _m, - IndexType_ _n, - IndexType_ _nnz, - const cusparseMatDescr_t _descrA, - /*const*/ ValueType_ *_csrValA, - const IndexType_ *_csrRowPtrA, - const IndexType_ *_csrColIndA) - : Matrix(_m, _n), - trans(_trans), - sym(_sym), - nnz(_nnz), - descrA(_descrA), - csrValA(_csrValA), - csrRowPtrA(_csrRowPtrA), - csrColIndA(_csrColIndA) -{ - RAFT_EXPECT(nnz >= 0, "invalid CSR matrix parameter (nnz<0)"); - Cusparse::set_pointer_mode_host(); -} - -/// Destructor for CSR matrix class -template -CsrMatrix::~CsrMatrix() -{ -} - -/// Get and Set CUDA stream -template -void CsrMatrix::setCUDAStream(cudaStream_t _s) -{ - this->s = _s; - // printf("CsrMatrix setCUDAStream stream=%p\n",this->s); - Cusparse::setStream(_s); -} -template -void CsrMatrix::getCUDAStream(cudaStream_t *_s) -{ - *_s = this->s; - // CHECK_CUSPARSE(cusparseGetStream(Cusparse::get_handle(), _s)); -} -template -void CsrMatrix::mm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - // CHECK_CUSPARSE(cusparseXcsrmm(Cusparse::get_handle(), transA, this->m, k, this->n, nnz, &alpha, - // descrA, csrValA, csrRowPtrA, csrColIndA, x, this->n, &beta, y, this->m)); - Cusparse::csrmm(this->trans, - this->sym, - this->m, - k, - this->n, - this->nnz, - &alpha, - this->csrValA, - this->csrRowPtrA, - this->csrColIndA, - x, - this->n, - &beta, - y, - this->m); -} - -/// Color and Reorder -template -void CsrMatrix::color(IndexType_ *c, IndexType_ *p) const -{ -} - -template -void CsrMatrix::reorder(IndexType_ *p) const -{ -} - -/// Incomplete Cholesky (setup, factor and solve) -template -void CsrMatrix::prec_setup(Matrix *_M) -{ - // printf("CsrMatrix prec_setup dispacthed\n"); - if (!factored) { - // analyse lower triangular factor - CHECK_CUSPARSE(cusparseCreateSolveAnalysisInfo(&info_l)); - CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER)); - CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_UNIT)); - CHECK_CUSPARSE(cusparseXcsrsm_analysis(Cusparse::get_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->m, - nnz, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - info_l)); - // analyse upper triangular factor - CHECK_CUSPARSE(cusparseCreateSolveAnalysisInfo(&info_u)); - CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_UPPER)); - CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT)); - CHECK_CUSPARSE(cusparseXcsrsm_analysis(Cusparse::get_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->m, - nnz, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - info_u)); - // perform csrilu0 (should be slightly faster than csric0) - CHECK_CUSPARSE(cusparseXcsrilu0(Cusparse::get_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->m, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - info_l)); - // set factored flag to true - factored = true; - } -} - -template -void CsrMatrix::prec_solve(IndexType_ k, - ValueType_ alpha, - ValueType_ *__restrict__ fx, - ValueType_ *__restrict__ t) const -{ - // printf("CsrMatrix prec_solve dispacthed (stream 
%p)\n",this->s); - - // preconditioning Mx=f (where M = L*U, threfore x=U\(L\f)) - // solve lower triangular factor - CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER)); - CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_UNIT)); - CHECK_CUSPARSE(cusparseXcsrsm_solve(Cusparse::get_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->m, - k, - alpha, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - info_l, - fx, - this->m, - t, - this->m)); - // solve upper triangular factor - CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_UPPER)); - CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT)); - CHECK_CUSPARSE(cusparseXcsrsm_solve(Cusparse::get_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->m, - k, - alpha, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - info_u, - t, - this->m, - fx, - this->m)); -} - -/// Matrix-vector product for CSR matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n entries) Vector. - * @param beta Scalar. - * @param y (Input/output, device memory, m entries) Output vector. - */ -template -void CsrMatrix::mv(ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - // TODO: consider using merge-path csrmv - Cusparse::csrmv(this->trans, - this->sym, - this->m, - this->n, - this->nnz, - &alpha, - this->csrValA, - this->csrRowPtrA, - this->csrColIndA, - x, - &beta, - y); -} - -template -ValueType_ CsrMatrix::getEdgeSum() const -{ - return 0.0; -} - -// ============================================= -// Laplacian matrix class -// ============================================= - -/// Constructor for Laplacian matrix class -/** @param A Adjacency matrix - */ -template -LaplacianMatrix::LaplacianMatrix( - /*const*/ Matrix &_A) - : Matrix(_A.m, _A.n), A(&_A) -{ - // Check that adjacency matrix is square - RAFT_EXPECT(_A.m == _A.n, "cannot construct Laplacian matrix from non-square adjacency matrix"); - // set CUDA stream - this->s = nullptr; - // Construct degree matrix - D.allocate(_A.m, this->s); - Vector ones(this->n, this->s); - ones.fill(1.0); - _A.mv(1, ones.raw(), 0, D.raw()); - - // Set preconditioning matrix pointer to nullptr - M = nullptr; -} - -/// Destructor for Laplacian matrix class -template -LaplacianMatrix::~LaplacianMatrix() -{ -} - -/// Get and Set CUDA stream -template -void LaplacianMatrix::setCUDAStream(cudaStream_t _s) -{ - this->s = _s; - // printf("LaplacianMatrix setCUDAStream stream=%p\n",this->s); - A->setCUDAStream(_s); - if (M != nullptr) { M->setCUDAStream(_s); } -} -template -void LaplacianMatrix::getCUDAStream(cudaStream_t *_s) -{ - *_s = this->s; - // A->getCUDAStream(_s); -} - -/// Matrix-vector product for Laplacian matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n entries) Vector. - * @param beta Scalar. - * @param y (Input/output, device memory, m entries) Output vector. 
- */ -template -void LaplacianMatrix::mv(ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - // Scale result vector - if (beta == 0) - CHECK_CUDA(cudaMemset(y, 0, (this->n) * sizeof(ValueType_))) - else if (beta != 1) - thrust::transform(thrust::device_pointer_cast(y), - thrust::device_pointer_cast(y + this->n), - thrust::make_constant_iterator(beta), - thrust::device_pointer_cast(y), - thrust::multiplies()); - - // Apply diagonal matrix - dim3 gridDim, blockDim; - gridDim.x = min(((this->n) + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - gridDim.y = 1; - gridDim.z = 1; - blockDim.x = BLOCK_SIZE; - blockDim.y = 1; - blockDim.z = 1; - diagmv<<s>>>(this->n, alpha, D.raw(), x, y); - CUDA_CHECK_LAST(); - - // Apply adjacency matrix - A->mv(-alpha, x, 1, y); -} -/// Matrix-vector product for Laplacian matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n*k entries) nxk dense matrix. - * @param beta Scalar. - * @param y (Input/output, device memory, m*k entries) Output mxk dense matrix. - */ -template -void LaplacianMatrix::mm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - // Apply diagonal matrix - ValueType_ one = (ValueType_)1.0; - this->dm(k, alpha, x, beta, y); - - // Apply adjacency matrix - A->mm(k, -alpha, x, one, y); -} - -template -void LaplacianMatrix::dm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - IndexType_ t = k * (this->n); - dim3 gridDim, blockDim; - - // setup launch parameters - gridDim.x = min(((this->n) + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - gridDim.y = min(k, 65535); - gridDim.z = 1; - blockDim.x = BLOCK_SIZE; - blockDim.y = 1; - blockDim.z = 1; - - // Apply diagonal matrix - if (beta == 0.0) { - // set vectors to 0 (WARNING: notice that you need to set, not scale, because of NaNs corner - // case) - CHECK_CUDA(cudaMemset(y, 0, t * sizeof(ValueType_))); - diagmm - <<s>>>(this->n, k, alpha, D.raw(), x, beta, y); - } else { - diagmm - <<s>>>(this->n, k, alpha, D.raw(), x, beta, y); - } - CUDA_CHECK_LAST(); -} - -/// Color and Reorder -template -void LaplacianMatrix::color(IndexType_ *c, IndexType_ *p) const -{ -} - -template -void LaplacianMatrix::reorder(IndexType_ *p) const -{ -} - -/// Solve preconditioned system M x = f for a set of k vectors -template -void LaplacianMatrix::prec_setup(Matrix *_M) -{ - // save the pointer to preconditioner M - M = _M; - if (M != nullptr) { - // setup the preconditioning matrix M - M->prec_setup(nullptr); - } -} - -template -void LaplacianMatrix::prec_solve(IndexType_ k, - ValueType_ alpha, - ValueType_ *__restrict__ fx, - ValueType_ *__restrict__ t) const -{ - if (M != nullptr) { - // preconditioning - M->prec_solve(k, alpha, fx, t); - } -} - -template -ValueType_ LaplacianMatrix::getEdgeSum() const -{ - return 0.0; -} -// ============================================= -// Modularity matrix class -// ============================================= - -/// Constructor for Modularity matrix class -/** @param A Adjacency matrix - */ -template -ModularityMatrix::ModularityMatrix( - /*const*/ Matrix &_A, IndexType_ _nnz) - : Matrix(_A.m, _A.n), A(&_A), nnz(_nnz) -{ - // Check that adjacency matrix is square - RAFT_EXPECT(_A.m == _A.n, "cannot construct Modularity matrix from non-square adjacency matrix"); - - // set CUDA stream - this->s = nullptr; - // 
Construct degree matrix - D.allocate(_A.m, this->s); - Vector ones(this->n, this->s); - ones.fill(1.0); - _A.mv(1, ones.raw(), 0, D.raw()); - // D.dump(0,this->n); - edge_sum = D.nrm1(); - - // Set preconditioning matrix pointer to nullptr - M = nullptr; -} - -/// Destructor for Modularity matrix class -template -ModularityMatrix::~ModularityMatrix() -{ -} - -/// Get and Set CUDA stream -template -void ModularityMatrix::setCUDAStream(cudaStream_t _s) -{ - this->s = _s; - // printf("ModularityMatrix setCUDAStream stream=%p\n",this->s); - A->setCUDAStream(_s); - if (M != nullptr) { M->setCUDAStream(_s); } -} - -template -void ModularityMatrix::getCUDAStream(cudaStream_t *_s) -{ - *_s = this->s; - // A->getCUDAStream(_s); -} - -/// Matrix-vector product for Modularity matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n entries) Vector. - * @param beta Scalar. - * @param y (Input/output, device memory, m entries) Output vector. - */ -template -void ModularityMatrix::mv(ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - // Scale result vector - RAFT_EXPECT(alpha == 1 && beta == 0, "cannot construct Modularity matrix from non-square adjacency matrix"); - - // CHECK_CUBLAS(cublasXdot(handle, this->n, const double *x, int incx, const double *y, int incy, - // double *result)); - // y = A*x - A->mv(alpha, x, 0, y); - ValueType_ dot_res; - // gamma = d'*x - Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); - // y = y -(gamma/edge_sum)*d - Cublas::axpy(this->n, -(dot_res / this->edge_sum), D.raw(), 1, y, 1); -} -/// Matrix-vector product for Modularity matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n*k entries) nxk dense matrix. - * @param beta Scalar. - * @param y (Input/output, device memory, m*k entries) Output mxk dense matrix. 
- */ -template -void ModularityMatrix::mm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - RAFT_FAIL("Functionality not currently supported in Modularity Matrix."); -} - -template -void ModularityMatrix::dm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - RAFT_FAIL("Functionality not currently supported in Modularity Matrix."); -} - -/// Color and Reorder -template -void ModularityMatrix::color(IndexType_ *c, IndexType_ *p) const -{ - RAFT_FAIL("Functionality not currently supported in Modularity Matrix."); -} - -template -void ModularityMatrix::reorder(IndexType_ *p) const -{ - RAFT_FAIL("Functionality not currently supported in Modularity Matrix."); -} - -/// Solve preconditioned system M x = f for a set of k vectors -template -void ModularityMatrix::prec_setup(Matrix *_M) -{ - // save the pointer to preconditioner M - M = _M; - if (M != nullptr) { - // setup the preconditioning matrix M - M->prec_setup(nullptr); - } -} - -template -void ModularityMatrix::prec_solve(IndexType_ k, - ValueType_ alpha, - ValueType_ *__restrict__ fx, - ValueType_ *__restrict__ t) const -{ - RAFT_EXPECT(M == nullptr, "Functionality not currently supported in Modularity Matrix."); -} - -template -ValueType_ ModularityMatrix::getEdgeSum() const -{ - return edge_sum; -} - -} // namespace matrix -} // namespace raft From 55922cb277d244a6bd8e14fd1391bb48591ff51f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 11 Jun 2020 00:31:30 -0400 Subject: [PATCH 088/189] remove CUML(GRAPH)_EXPECTS(FAIL) --- cpp/include/raft/error.hpp | 54 +------------------------------------- 1 file changed, 1 insertion(+), 53 deletions(-) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 2a10854918..38705e17d8 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -84,7 +84,7 @@ class exception : public std::exception { * @brief Exception thrown when logical precondition is violated. * * This exception should not be thrown directly and is instead thrown by the - * RAFT_EXPECTS, RAFT_FAIL, CUML_EXPECTS, CUML_FAIL, CUGRAPH_EXPECTS, CUGRAPH_FAIL macros. + * RAFT_EXPECTS and RAFT_FAIL macros. * */ struct logic_error : public raft::exception { @@ -183,58 +183,6 @@ struct nccl_error : public raft::exception { throw raft::logic_error("RAFT failure at: " __FILE__ \ ":" RAFT_STRINGIFY(__LINE__) ": " reason) -/** - * @brief Macro for checking (pre-)conditions that throws an exception when - * a condition is violated. - * - * @param[in] cond Expression that evaluates to true or false - * @param[in] reason String literal description of the reason that cond is - * expected to be true - * @throw raft::logic_error if the condition evaluates to false. - */ -#define CUML_EXPECTS(cond, reason) \ - (!!(cond)) \ - ? static_cast(0) \ - : throw raft::logic_error("cuML failure at: " __FILE__ \ - ":" RAFT_STRINGIFY(__LINE__) ": " reason) - -/** - * @brief Indicates that an erroneous code path has been taken. - * - * In host code, throws a `raft::logic_error`. - * - * @param[in] reason String literal description of the reason - */ -#define CUML_FAIL(reason) \ - throw raft::logic_error("cuML failure at: " __FILE__ \ - ":" RAFT_STRINGIFY(__LINE__) ": " reason) - -/** - * @brief Macro for checking (pre-)conditions that throws an exception when - * a condition is violated. 
- *
- * @param[in] cond Expression that evaluates to true or false
- * @param[in] reason String literal description of the reason that cond is
- * expected to be true
- * @throw raft::logic_error if the condition evaluates to false.
- */
-#define CUGRAPH_EXPECTS(cond, reason)                           \
-  (!!(cond))                                                    \
-    ? static_cast<void>(0)                                      \
-    : throw raft::logic_error("cuGRAPH failure at: " __FILE__   \
-                              ":" RAFT_STRINGIFY(__LINE__) ": " reason)
-
-/**
- * @brief Indicates that an erroneous code path has been taken.
- *
- * In host code, throws a `raft::logic_error`.
- *
- * @param[in] reason String literal description of the reason
- */
-#define CUGRAPH_FAIL(reason)                                 \
-  throw raft::logic_error("cuGRAPH failure at: " __FILE__    \
-                          ":" RAFT_STRINGIFY(__LINE__) ": " reason)
-
 namespace raft {
 namespace detail {

From acd5824e19765f1ff9d5933bd8d50b0e0aa99b60 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Thu, 11 Jun 2020 00:59:23 -0400
Subject: [PATCH 089/189] update RAFT_EXPECTS and RAFT_FAIL

---
 cpp/include/raft/error.hpp | 39 ++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp
index 38705e17d8..6eeeacdb92 100644
--- a/cpp/include/raft/error.hpp
+++ b/cpp/include/raft/error.hpp
@@ -148,40 +148,43 @@ struct nccl_error : public raft::exception {
     throw raft::exception(msg); \
   } while (0)

+// FIXME: Need to be replaced with RAFT_EXPECTS
 /** macro to check for a conditional and assert on failure */
 #define ASSERT(check, fmt, ...)              \
   do {                                       \
     if (!(check)) THROW(fmt, ##__VA_ARGS__); \
   } while (0)

-#define STRINGIFY_DETAIL(x) #x
-#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x)
-
 /**
- * @brief Macro for checking (pre-)conditions that throws an exception when
- * a condition is violated.
+ * @brief Macro for checking (pre-)conditions that throws an exception when a condition is false
 *
 * @param[in] cond Expression that evaluates to true or false
- * @param[in] reason String literal description of the reason that cond is
- * expected to be true
+ * @param[in] fmt String literal description of the reason that cond is expected to be true with
+ * optional format tags
 * @throw raft::logic_error if the condition evaluates to false.
 */
-#define RAFT_EXPECTS(cond, reason)                           \
-  (!!(cond))                                                 \
-    ? static_cast<void>(0)                                   \
-    : throw raft::logic_error("RAFT failure at: " __FILE__   \
-                              ":" RAFT_STRINGIFY(__LINE__) ": " reason)
+#define RAFT_EXPECTS(cond, fmt, ...)                                          \
+  do {                                                                        \
+    if (!(cond)) {                                                            \
+      std::string msg{};                                                      \
+      char err_msg[2048]; /* NOLINT */                                        \
+      std::snprintf(err_msg, sizeof(err_msg),                                 \
+                    "RAFT failure at file=%s line=%d: ", __FILE__, __LINE__); \
+      msg += err_msg;                                                         \
+      std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__);            \
+      msg += err_msg;                                                         \
+      throw raft::logic_error(msg);                                           \
+    }                                                                         \
+  } while (0)

 /**
 * @brief Indicates that an erroneous code path has been taken.
 *
- * In host code, throws a `raft::logic_error`.
- *
- * @param[in] reason String literal description of the reason
+ * @param[in] fmt String literal description of the reason that this code path is erroneous with
+ * optional format tags
+ * @throw always throws raft::logic_error
 */
-#define RAFT_FAIL(reason)                                 \
-  throw raft::logic_error("RAFT failure at: " __FILE__    \
-                          ":" RAFT_STRINGIFY(__LINE__) ": " reason)
+#define RAFT_FAIL(fmt, ...)
RAFT_EXPECTS(false, fmt, ##__VA_ARGS__) namespace raft { namespace detail { From 4a48b57294947a1134ab1c3f2c11f04394003f6d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 11 Jun 2020 01:17:24 -0400 Subject: [PATCH 090/189] compile error fix (namespace) --- cpp/include/raft/linalg/cublas_wrappers.h | 14 ++++++++++++-- cpp/include/raft/linalg/cusolver_wrappers.h | 14 ++++++++++++-- cpp/include/raft/sparse/cusparse_wrappers.h | 14 ++++++++++++-- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index 170221a844..84b3add031 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -25,6 +25,11 @@ #define _CUBLAS_ERR_TO_STR(err) \ case err: \ return #err + +namespace raft { +namespace linalg { +namespace detail { + inline const char *cublas_error_to_string(cublasStatus_t err) { switch (err) { _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); @@ -41,6 +46,11 @@ inline const char *cublas_error_to_string(cublasStatus_t err) { return "CUBLAS_STATUS_UNKNOWN"; }; } + +}; // namespace detail +}; // namespace linalg +}; // namespace raft + #undef _CUBLAS_ERR_TO_STR /** check for cublas runtime API errors and assert accordingly */ @@ -49,7 +59,7 @@ inline const char *cublas_error_to_string(cublasStatus_t err) { cublasStatus_t err = call; \ ASSERT(err == CUBLAS_STATUS_SUCCESS, \ "CUBLAS call='%s' got errorcode=%d err=%s", #call, err, \ - raft::linalg::cublas_error_to_string(err)); \ + raft::linalg::detail::cublas_error_to_string(err)); \ } while (0) ///@todo: enable this once we have logging enabled @@ -59,7 +69,7 @@ inline const char *cublas_error_to_string(cublasStatus_t err) { // cublasStatus_t err = call; \ // if (err != CUBLAS_STATUS_SUCCESS) { \ // CUML_LOG_ERROR("CUBLAS call='%s' got errorcode=%d err=%s", #call, err, \ -// raft::linalg::cublas_error_to_string(err)); \ +// raft::linalg::detail::cublas_error_to_string(err)); \ // } \ // } while (0) diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h index e5705ada5d..d7df86ac21 100644 --- a/cpp/include/raft/linalg/cusolver_wrappers.h +++ b/cpp/include/raft/linalg/cusolver_wrappers.h @@ -25,6 +25,11 @@ #define _CUSOLVER_ERR_TO_STR(err) \ case err: \ return #err; + +namespace raft { +namespace linalg { +namespace detail { + inline const char *cusolver_error_to_string(cusolverStatus_t err) { switch (err) { _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); @@ -41,6 +46,11 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) { return "CUSOLVER_STATUS_UNKNOWN"; }; } + +}; // namespace detail +}; // namespace linalg +}; // namespace raft + #undef _CUSOLVER_ERR_TO_STR /** check for cusolver runtime API errors and assert accordingly */ @@ -49,7 +59,7 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) { cusolverStatus_t err = call; \ ASSERT(err == CUSOLVER_STATUS_SUCCESS, \ "CUSOLVER call='%s' got errorcode=%d err=%s", #call, err, \ - raft::linalg::cusolver_error_to_string(err)); \ + raft::linalg::detail::cusolver_error_to_string(err)); \ } while (0) ///@todo: enable this once logging is enabled @@ -59,7 +69,7 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) { // cusolverStatus_t err = call; \ // if (err != CUSOLVER_STATUS_SUCCESS) { \ // CUML_LOG_ERROR("CUSOLVER call='%s' got errorcode=%d err=%s", #call, err, \ -// raft::linalg::cusolver_error_to_string(err)); \ +// 
raft::linalg::detail::cusolver_error_to_string(err)); \ // } \ // } while (0) diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index ccb6622d5b..a4a8173b88 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -24,6 +24,11 @@ #define _CUSPARSE_ERR_TO_STR(err) \ case err: \ return #err; + +namespace raft { +namespace sparse { +namespace detail { + inline const char* cusparse_error_to_string(cusparseStatus_t err) { #if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 return cusparseGetErrorString(status); @@ -42,6 +47,11 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { }; #endif // CUDART_VERSION } + +}; // namespace detail +}; // namespace sparse +}; // namespace raft + #undef _CUSPARSE_ERR_TO_STR /** check for cusparse runtime API errors and assert accordingly */ @@ -50,7 +60,7 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { cusparseStatus_t err = call; \ ASSERT(err == CUSPARSE_STATUS_SUCCESS, \ "CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \ - raft::sparse::cusparse_error_to_string(err)); \ + raft::sparse::detail::cusparse_error_to_string(err)); \ } while (0) ///@todo: enable this once logging is enabled @@ -60,7 +70,7 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { // cusparseStatus_t err = call; \ // if (err != CUSPARSE_STATUS_SUCCESS) { \ // CUML_LOG_ERROR("CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \ -// raft::sparse::cusparse_error_to_string(err)); \ +// raft::sparse::detail::cusparse_error_to_string(err)); \ // } \ // } while (0) From 17091559892b347f58b22039cf5ac13977978574 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 11 Jun 2020 12:48:31 -0500 Subject: [PATCH 091/189] Updating partition entry calls. --- cpp/include/raft/spectral/kmeans.hpp | 16 +-- cpp/include/raft/spectral/lanczos.hpp | 7 - cpp/include/raft/spectral/matrix_wrappers.hpp | 9 +- cpp/include/raft/spectral/partition.hpp | 130 ++++++------------ 4 files changed, 54 insertions(+), 108 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index a9b1c1f049..37c045b7f9 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#pragma once #include #include @@ -32,18 +33,15 @@ #include #include +namespace { + // ========================================================= -// Useful macros +// Useful grid settings // ========================================================= -#define BLOCK_SIZE 1024 -#define WARP_SIZE 32 -#define BSIZE_DIV_WSIZE (BLOCK_SIZE / WARP_SIZE) - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) - -namespace { +constexpr unsigned int BLOCK_SIZE = 1024; +constexpr unsigned int WARP_SIZE = 32; +constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); // ========================================================= // CUDA kernels diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 5a334c2c1a..8f33eb7cc2 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -30,13 +30,6 @@ #include #include -// ========================================================= -// Useful macros -// ========================================================= - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) - namespace raft { namespace { diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index d8e497fe0f..3bca437c0c 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -15,10 +15,17 @@ */ #pragma once -#include // ? +#include #include #include +// ========================================================= +// Useful macros +// ========================================================= + +// Get index of matrix entry +#define IDX(i, j, lda) ((i) + (j) * (lda)) + namespace raft { namespace matrix { diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index e4b9f50790..90b678a973 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#pragma once #include "include/partition.hxx" @@ -25,16 +26,10 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include -namespace nvgraph { +namespace raft { // ========================================================= // Useful macros @@ -153,21 +148,24 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) * performed. * @param iters_kmeans On exit, number of k-means iterations * performed. - * @return NVGRAPH error flag. + * @return error flag. 
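+ *
+ * A minimal call sketch (illustrative only: `handle`, `graph`, `stream`, and
+ * the iteration caps / tolerances below are assumptions of this example, not
+ * fixed by the API; `thrust::cuda::par.on(stream)` is one way to form the
+ * execution policy):
+ * @code{.cpp}
+ *   int n = graph.number_of_vertices;  // vertex_t = int, weight_t = float
+ *   int num_parts = 2, num_eig_vecs = 2;
+ *   int* parts;
+ *   float* eig_vals;
+ *   float* eig_vecs;
+ *   cudaMalloc(&parts, n * sizeof(int));
+ *   cudaMalloc(&eig_vals, num_eig_vecs * sizeof(float));
+ *   cudaMalloc(&eig_vecs, n * num_eig_vecs * sizeof(float));
+ *   int flag = partition(handle, thrust::cuda::par.on(stream), graph,
+ *                        num_parts, num_eig_vecs,
+ *                        100, 500, 1e-3f,  // Lanczos: maxIter, restartIter, tol
+ *                        100, 1e-2f,       // k-means: maxIter, tol
+ *                        parts, eig_vals, eig_vecs);
+ * @endcode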
*/ -template -NVGRAPH_ERROR partition( - cugraph::experimental::GraphCSRView const &graph, - vertex_t nParts, - vertex_t nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - weight_t tol_lanczos, - int maxIter_kmeans, - weight_t tol_kmeans, - vertex_t *__restrict__ parts, - weight_t *eigVals, - weight_t *eigVecs) +template +int partition(handle_t handle, + ThrustExePolicy thrust_exec_policy, + cugraph::experimental::GraphCSRView const &graph, + vertex_t nParts, + vertex_t nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + weight_t tol_lanczos, + int maxIter_kmeans, + weight_t tol_kmeans, + vertex_t *__restrict__ parts, + weight_t *eigVals, + weight_t *eigVecs) { cudaStream_t stream = 0; @@ -190,19 +188,11 @@ NVGRAPH_ERROR partition( // Compute eigenvectors of Laplacian // Initialize Laplacian - CsrMatrix A(false, - false, - graph.number_of_vertices, - graph.number_of_vertices, - graph.number_of_edges, - 0, - graph.edge_data, - graph.offsets, - graph.indices); - LaplacianMatrix L(A); + sparse_matrix_t A{graph}; + laplacian_matrix_t L{handle, graph}; // Compute smallest eigenvalues and eigenvectors - CHECK_NVGRAPH(computeSmallestEigenvectors(L, + RAFT_TRY(computeSmallestEigenvectors(L, nEigVecs, maxIter_lanczos, restartIter_lanczos, @@ -260,17 +250,17 @@ NVGRAPH_ERROR partition( // eigVecs.dump(0, nEigVecs*n); // Find partition with k-means clustering - CHECK_NVGRAPH(kmeans(n, - nEigVecs, - nParts, - tol_kmeans, - maxIter_kmeans, - eigVecs, - parts, - residual_kmeans, - iters_kmeans)); - - return NVGRAPH_OK; + RAFT_TRY(kmeans(n, + nEigVecs, + nParts, + tol_kmeans, + maxIter_kmeans, + eigVecs, + parts, + residual_kmeans, + iters_kmeans)); + + return 0; } // ========================================================= @@ -307,10 +297,10 @@ struct equal_to_i_op { * assignments. * @param edgeCut On exit, weight of edges cut by partition. * @param cost On exit, partition cost function. - * @return NVGRAPH error flag. + * @return error flag. 
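+ *
+ * A hedged follow-up sketch (assumes `parts` was produced by partition()
+ * above, with the same `graph` and `num_parts`):
+ * @code{.cpp}
+ *   float edge_cut = 0.0f;
+ *   float cost = 0.0f;
+ *   int flag = analyzePartition(graph, num_parts, parts, edge_cut, cost);
+ *   // cost accumulates edges-cut / partition-size over the partitions, so
+ *   // lower values indicate a cheaper, more balanced clustering.
+ * @endcode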
*/ template -NVGRAPH_ERROR analyzePartition( +int analyzePartition( cugraph::experimental::GraphCSRView const &graph, vertex_t nParts, const vertex_t *__restrict__ parts, @@ -376,49 +366,7 @@ NVGRAPH_ERROR analyzePartition( } // Clean up and return - return NVGRAPH_OK; + return 0; } -// ========================================================= -// Explicit instantiation -// ========================================================= -template NVGRAPH_ERROR partition( - cugraph::experimental::GraphCSRView const &graph, - int nParts, - int nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - float tol_lanczos, - int maxIter_kmeans, - float tol_kmeans, - int *__restrict__ parts, - float *eigVals, - float *eigVecs); - -template NVGRAPH_ERROR partition( - cugraph::experimental::GraphCSRView const &graph, - int nParts, - int nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - double tol_lanczos, - int maxIter_kmeans, - double tol_kmeans, - int *__restrict__ parts, - double *eigVals, - double *eigVecs); - -template NVGRAPH_ERROR analyzePartition( - cugraph::experimental::GraphCSRView const &graph, - int nParts, - const int *__restrict__ parts, - float &edgeCut, - float &cost); -template NVGRAPH_ERROR analyzePartition( - cugraph::experimental::GraphCSRView const &graph, - int nParts, - const int *__restrict__ parts, - double &edgeCut, - double &cost); - -} // namespace nvgraph +} // namespace raft From aa7d8ec68819f4241b7682cbd19d225b048039ca Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 11 Jun 2020 13:19:37 -0500 Subject: [PATCH 092/189] Updates on partition and matrix wrappers. --- cpp/include/raft/spectral/matrix_wrappers.hpp | 4 +- cpp/include/raft/spectral/partition.hpp | 162 +++++++----------- 2 files changed, 66 insertions(+), 100 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 3bca437c0c..04fd8cc185 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -144,7 +144,7 @@ struct laplacian_matrix_t : sparse_matrix_t { // void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, value_type* __restrict__ y) const override { - //TODO: call cusparse::csrmv + //TODO: call cusparse::csrmv ... and more } vector_t diagonal_; @@ -171,7 +171,7 @@ struct modularity_matrix_t : laplacian_matrix_t { // void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, value_type* __restrict__ y) const override { - //TODO: call cusparse::csrmv + //TODO: call cusparse::csrmv ... 
and more } }; diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 90b678a973..8dfa38d9a3 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -26,8 +26,8 @@ #include #include -#include #include +#include namespace raft { @@ -39,19 +39,21 @@ namespace raft { #define IDX(i, j, lda) ((i) + (j) * (lda)) template -static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, ValueType_ *obs) -{ +static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, + ValueType_ *obs) { IndexType_ i, j, k, index, mm; ValueType_ alpha, v, last; bool valid; // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x + mm = (((m + blockDim.x - 1) / blockDim.x) * + blockDim.x); // m in multiple of blockDim.x alpha = 0.0; // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, // li, mn); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < mm; i += blockDim.x) { // check if the thread is valid valid = i < m; @@ -76,17 +78,17 @@ static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, ValueType_ * // scale by alpha alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); alpha = std::sqrt(alpha); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 - index = i + j * m; + index = i + j * m; obs[index] = obs[index] / alpha; } } } template -IndexType_ next_pow2(IndexType_ n) -{ +IndexType_ next_pow2(IndexType_ n) { IndexType_ v; // Reference: // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float @@ -100,8 +102,7 @@ IndexType_ next_pow2(IndexType_ n) } template -cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) -{ +cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) { IndexType_ p2m; dim3 nthreads, nblocks; @@ -111,9 +112,9 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) nthreads.x = max(2, min(p2m, 32)); nthreads.y = 256 / nthreads.x; nthreads.z = 1; - nblocks.x = 1; - nblocks.y = (n + nthreads.y - 1) / nthreads.y; - nblocks.z = 1; + nblocks.x = 1; + nblocks.y = (n + nthreads.y - 1) / nthreads.y; + nblocks.z = 1; // printf("m=%d(%d),n=%d,obs=%p, // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); @@ -150,28 +151,21 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) * performed. * @return error flag. 
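+ *
+ * @note A restatement of the embedding step performed below (no new
+ * behavior, just the math): each eigenvector column x_j is whitened as
+ *   x_ij <- (x_ij - mean(x_j)) / (||x_j - mean(x_j)||_2 / sqrt(n))
+ * and the whitened matrix is transposed before being handed to k-means.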
*/ -template -int partition(handle_t handle, - ThrustExePolicy thrust_exec_policy, - cugraph::experimental::GraphCSRView const &graph, - vertex_t nParts, - vertex_t nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - weight_t tol_lanczos, - int maxIter_kmeans, - weight_t tol_kmeans, - vertex_t *__restrict__ parts, - weight_t *eigVals, - weight_t *eigVecs) -{ - cudaStream_t stream = 0; - +int partition( + handle_t handle, ThrustExePolicy thrust_exec_policy, + cugraph::experimental::GraphCSRView const &graph, + vertex_t nParts, vertex_t nEigVecs, int maxIter_lanczos, + int restartIter_lanczos, weight_t tol_lanczos, int maxIter_kmeans, + weight_t tol_kmeans, vertex_t *__restrict__ parts, weight_t *eigVals, + weight_t *eigVecs) { const weight_t zero{0.0}; const weight_t one{1.0}; + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + int iters_lanczos; int iters_kmeans; @@ -192,22 +186,17 @@ int partition(handle_t handle, laplacian_matrix_t L{handle, graph}; // Compute smallest eigenvalues and eigenvectors - RAFT_TRY(computeSmallestEigenvectors(L, - nEigVecs, - maxIter_lanczos, - restartIter_lanczos, - tol_lanczos, - false, - iters_lanczos, - eigVals, - eigVecs)); + RAFT_TRY(computeSmallestEigenvectors(L, nEigVecs, maxIter_lanczos, + restartIter_lanczos, tol_lanczos, false, + iters_lanczos, eigVals, eigVecs)); // Whiten eigenvector matrix for (i = 0; i < nEigVecs; ++i) { weight_t mean, std; - mean = thrust::reduce(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + mean = + thrust::reduce(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); cudaCheckError(); mean /= n; thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), @@ -216,7 +205,8 @@ int partition(handle_t handle, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::minus()); cudaCheckError(); - std = Cublas::nrm2(n, eigVecs + IDX(0, i, n), 1) / std::sqrt(static_cast(n)); + std = Cublas::nrm2(n, eigVecs + IDX(0, i, n), 1) / + std::sqrt(static_cast(n)); thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), thrust::make_constant_iterator(std), @@ -228,38 +218,22 @@ int partition(handle_t handle, // Transpose eigenvector matrix // TODO: in-place transpose { - Vector work(nEigVecs * n, stream); + vector_t work(handle, nEigVecs * n); Cublas::set_pointer_mode_host(); - Cublas::geam(true, - false, - nEigVecs, - n, - &one, - eigVecs, - n, - &zero, - (weight_t *)NULL, - nEigVecs, - work.raw(), - nEigVecs); - CHECK_CUDA(cudaMemcpyAsync( - eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice)); + Cublas::geam(true, false, nEigVecs, n, &one, eigVecs, n, &zero, + (weight_t *)NULL, nEigVecs, work.raw(), nEigVecs); + CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), + nEigVecs * n * sizeof(weight_t), + cudaMemcpyDeviceToDevice)); } // Clean up // eigVecs.dump(0, nEigVecs*n); // Find partition with k-means clustering - RAFT_TRY(kmeans(n, - nEigVecs, - nParts, - tol_kmeans, - maxIter_kmeans, - eigVecs, - parts, - residual_kmeans, - iters_kmeans)); - + RAFT_TRY(kmeans(n, nEigVecs, nParts, tol_kmeans, maxIter_kmeans, eigVecs, + parts, residual_kmeans, iters_kmeans)); + return 0; } @@ -278,9 +252,9 @@ struct equal_to_i_op { public: equal_to_i_op(IndexType_ _i) : i(_i) {} template - __host__ __device__ void operator()(Tuple_ t) - { - thrust::get<1>(t) = (thrust::get<0>(t) == i) 
? (ValueType_)1.0 : (ValueType_)0.0; + __host__ __device__ void operator()(Tuple_ t) { + thrust::get<1>(t) = + (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0; } }; } // namespace @@ -301,51 +275,43 @@ struct equal_to_i_op { */ template int analyzePartition( + handle_t handle, ThrustExePolicy thrust_exec_policy, cugraph::experimental::GraphCSRView const &graph, - vertex_t nParts, - const vertex_t *__restrict__ parts, - weight_t &edgeCut, - weight_t &cost) -{ - cudaStream_t stream = 0; - + vertex_t nParts, const vertex_t *__restrict__ parts, weight_t &edgeCut, + weight_t &cost) { edge_t i; edge_t n = graph.number_of_vertices; + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + weight_t partEdgesCut, partSize; // Device memory - Vector part_i(n, stream); - Vector Lx(n, stream); + vector_t part_i(handle, n); + vector_t Lx(handle, n); // Initialize cuBLAS Cublas::set_pointer_mode_host(); // Initialize Laplacian - CsrMatrix A(false, - false, - graph.number_of_vertices, - graph.number_of_vertices, - graph.number_of_edges, - 0, - graph.edge_data, - graph.offsets, - graph.indices); - LaplacianMatrix L(A); + sparse_matrix_t A{graph}; + laplacian_matrix_t L{handle, graph}; // Initialize output - cost = 0; + cost = 0; edgeCut = 0; // Iterate through partitions for (i = 0; i < nParts; ++i) { // Construct indicator vector for ith partition - thrust::for_each( - thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts), - thrust::device_pointer_cast(part_i.raw()))), - thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts + n), - thrust::device_pointer_cast(part_i.raw() + n))), - equal_to_i_op(i)); + thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple( + thrust::device_pointer_cast(parts), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple( + thrust::device_pointer_cast(parts + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(i)); cudaCheckError(); // Compute size of ith partition From 059f1ec28b6cd350009b899c9fde36f9562ee7a0 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 11 Jun 2020 01:29:02 -0400 Subject: [PATCH 093/189] minor fixes to RAFT_EXPECTS(FAIL) --- cpp/include/raft/error.hpp | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 6eeeacdb92..644cc0f855 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -135,6 +135,7 @@ struct nccl_error : public raft::exception { } // namespace raft +// FIXME: Need to be replaced with RAFT_FAIL /** macro to throw a runtime error */ #define THROW(fmt, ...) \ do { \ @@ -155,6 +156,17 @@ struct nccl_error : public raft::exception { if (!(check)) THROW(fmt, ##__VA_ARGS__); \ } while (0) +#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) \ + do { \ + char err_msg[2048]; /* NOLINT */ \ + std::snprintf(err_msg, sizeof(err_msg), "RAFT failure at %s", __FILE__); \ + msg += err_msg; \ + std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, __LINE__); \ + msg += err_msg; \ + std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__); \ + msg += err_msg; \ + } while(0) + /** * @brief Macro for checking (pre-)conditions that throws an exception when a condition is false * @@ -163,18 +175,13 @@ struct nccl_error : public raft::exception { * optinal format tagas * @throw raft::logic_error if the condition evaluates to false. 
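+ *
+ * A usage sketch (the pointer and index names are illustrative):
+ * @code{.cpp}
+ *   RAFT_EXPECTS(in != nullptr, "null input pointer in column %d", col);
+ * @endcode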
 */
-#define RAFT_EXPECTS(cond, fmt, ...)                                          \
-  do {                                                                        \
-    if (!(cond)) {                                                            \
-      std::string msg{};                                                      \
-      char err_msg[2048]; /* NOLINT */                                        \
-      std::snprintf(err_msg, sizeof(err_msg),                                 \
-                    "RAFT failure at file=%s line=%d: ", __FILE__, __LINE__); \
-      msg += err_msg;                                                         \
-      std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__);            \
-      msg += err_msg;                                                         \
-      throw raft::logic_error(msg);                                           \
-    }                                                                         \
+#define RAFT_EXPECTS(cond, fmt, ...)                              \
+  do {                                                            \
+    if (!(cond)) {                                                \
+      std::string msg{};                                          \
+      SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \
+      throw raft::logic_error(msg);                               \
+    }                                                             \
   } while (0)

 /**
 * @brief Indicates that an erroneous code path has been taken.
 *
 * @param[in] fmt String literal description of the reason that this code path is erroneous with
 * optional format tags
 * @throw always throws raft::logic_error
 */
-#define RAFT_FAIL(fmt, ...) RAFT_EXPECTS(false, fmt, ##__VA_ARGS__)
+#define RAFT_FAIL(fmt, ...)                                     \
+  do {                                                          \
+    std::string msg{};                                          \
+    SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \
+    throw raft::logic_error(msg);                               \
+  } while (0)

 namespace raft {
 namespace detail {

From 192000c7a35c3536cb489ff93261cf3d178362c7 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer
Date: Thu, 11 Jun 2020 15:58:42 -0500
Subject: [PATCH 094/189] More partition clean-up. Added sm_utils.

---
 cpp/include/raft/spectral/kmeans.hpp    |  10 +-
 cpp/include/raft/spectral/lanczos.hpp   |   1 +
 cpp/include/raft/spectral/partition.hpp |  67 +++---
 cpp/include/raft/spectral/sm_utils.hpp  | 297 ++++++++++++++++++++++++
 4 files changed, 344 insertions(+), 31 deletions(-)
 create mode 100644 cpp/include/raft/spectral/sm_utils.hpp

diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp
index 37c045b7f9..f57a4c1be5 100644
--- a/cpp/include/raft/spectral/kmeans.hpp
+++ b/cpp/include/raft/spectral/kmeans.hpp
@@ -32,9 +32,12 @@
 #include
 #include
 #include
+#include

 namespace {

+using namespace raft;
+using namespace raft::linalg;
 // =========================================================
 // Useful grid settings
 // =========================================================
@@ -328,8 +331,6 @@ static int chooseNewCentroid(handle_t handle,
                              const ValueType_* __restrict__ obs,
                              ValueType_* __restrict__ dists,
                              ValueType_* __restrict__ centroid) {
-  using namespace thrust;
-
   // Cumulative sum of distances
   ValueType_* distsCumSum = dists + n;
   // Residual sum of squares
@@ -751,8 +752,9 @@ int kmeans(handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n,
   }

   // Initialize cuBLAS
-  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST,
-                                    stream));  // ????? TODO: check / remove
+  CUBLAS_CHECK(
+    linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST,
+                                 stream));  // ?????
TODO: check / remove // ------------------------------------------------------- // k-means++ algorithm diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 8f33eb7cc2..54818e1766 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -35,6 +35,7 @@ namespace raft { namespace { using namespace matrix; +using namespace linalg; // ========================================================= // Helper functions diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 8dfa38d9a3..156be656bd 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -28,15 +28,12 @@ #include #include +#include namespace raft { -// ========================================================= -// Useful macros -// ========================================================= - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) +using namespace matrix; +using namespace linalg; template static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, @@ -120,7 +117,7 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) { // launch scaling kernel (scale each column of obs by its norm) scale_obs_kernel<<>>(m, n, obs); - cudaCheckError(); + CUDA_CHECK_LAST(); return cudaSuccess; } @@ -152,7 +149,8 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) { * @return error flag. */ template + typename ThrustExePolicy, typename EigenSolver = LanczosSolver, + typename ClusterSolver = KmeansSolver> int partition( handle_t handle, ThrustExePolicy thrust_exec_policy, cugraph::experimental::GraphCSRView const &graph, @@ -194,37 +192,47 @@ int partition( for (i = 0; i < nEigVecs; ++i) { weight_t mean, std; - mean = - thrust::reduce(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); - cudaCheckError(); + mean = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + CUDA_CHECK_LAST(); mean /= n; - thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), thrust::make_constant_iterator(mean), thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::minus()); - cudaCheckError(); - std = Cublas::nrm2(n, eigVecs + IDX(0, i, n), 1) / - std::sqrt(static_cast(n)); - thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + CUDA_CHECK_LAST(); + + CUBLAS_CHECK( + cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + + std /= std::sqrt(static_cast(n)); + + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), thrust::make_constant_iterator(std), thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::divides()); - cudaCheckError(); + CUDA_CHECK_LAST(); } // Transpose eigenvector matrix // TODO: in-place transpose { vector_t work(handle, nEigVecs * n); - Cublas::set_pointer_mode_host(); - Cublas::geam(true, false, nEigVecs, n, &one, eigVecs, n, &zero, - (weight_t *)NULL, nEigVecs, work.raw(), nEigVecs); + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n, + &one, 
eigVecs, n, &zero, (weight_t *)NULL, nEigVecs, + work.raw(), nEigVecs, stream)); + CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), - cudaMemcpyDeviceToDevice)); + cudaMemcpyDeviceToDevice, stream)); } // Clean up @@ -292,7 +300,8 @@ int analyzePartition( vector_t Lx(handle, n); // Initialize cuBLAS - Cublas::set_pointer_mode_host(); + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Laplacian sparse_matrix_t A{graph}; @@ -305,17 +314,20 @@ int analyzePartition( // Iterate through partitions for (i = 0; i < nParts; ++i) { // Construct indicator vector for ith partition - thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple( + thrust::for_each(thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple( thrust::device_pointer_cast(parts), thrust::device_pointer_cast(part_i.raw()))), thrust::make_zip_iterator(thrust::make_tuple( thrust::device_pointer_cast(parts + n), thrust::device_pointer_cast(part_i.raw() + n))), equal_to_i_op(i)); - cudaCheckError(); + CUDA_CHECK_LAST(); // Compute size of ith partition - Cublas::dot(n, part_i.raw(), 1, part_i.raw(), 1, &partSize); + CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, + &partSize, stream)); + partSize = round(partSize); if (partSize < 0.5) { WARNING("empty partition"); @@ -324,7 +336,8 @@ int analyzePartition( // Compute number of edges cut by ith partition L.mv(1, part_i.raw(), 0, Lx.raw()); - Cublas::dot(n, Lx.raw(), 1, part_i.raw(), 1, &partEdgesCut); + CUBLAS_CHECK(cublasdot(cublas_h, n, Lx.raw(), 1, part_i.raw(), 1, + &partEdgesCut, stream)); // Record results cost += partEdgesCut / partSize; diff --git a/cpp/include/raft/spectral/sm_utils.hpp b/cpp/include/raft/spectral/sm_utils.hpp new file mode 100644 index 0000000000..25d6e2e358 --- /dev/null +++ b/cpp/include/raft/spectral/sm_utils.hpp @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifdef _MSC_VER +#include +#else +#include +#endif + +#define DEFAULT_MASK 0xffffffff + +#define USE_CG 1 +//(__CUDACC_VER__ >= 80500) + +namespace raft { +namespace utils { +static __device__ __forceinline__ int lane_id() { + int id; + asm("mov.u32 %0, %%laneid;" : "=r"(id)); + return id; +} + +static __device__ __forceinline__ int lane_mask_lt() { + int mask; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask)); + return mask; +} + +static __device__ __forceinline__ int lane_mask_le() { + int mask; + asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); + return mask; +} + +static __device__ __forceinline__ int warp_id() { return threadIdx.x >> 5; } + +static __device__ __forceinline__ unsigned int ballot(int p, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#if USE_CG + return __ballot_sync(mask, p); +#else + return __ballot(p); +#endif +#else + return 0; +#endif +} + +static __device__ __forceinline__ int shfl(int r, int lane, int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#if USE_CG + return __shfl_sync(mask, r, lane, bound); +#else + return __shfl(r, lane, bound); +#endif +#else + return 0; +#endif +} + +static __device__ __forceinline__ float shfl(float r, int lane, int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#if USE_CG + return __shfl_sync(mask, r, lane, bound); +#else + return __shfl(r, lane, bound); +#endif +#else + return 0.0f; +#endif +} + +/// Warp shuffle down function +/** Warp shuffle functions on 64-bit floating point values are not + * natively implemented as of Compute Capability 5.0. This + * implementation has been copied from + * (http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler). + * Once this is natively implemented, this function can be replaced + * by __shfl_down. 
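+ *
+ * A typical warp-level reduction built on these helpers (a sketch assuming
+ * a full 32-lane warp; `partial` is this lane's contribution):
+ * @code{.cpp}
+ *   double v = partial;
+ *   for (int offset = 16; offset > 0; offset >>= 1)
+ *     v += utils::shfl_down(v, offset);
+ *   // lane 0 now holds the sum across the warp
+ * @endcode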
+ * + */ +static __device__ __forceinline__ double shfl(double r, int lane, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_sync(mask, a.x, lane, bound); + a.y = __shfl_sync(mask, a.y, lane, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl(a.x, lane, bound); + a.y = __shfl(a.y, lane, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ long long shfl(long long r, int lane, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_sync(mask, a.x, lane, bound); + a.y = __shfl_sync(mask, a.y, lane, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl(a.x, lane, bound); + a.y = __shfl(a.y, lane, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ int shfl_down(int r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_down_sync(mask, r, offset, bound); +#else + return __shfl_down(r, offset, bound); +#endif +#else + return 0.0f; +#endif +} + +static __device__ __forceinline__ float shfl_down(float r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_down_sync(mask, r, offset, bound); +#else + return __shfl_down(r, offset, bound); +#endif +#else + return 0.0f; +#endif +} + +static __device__ __forceinline__ double shfl_down(double r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down_sync(mask, a.x, offset, bound); + a.y = __shfl_down_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down(a.x, offset, bound); + a.y = __shfl_down(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ long long shfl_down(long long r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down_sync(mask, a.x, offset, bound); + a.y = __shfl_down_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down(a.x, offset, bound); + a.y = __shfl_down(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +// specifically for triangles counting +static __device__ __forceinline__ uint64_t shfl_down(uint64_t r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down_sync(mask, a.x, offset, bound); + a.y = __shfl_down_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down(mask, a.x, offset, bound); + a.y = __shfl_down(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ int shfl_up(int r, int offset, int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_up_sync(mask, r, offset, bound); +#else + return __shfl_up(r, offset, bound); +#endif +#else + return 0.0f; +#endif +} + +static 
__device__ __forceinline__ float shfl_up(float r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_up_sync(mask, r, offset, bound); +#else + return __shfl_up(r, offset, bound); +#endif +#else + return 0.0f; +#endif +} + +static __device__ __forceinline__ double shfl_up(double r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up_sync(mask, a.x, offset, bound); + a.y = __shfl_up_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up(a.x, offset, bound); + a.y = __shfl_up(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ long long shfl_up(long long r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up_sync(mask, a.x, offset, bound); + a.y = __shfl_up_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up(a.x, offset, bound); + a.y = __shfl_up(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} +} // namespace utils + +} // namespace raft From 125911c4e68776197ef2ffae647666d538fde349 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 11 Jun 2020 17:00:01 -0400 Subject: [PATCH 095/189] move error check macros from error.hpp to relevant headers --- cpp/include/raft/comms/std_comms.hpp | 41 +++- cpp/include/raft/cudart_utils.h | 62 +++++- cpp/include/raft/error.hpp | 229 +------------------- cpp/include/raft/linalg/cublas_wrappers.h | 63 ++++-- cpp/include/raft/linalg/cusolver_wrappers.h | 61 ++++-- cpp/include/raft/sparse/cusparse_wrappers.h | 64 ++++-- 6 files changed, 236 insertions(+), 284 deletions(-) diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 1ba7552f9c..4aa3f20772 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -44,13 +44,42 @@ #include #include +#include -#define NCCL_CHECK(call) \ - do { \ - ncclResult_t status = call; \ - ASSERT(ncclSuccess == status, "ERROR: NCCL call='%s'. Reason:%s\n", #call, \ - ncclGetErrorString(status)); \ - } while (0) +namespace raft { + +/** + * @brief Exception thrown when a NCCL error is encountered. + */ +struct nccl_error : public raft::exception { + explicit nccl_error(char const* const message) + : raft::exception(message) {} + explicit nccl_error(std::string const& message) + : raft::exception(message) {} +}; + +}; // namespace raft + +/** + * @brief Error checking macro for NCCL runtime API functions. 
+ *
+ * Invokes a NCCL runtime API function call; if the call does not return
+ * ncclSuccess, throws an exception detailing the NCCL error that occurred.
+ */
+#define NCCL_TRY(call)                                                 \
+  do {                                                                 \
+    ncclResult_t const status = (call);                                \
+    if (ncclSuccess != status) {                                       \
+      std::string msg{};                                               \
+      SET_ERROR_MSG(                                                   \
+        msg, "NCCL error encountered at: ", "call='%s', Reason=%d:%s", \
+        #call, status, ncclGetErrorString(status));                    \
+      throw raft::nccl_error(msg);                                     \
+    }                                                                  \
+  } while (0)
+
+/** FIXME: temporary alias for cuML compatibility */
+#define NCCL_CHECK(call) NCCL_TRY(call)

 #define NCCL_CHECK_NO_THROW(call) \
   do { \
diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h
index 2eea710897..93cf87b8cb 100644
--- a/cpp/include/raft/cudart_utils.h
+++ b/cpp/include/raft/cudart_utils.h
@@ -26,13 +26,63 @@
 ///@todo: enable once logging has been enabled in raft
 //#include "logger.hpp"

-/** check for cuda runtime API errors and assert accordingly */
-#define CUDA_CHECK(call) \
+namespace raft {
+
+/**
+ * @brief Exception thrown when a CUDA error is encountered.
+ */
+struct cuda_error : public raft::exception {
+  explicit cuda_error(char const* const message)
+    : raft::exception(message) {}
+  explicit cuda_error(std::string const& message)
+    : raft::exception(message) {}
+};
+
+}
+
+/**
+ * @brief Error checking macro for CUDA runtime API functions.
+ *
+ * Invokes a CUDA runtime API function call; if the call does not return
+ * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an
+ * exception detailing the CUDA error that occurred.
+ *
+ */
+#define CUDA_TRY(call)                                                 \
   do {                                                                 \
-    cudaError_t status = call;                                         \
-    ASSERT(status == cudaSuccess, "FAIL: call='%s'. Reason:%s", #call, \
-           cudaGetErrorString(status));                                \
-  } while (0)
+    cudaError_t const status = call;                                   \
+    if (status != cudaSuccess) {                                       \
+      cudaGetLastError();                                              \
+      std::string msg{};                                               \
+      SET_ERROR_MSG(                                                   \
+        msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", \
+        #call, cudaGetErrorName(status), cudaGetErrorString(status));  \
+      throw raft::cuda_error(msg);                                     \
+    }                                                                  \
+  } while (0)
+
+/**
+ * @brief Debug macro to check for CUDA errors
+ *
+ * In a non-release build, this macro will synchronize the specified stream
+ * before error checking. In both release and non-release builds, this macro
+ * checks for any pending CUDA errors from previous calls. If an error is
+ * reported, an exception is thrown detailing the CUDA error that occurred.
+ *
+ * The intent of this macro is to provide a mechanism for synchronous and
+ * deterministic execution for debugging asynchronous CUDA execution. It should
+ * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an
+ * asynchronous kernel launch.
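+ *
+ * A usage sketch (the kernel name and arguments are illustrative):
+ * @code{.cpp}
+ *   my_kernel<<<grid, block, 0, stream>>>(d_in, d_out, n);
+ *   CHECK_CUDA(stream);  // synchronize + check in debug, check-only in release
+ * @endcode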
+ * + */ +#ifndef NDEBUG +#define CHECK_CUDA(stream) CUDA_TRY(cudaStreamSynchronize(stream)); +#else +#define CHECK_CUDA(stream) CUDA_TRY(cudaPeekAtLastError()); +#endif + +/** FIXME: temporary alias for cuML compatibility */ +#define CUDA_CHECK(call) CUDA_TRY(call) ///@todo: enable this only after we have added logging support in raft // /** diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 644cc0f855..480805f35f 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -89,48 +89,7 @@ class exception : public std::exception { */ struct logic_error : public raft::exception { explicit logic_error(char const* const message) : raft::exception(message) {} - explicit logic_error(std::string const& message) - : raft::exception(message) {} -}; - -/** - * @brief Exception thrown when a CUDA error is encountered. - */ -struct cuda_error : public raft::exception { - explicit cuda_error(char const* const message) - : raft::exception(message) {} - explicit cuda_error(std::string const& message) - : raft::exception(message) {} -}; - -/** - * @brief Exception thrown when a cuRAND error is encountered. - */ -struct curand_error : public raft::exception { - explicit curand_error(char const* const message) - : raft::exception(message) {} - explicit curand_error(std::string const& message) - : raft::exception(message) {} -}; - -/** - * @brief Exception thrown when a cuSparse error is encountered. - */ -struct cusparse_error : public raft::exception { - explicit cusparse_error(char const* const message) - : raft::exception(message) {} - explicit cusparse_error(std::string const& message) - : raft::exception(message) {} -}; - -/** - * @brief Exception thrown when a NCCL error is encountered. - */ -struct nccl_error : public raft::exception { - explicit nccl_error(char const* const message) - : raft::exception(message) {} - explicit nccl_error(std::string const& message) - : raft::exception(message) {} + explicit logic_error(std::string const& message) : raft::exception(message) {} }; } // namespace raft @@ -156,16 +115,17 @@ struct nccl_error : public raft::exception { if (!(check)) THROW(fmt, ##__VA_ARGS__); \ } while (0) -#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) \ - do { \ - char err_msg[2048]; /* NOLINT */ \ - std::snprintf(err_msg, sizeof(err_msg), "RAFT failure at %s", __FILE__); \ - msg += err_msg; \ - std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, __LINE__); \ - msg += err_msg; \ - std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__); \ - msg += err_msg; \ - } while(0) +#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) 
\
+  do {                                                                      \
+    char err_msg[2048]; /* NOLINT */                                        \
+    std::snprintf(err_msg, sizeof(err_msg), "%s", location_prefix);         \
+    msg += err_msg;                                                         \
+    std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__,  \
+                  __LINE__);                                                \
+    msg += err_msg;                                                         \
+    std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__);            \
+    msg += err_msg;                                                         \
+  } while (0)
 
 /**
  * @brief Macro for checking (pre-)conditions that throws an exception when a condition is false
@@ -197,168 +157,3 @@
       SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \
       throw raft::logic_error(msg);                               \
     } while (0)
-
-namespace raft {
-namespace detail {
-
-inline void throw_cuda_error(cudaError_t error, const char* file,
-                             unsigned int line) {
-  throw raft::cuda_error(
-    std::string{"CUDA error encountered at: " + std::string{file} + ":" +
-                std::to_string(line) + ": " + std::to_string(error) + " " +
-                cudaGetErrorName(error) + " " + cudaGetErrorString(error)});
-}
-
-inline void throw_nccl_error(ncclResult_t error, const char* file,
-                             unsigned int line) {
-  throw raft::nccl_error(
-    std::string{"NCCL error encountered at: " + std::string{file} + ":" +
-                std::to_string(line) + ": " + std::to_string(error) + " " +
-                ncclGetErrorString(error)});
-}
-
-#define _CURAND_ERR_TO_STR(err) \
-  case err:                     \
-    return #err;
-inline auto curand_error_to_string(curandStatus_t err) -> const char* {
-  switch (err) {
-    _CURAND_ERR_TO_STR(CURAND_STATUS_SUCCESS);
-    _CURAND_ERR_TO_STR(CURAND_STATUS_VERSION_MISMATCH);
-    _CURAND_ERR_TO_STR(CURAND_STATUS_NOT_INITIALIZED);
-    _CURAND_ERR_TO_STR(CURAND_STATUS_ALLOCATION_FAILED);
-    _CURAND_ERR_TO_STR(CURAND_STATUS_TYPE_ERROR);
-    _CURAND_ERR_TO_STR(CURAND_STATUS_OUT_OF_RANGE);
-    _CURAND_ERR_TO_STR(CURAND_STATUS_LENGTH_NOT_MULTIPLE);
-    _CURAND_ERR_TO_STR(CURAND_STATUS_DOUBLE_PRECISION_REQUIRED);
-    _CURAND_ERR_TO_STR(CURAND_STATUS_LAUNCH_FAILURE);
-    _CURAND_ERR_TO_STR(CURAND_STATUS_PREEXISTING_FAILURE);
-    _CURAND_ERR_TO_STR(CURAND_STATUS_INITIALIZATION_FAILED);
-    _CURAND_ERR_TO_STR(CURAND_STATUS_ARCH_MISMATCH);
-    _CURAND_ERR_TO_STR(CURAND_STATUS_INTERNAL_ERROR);
-    default:
-      return "CURAND_STATUS_UNKNOWN";
-  };
-}
-#undef _CURAND_ERR_TO_STR
-
-inline void throw_curand_error(curandStatus_t error, const char* file,
-                               unsigned int line) {
-  throw raft::curand_error(
-    std::string{"cuRAND error encountered at: " + std::string{file} + ":" +
-                std::to_string(line) + ": " + std::to_string(error) + " " +
-                curand_error_to_string(error)});
-}
-
-// FIXME: unnecessary once CUDA 10.1+ becomes the minimum supported version
-#define _CUSPARSE_ERR_TO_STR(err) \
-  case err:                       \
-    return #err;
-inline auto cusparse_error_to_string(cusparseStatus_t err) -> const char* {
-#if defined(CUDART_VERSION) && CUDART_VERSION >= 10100
-  return cusparseGetErrorString(status);
-#else   // CUDART_VERSION
-  switch (err) {
-    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS);
-    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED);
-    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ALLOC_FAILED);
-    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INVALID_VALUE);
-    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ARCH_MISMATCH);
-    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED);
-    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR);
-    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED);
-    default:
-      return "CUSPARSE_STATUS_UNKNOWN";
-  };
-#endif  // CUDART_VERSION
-}
-#undef _CUSPARSE_ERR_TO_STR
-
-inline void throw_cusparse_error(cusparseStatus_t error, const char* file,
-                                 unsigned int line) {
-  throw
raft::cusparse_error( - std::string{"cuSparse error encountered at: " + std::string{file} + ":" + - std::to_string(line) + ": " + std::to_string(error) + " " + - cusparse_error_to_string(error)}); -} - -} // namespace detail -} // namespace raft - -/** - * @brief Error checking macro for CUDA runtime API functions. - * - * Invokes a CUDA runtime API function call, if the call does not return - * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an - * exception detailing the CUDA error that occurred - * - */ -#define CUDA_TRY(call) \ - do { \ - cudaError_t const status = (call); \ - if (cudaSuccess != status) { \ - cudaGetLastError(); \ - raft::detail::throw_cuda_error(status, __FILE__, __LINE__); \ - } \ - } while (0); - -/** - * @brief Debug macro to check for CUDA errors - * - * In a non-release build, this macro will synchronize the specified stream - * before error checking. In both release and non-release builds, this macro - * checks for any pending CUDA errors from previous calls. If an error is - * reported, an exception is thrown detailing the CUDA error that occurred. - * - * The intent of this macro is to provide a mechanism for synchronous and - * deterministic execution for debugging asynchronous CUDA execution. It should - * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an - * asynchronous kernel launch. - * - */ -#ifndef NDEBUG -#define CHECK_CUDA(stream) CUDA_TRY(cudaStreamSynchronize(stream)); -#else -#define CHECK_CUDA(stream) CUDA_TRY(cudaPeekAtLastError()); -#endif - -/** - * @brief Error checking macro for cuRAND runtime API functions. - * - * Invokes a cuRAND runtime API function call, if the call does not return - * CURAND_STATUS_SUCCESS, throws an exception detailing the cuRAND error that occurred - */ -#define CURAND_TRY(call) \ - do { \ - curandStatus_t const status = (call); \ - if (CURAND_STATUS_SUCCESS != status) { \ - raft::detail::throw_curand_error(status, __FILE__, __LINE__); \ - } \ - } while (0); - -/** - * @brief Error checking macro for cuSparse runtime API functions. - * - * Invokes a cuSparse runtime API function call, if the call does not return - * CUSPARSE_STATUS_SUCCESS, throws an exception detailing the cuSparse error that occurred - */ -#define CUSPARSE_TRY(call) \ - do { \ - cusparseStatus_t const status = (call); \ - if (CUSPARSE_STATUS_SUCCESS != status) { \ - raft::detail::throw_cusparse_error(status, __FILE__, __LINE__); \ - } \ - } while (0); - -/** - * @brief Error checking macro for NCCL runtime API functions. - * - * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an - * exception detailing the NCCL error that occurred - */ -#define NCCL_TRY(call) \ - do { \ - ncclResult_t const status = (call); \ - if (ncclSuccess != status) { \ - raft::detail::throw_nccl_error(status, __FILE__, __LINE__); \ - } \ - } while (0); diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index 84b3add031..c8d51dca57 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -16,10 +16,12 @@ #pragma once +#include + #include ///@todo: enable this once we have logger enabled //#include -#include + #include #define _CUBLAS_ERR_TO_STR(err) \ @@ -27,6 +29,17 @@ return #err namespace raft { + +/** + * @brief Exception thrown when a cuBLAS error is encountered. 
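+ *
+ * A hedged usage sketch (illustrative; `handle` is a hypothetical
+ * cublasHandle_t owned by the caller):
+ *
+ *   try {
+ *     CUBLAS_TRY(cublasCreate(&handle));
+ *   } catch (raft::cublas_error const& e) {
+ *     std::cerr << e.what() << std::endl;  // message carries file/line + reason
+ *   }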
+ */
+struct cublas_error : public raft::exception {
+  explicit cublas_error(char const* const message)
+    : raft::exception(message) {}
+  explicit cublas_error(std::string const& message)
+    : raft::exception(message) {}
+};
+
 namespace linalg {
 namespace detail {
 
@@ -53,25 +66,39 @@ inline const char *cublas_error_to_string(cublasStatus_t err) {
 
 #undef _CUBLAS_ERR_TO_STR
 
-/** check for cublas runtime API errors and assert accordingly */
-#define CUBLAS_CHECK(call)                                         \
-  do {                                                             \
-    cublasStatus_t err = call;                                     \
-    ASSERT(err == CUBLAS_STATUS_SUCCESS,                           \
-           "CUBLAS call='%s' got errorcode=%d err=%s", #call, err, \
-           raft::linalg::detail::cublas_error_to_string(err));     \
-  } while (0)
+/**
+ * @brief Error checking macro for cuBLAS runtime API functions.
+ *
+ * Invokes a cuBLAS runtime API function call, if the call does not return
+ * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred
+ */
+#define CUBLAS_TRY(call)                                                      \
+  do {                                                                        \
+    cublasStatus_t const status = (call);                                     \
+    if (CUBLAS_STATUS_SUCCESS != status) {                                    \
+      std::string msg{};                                                      \
+      SET_ERROR_MSG(                                                          \
+        msg, "cuBLAS error encountered at: ", "call='%s', Reason=%d:%s",      \
+        #call, status, raft::linalg::detail::cublas_error_to_string(status)); \
+      throw raft::cublas_error(msg);                                          \
+    }                                                                         \
+  } while(0)
+
+/** FIXME: temporary alias for cuML compatibility */
+#define CUBLAS_CHECK(call) CUBLAS_TRY(call)
 
 ///@todo: enable this once we have logging enabled
-// /** check for cublas runtime API errors but do not assert */
-// #define CUBLAS_CHECK_NO_THROW(call)                                         \
-//  do {                                                                       \
-//    cublasStatus_t err = call;                                               \
-//    if (err != CUBLAS_STATUS_SUCCESS) {                                      \
-//      CUML_LOG_ERROR("CUBLAS call='%s' got errorcode=%d err=%s", #call, err, \
-//                     raft::linalg::detail::cublas_error_to_string(err));     \
-//    }                                                                        \
-//  } while (0)
+#if 0
+/** check for cublas runtime API errors but do not assert */
+#define CUBLAS_CHECK_NO_THROW(call)                                          \
+  do {                                                                       \
+    cublasStatus_t err = call;                                               \
+    if (err != CUBLAS_STATUS_SUCCESS) {                                      \
+      CUML_LOG_ERROR("CUBLAS call='%s' got errorcode=%d err=%s", #call, err, \
+                     raft::linalg::detail::cublas_error_to_string(err));     \
+    }                                                                        \
+  } while (0)
+#endif
 
 namespace raft {
 namespace linalg {
diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h
index d7df86ac21..da5455444b 100644
--- a/cpp/include/raft/linalg/cusolver_wrappers.h
+++ b/cpp/include/raft/linalg/cusolver_wrappers.h
@@ -27,6 +27,17 @@
     return #err;
 
 namespace raft {
+
+/**
+ * @brief Exception thrown when a cuSOLVER error is encountered.
+ */
+struct cusolver_error : public raft::exception {
+  explicit cusolver_error(char const* const message)
+    : raft::exception(message) {}
+  explicit cusolver_error(std::string const& message)
+    : raft::exception(message) {}
+};
+
 namespace linalg {
 namespace detail {
 
@@ -53,25 +64,39 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) {
 
 #undef _CUSOLVER_ERR_TO_STR
 
-/** check for cusolver runtime API errors and assert accordingly */
-#define CUSOLVER_CHECK(call)                                         \
-  do {                                                               \
-    cusolverStatus_t err = call;                                     \
-    ASSERT(err == CUSOLVER_STATUS_SUCCESS,                           \
-           "CUSOLVER call='%s' got errorcode=%d err=%s", #call, err, \
-           raft::linalg::detail::cusolver_error_to_string(err));     \
+/**
+ * @brief Error checking macro for cuSOLVER runtime API functions.
+ *
+ * Invokes a cuSOLVER runtime API function call, if the call does not return
+ * CUSOLVER_STATUS_SUCCESS, throws an exception detailing the cuSOLVER error that occurred
+ */
+#define CUSOLVER_TRY(call)                                                 \
+  do {                                                                     \
+    cusolverStatus_t const status = (call);                                \
+    if (CUSOLVER_STATUS_SUCCESS != status) {                               \
+      std::string msg{};                                                   \
+      SET_ERROR_MSG(                                                       \
+        msg, "cuSOLVER error encountered at: ", "call='%s', Reason=%d:%s", \
+        #call, status, raft::linalg::detail::cusolver_error_to_string(status)); \
+      throw raft::cusolver_error(msg);                                     \
+    }                                                                      \
+  } while(0)
+
+/** FIXME: temporary alias for cuML compatibility */
+#define CUSOLVER_CHECK(call) CUSOLVER_TRY(call)
+
+//@todo: enable this once logging is enabled
+#if 0
+/** check for cusolver runtime API errors but do not assert */
+#define CUSOLVER_CHECK_NO_THROW(call)                                          \
+  do {                                                                         \
+    cusolverStatus_t err = call;                                               \
+    if (err != CUSOLVER_STATUS_SUCCESS) {                                      \
+      CUML_LOG_ERROR("CUSOLVER call='%s' got errorcode=%d err=%s", #call, err, \
+                     raft::linalg::detail::cusolver_error_to_string(err));     \
+    }                                                                          \
   } while (0)
-
-///@todo: enable this once logging is enabled
-// /** check for cusolver runtime API errors but do not assert */
-// #define CUSOLVER_CHECK_NO_THROW(call)                                         \
-//  do {                                                                         \
-//    cusolverStatus_t err = call;                                               \
-//    if (err != CUSOLVER_STATUS_SUCCESS) {                                      \
-//      CUML_LOG_ERROR("CUSOLVER call='%s' got errorcode=%d err=%s", #call, err, \
-//                     raft::linalg::detail::cusolver_error_to_string(err));     \
-//    }                                                                          \
-//  } while (0)
+#endif
 
 namespace raft {
 namespace linalg {
diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h
index a4a8173b88..a337c8289b 100644
--- a/cpp/include/raft/sparse/cusparse_wrappers.h
+++ b/cpp/include/raft/sparse/cusparse_wrappers.h
@@ -16,16 +16,28 @@
 
 #pragma once
 
+#include
+
 #include
 ///@todo: enable this once logging is enabled
 //#include
-#include
 
 #define _CUSPARSE_ERR_TO_STR(err) \
   case err:                       \
     return #err;
 
 namespace raft {
+
+/**
+ * @brief Exception thrown when a cuSparse error is encountered.
+ */
+struct cusparse_error : public raft::exception {
+  explicit cusparse_error(char const* const message)
+    : raft::exception(message) {}
+  explicit cusparse_error(std::string const& message)
+    : raft::exception(message) {}
+};
+
 namespace sparse {
 namespace detail {
 
@@ -54,25 +66,39 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) {
 
 #undef _CUSPARSE_ERR_TO_STR
 
-/** check for cusparse runtime API errors and assert accordingly */
-#define CUSPARSE_CHECK(call)                                         \
-  do {                                                               \
-    cusparseStatus_t err = call;                                     \
-    ASSERT(err == CUSPARSE_STATUS_SUCCESS,                           \
-           "CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \
-           raft::sparse::detail::cusparse_error_to_string(err));     \
+/**
+ * @brief Error checking macro for cuSparse runtime API functions.
+ * + * Invokes a cuSparse runtime API function call, if the call does not return + * CUSPARSE_STATUS_SUCCESS, throws an exception detailing the cuSparse error that occurred + */ +#define CUSPARSE_TRY(call) \ + do { \ + cusparseStatus_t const status = (call); \ + if (CUSPARSE_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG( \ + msg, "cuSparse error encountered at: ", "call='%s', Reason=%d:%s", \ + #call, status, raft::sparse::detail::cusparse_error_to_string(status)); \ + throw raft::cusparse_error(msg); \ + } \ + } while(0) + +/** FIXME: temporary alias for cuML compatibility */ +#define CUSPARSE_CHECK(call) CUSPARSE_TRY(call) + +//@todo: enable this once logging is enabled +#if 0 +/** check for cusparse runtime API errors but do not assert */ +#define CUSPARSE_CHECK_NO_THROW(call) \ + do { \ + cusparseStatus_t err = call; \ + if (err != CUSPARSE_STATUS_SUCCESS) { \ + CUML_LOG_ERROR("CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \ + raft::sparse::detail::cusparse_error_to_string(err)); \ + } \ } while (0) - -///@todo: enable this once logging is enabled -// /** check for cusparse runtime API errors but do not assert */ -// #define CUSPARSE_CHECK_NO_THROW(call) \ -// do { \ -// cusparseStatus_t err = call; \ -// if (err != CUSPARSE_STATUS_SUCCESS) { \ -// CUML_LOG_ERROR("CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \ -// raft::sparse::detail::cusparse_error_to_string(err)); \ -// } \ -// } while (0) +#endif namespace raft { namespace sparse { From d3192f42de52b89aa7a119ad54d8c4f4fe28db4c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 11 Jun 2020 17:07:06 -0400 Subject: [PATCH 096/189] clang-format --- cpp/include/raft/comms/std_comms.hpp | 26 ++++++++--------- cpp/include/raft/cudart_utils.h | 32 ++++++++++----------- cpp/include/raft/linalg/cublas_wrappers.h | 7 ++--- cpp/include/raft/linalg/cusolver_wrappers.h | 26 ++++++++--------- cpp/include/raft/sparse/cusparse_wrappers.h | 22 +++++++------- 5 files changed, 54 insertions(+), 59 deletions(-) diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 4aa3f20772..89987a1db7 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -52,10 +52,8 @@ namespace raft { * @brief Exception thrown when a NCCL error is encountered. 
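 *
 * A small illustrative guard (sketch only; `sendbuf`, `recvbuf`, `count`,
 * `comm`, and `stream` are hypothetical caller-side names):
 *
 *   NCCL_TRY(ncclAllReduce(sendbuf, recvbuf, count, ncclFloat, ncclSum,
 *                          comm, stream));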
*/ struct nccl_error : public raft::exception { - explicit nccl_error(char const* const message) - : raft::exception(message) {} - explicit nccl_error(std::string const& message) - : raft::exception(message) {} + explicit nccl_error(char const *const message) : raft::exception(message) {} + explicit nccl_error(std::string const &message) : raft::exception(message) {} }; }; // namespace raft @@ -66,16 +64,16 @@ struct nccl_error : public raft::exception { * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an * exception detailing the NCCL error that occurred */ -#define NCCL_TRY(call) \ - do { \ - ncclResult_t const status = (call); \ - if (ncclSuccess != status) { \ - std::string msg{}; \ - SET_ERROR_MSG( \ - msg, "NCCL error encountered at: ", "call='%s', Reason=%d:%s", \ - #call, status, ncclGetErrorString(status)); \ - throw raft::nccl_error(msg); \ - } \ +#define NCCL_TRY(call) \ + do { \ + ncclResult_t const status = (call); \ + if (ncclSuccess != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "NCCL error encountered at: ", "call='%s', Reason=%d:%s", \ + #call, status, ncclGetErrorString(status)); \ + throw raft::nccl_error(msg); \ + } \ } while (0); /** FIXME: temporary alias for cuML compatibility */ diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 93cf87b8cb..2ea7cb3612 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -32,13 +32,11 @@ namespace raft { * @brief Exception thrown when a CUDA error is encountered. */ struct cuda_error : public raft::exception { - explicit cuda_error(char const* const message) - : raft::exception(message) {} - explicit cuda_error(std::string const& message) - : raft::exception(message) {} + explicit cuda_error(char const* const message) : raft::exception(message) {} + explicit cuda_error(std::string const& message) : raft::exception(message) {} }; -} +} // namespace raft /** * @brief Error checking macro for CUDA runtime API functions. @@ -48,18 +46,18 @@ struct cuda_error : public raft::exception { * exception detailing the CUDA error that occurred * */ -#define CUDA_TRY(call) \ - do { \ - cudaError_t const status = call; \ - if (status != cudaSuccess) { \ - cudaGetLastError(); \ - std::string msg{}; \ - SET_ERROR_MSG( \ - msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", \ - #call, cudaGetErrorName(status), cudaGetErrorString(status)); \ - throw raft::cuda_error(msg); \ - } \ - } while(0) +#define CUDA_TRY(call) \ + do { \ + cudaError_t const status = call; \ + if (status != cudaSuccess) { \ + cudaGetLastError(); \ + std::string msg{}; \ + SET_ERROR_MSG( \ + msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", #call, \ + cudaGetErrorName(status), cudaGetErrorString(status)); \ + throw raft::cuda_error(msg); \ + } \ + } while (0) /** * @brief Debug macro to check for CUDA errors diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index c8d51dca57..83f600a49d 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -34,9 +34,8 @@ namespace raft { * @brief Exception thrown when a cuBLAS error is encountered. 
 */
 struct cublas_error : public raft::exception {
-  explicit cublas_error(char const* const message)
-    : raft::exception(message) {}
-  explicit cublas_error(std::string const& message)
+  explicit cublas_error(char const *const message) : raft::exception(message) {}
+  explicit cublas_error(std::string const &message)
     : raft::exception(message) {}
 };
 
@@ -82,7 +81,7 @@ inline const char *cublas_error_to_string(cublasStatus_t err) {
         #call, status, raft::linalg::detail::cublas_error_to_string(status)); \
       throw raft::cublas_error(msg);                                          \
     }                                                                         \
-  } while(0)
+  } while (0)
 
 /** FIXME: temporary alias for cuML compatibility */
 #define CUBLAS_CHECK(call) CUBLAS_TRY(call)
diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h
index da5455444b..eed8f9efd2 100644
--- a/cpp/include/raft/linalg/cusolver_wrappers.h
+++ b/cpp/include/raft/linalg/cusolver_wrappers.h
@@ -32,9 +32,9 @@ namespace raft {
  * @brief Exception thrown when a cuSOLVER error is encountered.
  */
 struct cusolver_error : public raft::exception {
-  explicit cusolver_error(char const* const message)
+  explicit cusolver_error(char const *const message)
     : raft::exception(message) {}
-  explicit cusolver_error(std::string const& message)
+  explicit cusolver_error(std::string const &message)
     : raft::exception(message) {}
 };
 
@@ -70,17 +70,17 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) {
  * Invokes a cuSOLVER runtime API function call, if the call does not return
 * CUSOLVER_STATUS_SUCCESS, throws an exception detailing the cuSOLVER error that occurred
  */
-#define CUSOLVER_TRY(call)                                                 \
-  do {                                                                     \
-    cusolverStatus_t const status = (call);                                \
-    if (CUSOLVER_STATUS_SUCCESS != status) {                               \
-      std::string msg{};                                                   \
-      SET_ERROR_MSG(                                                       \
-        msg, "cuSOLVER error encountered at: ", "call='%s', Reason=%d:%s", \
-        #call, status, raft::linalg::detail::cusolver_error_to_string(status)); \
-      throw raft::cusolver_error(msg);                                     \
-    }                                                                      \
-  } while(0)
+#define CUSOLVER_TRY(call)                                                   \
+  do {                                                                       \
+    cusolverStatus_t const status = (call);                                  \
+    if (CUSOLVER_STATUS_SUCCESS != status) {                                 \
+      std::string msg{};                                                     \
+      SET_ERROR_MSG(msg, "cuSOLVER error encountered at: ",                  \
+                    "call='%s', Reason=%d:%s", #call, status,                \
+                    raft::linalg::detail::cusolver_error_to_string(status)); \
+      throw raft::cusolver_error(msg);                                       \
+    }                                                                        \
+  } while (0)
 
 /** FIXME: temporary alias for cuML compatibility */
 #define CUSOLVER_CHECK(call) CUSOLVER_TRY(call)
diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h
index a337c8289b..3b174b4b13 100644
--- a/cpp/include/raft/sparse/cusparse_wrappers.h
+++ b/cpp/include/raft/sparse/cusparse_wrappers.h
@@ -72,17 +72,17 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) {
  * Invokes a cuSparse runtime API function call, if the call does not return
 * CUSPARSE_STATUS_SUCCESS, throws an exception detailing the cuSparse error that occurred
  */
-#define CUSPARSE_TRY(call)                                                 \
-  do {                                                                     \
-    cusparseStatus_t const status = (call);                                \
-    if (CUSPARSE_STATUS_SUCCESS != status) {                               \
-      std::string msg{};                                                   \
-      SET_ERROR_MSG(                                                       \
-        msg, "cuSparse error encountered at: ", "call='%s', Reason=%d:%s", \
-        #call, status, raft::sparse::detail::cusparse_error_to_string(status)); \
-      throw raft::cusparse_error(msg);                                     \
-    }                                                                      \
-  } while(0)
+#define CUSPARSE_TRY(call)                                                   \
+  do {                                                                       \
+    cusparseStatus_t const status = (call);                                  \
+    if (CUSPARSE_STATUS_SUCCESS != status) {                                 \
+      std::string msg{};                                                     \
+      SET_ERROR_MSG(msg, "cuSparse error encountered at: ",                  \
+                    "call='%s', Reason=%d:%s", #call, status,
\
+                    raft::sparse::detail::cusparse_error_to_string(status)); \
+      throw raft::cusparse_error(msg);                                       \
+    }                                                                        \
+  } while (0)
 
 /** FIXME: temporary alias for cuML compatibility */
 #define CUSPARSE_CHECK(call) CUSPARSE_TRY(call)

From ec0cf978c3d4bfbabfa20111d6ef41172b600c19 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Thu, 11 Jun 2020 17:37:09 -0400
Subject: [PATCH 097/189] cosmetic updates

---
 cpp/include/raft/cudart_utils.h | 2 +-
 cpp/include/raft/error.hpp      | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h
index 2ea7cb3612..329e7e2354 100644
--- a/cpp/include/raft/cudart_utils.h
+++ b/cpp/include/raft/cudart_utils.h
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "raft/error.hpp"
+#include <raft/error.hpp>
 
 #include
 
diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp
index 480805f35f..3801792fbc 100644
--- a/cpp/include/raft/error.hpp
+++ b/cpp/include/raft/error.hpp
@@ -37,19 +37,19 @@ class exception : public std::exception {
   explicit exception() noexcept : std::exception(), msg_() {}
 
   /** copy ctor */
-  exception(const exception& src) noexcept
+  exception(exception const& src) noexcept
     : std::exception(), msg_(src.what()) {
     collect_call_stack();
   }
 
   /** ctor from an input message */
-  explicit exception(const std::string _msg) noexcept
-    : std::exception(), msg_(std::move(_msg)) {
+  explicit exception(std::string const msg) noexcept
+    : std::exception(), msg_(std::move(msg)) {
     collect_call_stack();
   }
 
   /** get the message associated with this exception */
-  const char* what() const noexcept override { return msg_.c_str(); }
+  char const* what() const noexcept override { return msg_.c_str(); }
 
  private:
   /** message associated with this exception */

From f8f8d32313eef5b4f1926ec3d67a39ff09012c83 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Thu, 11 Jun 2020 17:51:30 -0400
Subject: [PATCH 098/189] cosmetic updates

---
 cpp/include/raft/comms/std_comms.hpp        |  8 ++++----
 cpp/include/raft/cudart_utils.h             |  6 +++---
 cpp/include/raft/linalg/cublas_wrappers.h   | 10 +++++-----
 cpp/include/raft/linalg/cusolver_wrappers.h | 10 +++++-----
 cpp/include/raft/sparse/cusparse_wrappers.h | 10 +++++-----
 5 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp
index 89987a1db7..7a9d834d02 100644
--- a/cpp/include/raft/comms/std_comms.hpp
+++ b/cpp/include/raft/comms/std_comms.hpp
@@ -52,11 +52,11 @@ namespace raft {
  * @brief Exception thrown when a NCCL error is encountered.
  */
 struct nccl_error : public raft::exception {
-  explicit nccl_error(char const *const message) : raft::exception(message) {}
-  explicit nccl_error(std::string const &message) : raft::exception(message) {}
+  explicit nccl_error(char const* const message) : raft::exception(message) {}
+  explicit nccl_error(std::string const& message) : raft::exception(message) {}
 };
 
-}; // namespace raft
+} // namespace raft
 
@@ -82,7 +82,7 @@ struct nccl_error : public raft::exception {
 #define NCCL_CHECK_NO_THROW(call)                          \
   do {                                                     \
     ncclResult_t status = call;                            \
-    if (status != ncclSuccess) {                           \
+    if (ncclSuccess != status) {                           \
       printf("NCCL call='%s' failed. 
Reason:%s\n", #call, \ ncclGetErrorString(status)); \ } \ diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 329e7e2354..f9d99987f1 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -89,8 +89,8 @@ struct cuda_error : public raft::exception { // */ #define CUDA_CHECK_NO_THROW(call) \ do { \ - cudaError_t status = call; \ - if (status != cudaSuccess) { \ + cudaError_t const status = call; \ + if (cudaSuccess != status) { \ printf("CUDA call='%s' at file=%s line=%d failed with %s\n", #call, \ __FILE__, __LINE__, cudaGetErrorString(status)); \ } \ @@ -188,4 +188,4 @@ void print_device_vector(const char* variable_name, const T* devMem, } /** @} */ -}; // namespace raft +} // namespace raft diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index 83f600a49d..7e8a52196a 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -59,9 +59,9 @@ inline const char *cublas_error_to_string(cublasStatus_t err) { }; } -}; // namespace detail -}; // namespace linalg -}; // namespace raft +} // namespace detail +} // namespace linalg +} // namespace raft #undef _CUBLAS_ERR_TO_STR @@ -578,5 +578,5 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x, } /** @} */ -}; // namespace linalg -}; // namespace raft +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h index eed8f9efd2..a65042a2fd 100644 --- a/cpp/include/raft/linalg/cusolver_wrappers.h +++ b/cpp/include/raft/linalg/cusolver_wrappers.h @@ -58,9 +58,9 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) { }; } -}; // namespace detail -}; // namespace linalg -}; // namespace raft +} // namespace detail +} // namespace linalg +} // namespace raft #undef _CUSOLVER_ERR_TO_STR @@ -718,5 +718,5 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT } /** @} */ -}; // namespace linalg -}; // namespace raft +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 3b174b4b13..9de242ea10 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -60,9 +60,9 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { #endif // CUDART_VERSION } -}; // namespace detail -}; // namespace sparse -}; // namespace raft +} // namespace detail +} // namespace sparse +} // namespace raft #undef _CUSPARSE_ERR_TO_STR @@ -198,5 +198,5 @@ inline cusparseStatus_t cusparsegemmi( } /** @} */ -}; // namespace sparse -}; // namespace raft +} // namespace sparse +} // namespace raft From c3f153da98867db21087cf4ac5346aa4684fb192 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 11 Jun 2020 17:58:51 -0400 Subject: [PATCH 099/189] stifle some warnings --- cpp/include/raft/comms/std_comms.hpp | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 7a9d834d02..30644d8889 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -92,8 +92,6 @@ namespace raft { namespace comms { static size_t get_datatype_size(const datatype_t datatype) { - size_t ret = -1; - switch (datatype) { case datatype_t::CHAR: return sizeof(char); @@ -112,7 +110,7 @@ static size_t 
get_datatype_size(const datatype_t datatype) { case datatype_t::FLOAT64: return sizeof(double); default: - throw "Unsupported"; + RAFT_FAIL("Unsupported datatype."); } } @@ -172,13 +170,13 @@ class std_comms : public comms_iface { const std::shared_ptr device_allocator, cudaStream_t stream) : nccl_comm_(nccl_comm), - ucp_worker_(ucp_worker), - ucp_eps_(eps), + stream_(stream), num_ranks_(num_ranks), rank_(rank), - device_allocator_(device_allocator), - stream_(stream), - next_request_id_(0) { + ucp_worker_(ucp_worker), + ucp_eps_(eps), + next_request_id_(0), + device_allocator_(device_allocator) { initialize(); }; @@ -192,10 +190,10 @@ class std_comms : public comms_iface { const std::shared_ptr device_allocator, cudaStream_t stream) : nccl_comm_(nccl_comm), + stream_(stream), num_ranks_(num_ranks), rank_(rank), - device_allocator_(device_allocator), - stream_(stream) { + device_allocator_(device_allocator) { initialize(); }; From 85c9b7d6c29db429bde187426fd09f934552c879 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 11 Jun 2020 18:26:59 -0400 Subject: [PATCH 100/189] clang-format error --- cpp/include/raft/comms/std_comms.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 30644d8889..7304f3bd4e 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -52,8 +52,8 @@ namespace raft { * @brief Exception thrown when a NCCL error is encountered. */ struct nccl_error : public raft::exception { - explicit nccl_error(char const* const message) : raft::exception(message) {} - explicit nccl_error(std::string const& message) : raft::exception(message) {} + explicit nccl_error(char const *const message) : raft::exception(message) {} + explicit nccl_error(std::string const &message) : raft::exception(message) {} }; } // namespace raft From 3361dc35d6808e7aec21bab4243c7b02468ca245 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 11 Jun 2020 17:38:17 -0500 Subject: [PATCH 101/189] Update partition with eigen solver interface. --- cpp/include/raft/spectral/eigen_solvers.hpp | 87 +++++++++++++++++++++ cpp/include/raft/spectral/lanczos.hpp | 4 +- cpp/include/raft/spectral/partition.hpp | 22 +++--- 3 files changed, 99 insertions(+), 14 deletions(-) create mode 100644 cpp/include/raft/spectral/eigen_solvers.hpp diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp new file mode 100644 index 0000000000..fcb099a556 --- /dev/null +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +namespace raft { + + using namespace matrix; + + // aggregate of control params for Eigen Solver: + // + template + struct eigen_solver_config_t { + size_type_t n_eigVecs; + size_type_t maxIter; + + size_type_t restartIter; + value_type_t tol; + + bool reorthogonalize; + unsigned long long seed{1234567}; + }; + + template + struct lanczos_solver_t { + explicit lanczos_solver_t(eigen_solver_config_t const& config): + config_(config) + { + } + + index_type_t solve_smallest_eigenvectors(handle_t handle, sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) { + index_type_t iters{}; + RAFT_TRY(computeSmallestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed)); + return iters; + } + + index_type_t solve_largest_eigenvectors(handle_t handle, sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) { + index_type_t iters{}; + RAFT_TRY(computeLargestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed)); + return iters; + } + + decltype(auto) get_config(void) const + { + return config_; + } + + private: + eigen_solver_config_t config_; + }; +} // namespace raft diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 54818e1766..c4ab61b78e 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -32,11 +32,11 @@ namespace raft { -namespace { - using namespace matrix; using namespace linalg; +namespace { + // ========================================================= // Helper functions // ========================================================= diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 156be656bd..c8a284fbb1 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -15,8 +15,6 @@ */ #pragma once -#include "include/partition.hxx" - #include #include @@ -27,7 +25,7 @@ #include #include -#include +#include #include namespace raft { @@ -149,22 +147,21 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) { * @return error flag. 
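 *
 * Illustrative call under the new interface (a sketch, not part of this
 * patch; concrete config values are hypothetical):
 *
 *   eigen_solver_config_t<int, float> eig_cfg{8, 100, 500, 1e-3f};
 *   lanczos_solver_t<int, float> eig_solver{eig_cfg};
 *   partition(handle, thrust::cuda::par.on(stream), graph, nParts,
 *             eig_solver, maxIter_kmeans, tol_kmeans, parts, eigVals,
 *             eigVecs);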
 */
 template <typename vertex_t, typename edge_t, typename weight_t,
-          typename ThrustExePolicy>
+          typename ThrustExePolicy,
+          typename EigenSolver = lanczos_solver_t<vertex_t, weight_t>,
+          typename ClusterSolver = KmeansSolver<vertex_t, weight_t>>
 int partition(
   handle_t handle, ThrustExePolicy thrust_exec_policy,
   cugraph::experimental::GraphCSRView<vertex_t, edge_t, weight_t> const &graph,
-  vertex_t nParts, vertex_t nEigVecs, int maxIter_lanczos,
-  int restartIter_lanczos, weight_t tol_lanczos, int maxIter_kmeans,
-  weight_t tol_kmeans, vertex_t *__restrict__ parts, weight_t *eigVals,
-  weight_t *eigVecs) {
+  vertex_t nParts, EigenSolver eigen_solver,
+  int maxIter_kmeans, weight_t tol_kmeans, vertex_t *__restrict__ parts,
+  weight_t *eigVals, weight_t *eigVecs) {
   const weight_t zero{0.0};
   const weight_t one{1.0};
 
   auto cublas_h = handle.get_cublas_handle();
   auto stream = handle.get_stream();
 
-  int iters_lanczos;
+  int iters_eig_solver;
   int iters_kmeans;
 
   edge_t i;
   edge_t n = graph.number_of_vertices;
 
   // -------------------------------------------------------
   // Spectral partitioner
   // -------------------------------------------------------
 
   // Compute eigenvectors of Laplacian
 
   // Initialize Laplacian
   sparse_matrix_t<vertex_t, weight_t> A{graph};
   laplacian_matrix_t<vertex_t, weight_t> L{handle, graph};
 
+  auto eigen_config = eigen_solver.get_config();
+  auto nEigVecs = eigen_config.n_eigVecs;
+
   // Compute smallest eigenvalues and eigenvectors
-  RAFT_TRY(computeSmallestEigenvectors(L, nEigVecs, maxIter_lanczos,
-                                       restartIter_lanczos, tol_lanczos, false,
-                                       iters_lanczos, eigVals, eigVecs));
+  iters_eig_solver =
+    eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);
 
   // Whiten eigenvector matrix
   for (i = 0; i < nEigVecs; ++i) {

From 0373a52046ccdba7515b731eae7d95e67c6160ae Mon Sep 17 00:00:00 2001
From: Andrei Schaffer
Date: Thu, 11 Jun 2020 18:06:04 -0500
Subject: [PATCH 102/189] Format and some comments.

---
 cpp/include/raft/spectral/eigen_solvers.hpp | 109 +++++++++-----------
 1 file changed, 48 insertions(+), 61 deletions(-)

diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp
index fcb099a556..2f87c95b3a 100644
--- a/cpp/include/raft/spectral/eigen_solvers.hpp
+++ b/cpp/include/raft/spectral/eigen_solvers.hpp
@@ -19,69 +19,56 @@
 
 namespace raft {
 
-  using namespace matrix;
-
-  // aggregate of control params for Eigen Solver:
-  //
-  template <typename index_type_t, typename value_type_t, typename size_type_t = int>
-  struct eigen_solver_config_t {
-    size_type_t n_eigVecs;
-    size_type_t maxIter;
-
-    size_type_t restartIter;
-    value_type_t tol;
-
-    bool reorthogonalize;
-    unsigned long long seed{1234567};
-  };
-
-  template <typename index_type_t, typename value_type_t, typename size_type_t = int>
-  struct lanczos_solver_t {
-    explicit lanczos_solver_t(eigen_solver_config_t<index_type_t, value_type_t, size_type_t> const& config):
-      config_(config)
-    {
-    }
-
-    index_type_t solve_smallest_eigenvectors(handle_t handle, sparse_matrix_t<index_type_t, value_type_t> const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) {
-      index_type_t iters{};
-      RAFT_TRY(computeSmallestEigenvectors(handle,
-                                           A,
-                                           config_.n_eigVecs,
-                                           config_.maxIter,
-                                           config_.restartIter,
-                                           config_.tol,
-                                           config_.reorthogonalize,
-                                           iters,
-                                           eigVals,
-                                           eigVecs,
-                                           config_.seed));
-      return iters;
-    }
-
-    index_type_t solve_largest_eigenvectors(handle_t handle, sparse_matrix_t<index_type_t, value_type_t> const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) {
-      index_type_t iters{};
-      RAFT_TRY(computeLargestEigenvectors(handle,
-                                          A,
-                                          config_.n_eigVecs,
-                                          config_.maxIter,
-                                          config_.restartIter,
-                                          config_.tol,
config_.reorthogonalize, - iters, - eigVals, - eigVecs, - config_.seed)); - return iters; - } +template +struct lanczos_solver_t { + explicit lanczos_solver_t( + eigen_solver_config_t const& config) + : config_(config) {} - decltype(auto) get_config(void) const - { - return config_; - } - - private: - eigen_solver_config_t config_; - }; -} // namespace raft + index_type_t solve_smallest_eigenvectors( + handle_t handle, sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) { + index_type_t iters{}; + RAFT_TRY(computeSmallestEigenvectors( + handle, A, config_.n_eigVecs, config_.maxIter, config_.restartIter, + config_.tol, config_.reorthogonalize, iters, eigVals, eigVecs, + config_.seed)); + return iters; + } + + index_type_t solve_largest_eigenvectors( + handle_t handle, sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) { + index_type_t iters{}; + RAFT_TRY(computeLargestEigenvectors(handle, A, config_.n_eigVecs, + config_.maxIter, config_.restartIter, + config_.tol, config_.reorthogonalize, + iters, eigVals, eigVecs, config_.seed)); + return iters; + } + + decltype(auto) get_config(void) const { return config_; } + + private: + eigen_solver_config_t config_; +}; +} // namespace raft From 308e893e587f0b55ca576842ca6687219a0ec96a Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 11 Jun 2020 21:54:38 -0500 Subject: [PATCH 103/189] Added generic cluster solvers. Partition clean-up. --- cpp/include/raft/spectral/cluster_solvers.hpp | 63 +++++++++++++++++++ cpp/include/raft/spectral/eigen_solvers.hpp | 8 ++- cpp/include/raft/spectral/partition.hpp | 37 ++++++----- 3 files changed, 86 insertions(+), 22 deletions(-) create mode 100644 cpp/include/raft/spectral/cluster_solvers.hpp diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp new file mode 100644 index 0000000000..cd0963506f --- /dev/null +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include // for std::pair + +namespace raft { + +using namespace matrix; + +// aggregate of control params for Eigen Solver: +// +template +struct cluster_solver_config_t { + size_type_t n_clusters; + size_type_t maxIter; + + value_type_t tol; + + unsigned long long seed{123456}; +}; + +template +struct kmeans_solver_t { + explicit kmeans_solver_t( + cluster_solver_config_t const& config) + : config_(config) {} + + template + std::pair solve( + handle_t handle, thrust_exe_policy_t t_exe_policy, size_type_t n_obs_vecs, + size_type_t dim, value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const { + value_type_t residual{}; + index_type_t iters{}; + RAFT_TRY(kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, + config_.tol, config_.maxIter, obs, codes, residual, iters, + config_.seed)); + return std::make_pair(residual, iters); + } + + auto const& get_config(void) const { return config_; } + + private: + cluster_solver_config_t config_; +}; +} // namespace raft diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index 2f87c95b3a..9c1258c432 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -46,7 +46,8 @@ struct lanczos_solver_t { index_type_t solve_smallest_eigenvectors( handle_t handle, sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) { + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const { index_type_t iters{}; RAFT_TRY(computeSmallestEigenvectors( handle, A, config_.n_eigVecs, config_.maxIter, config_.restartIter, @@ -57,7 +58,8 @@ struct lanczos_solver_t { index_type_t solve_largest_eigenvectors( handle_t handle, sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) { + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const { index_type_t iters{}; RAFT_TRY(computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, config_.restartIter, @@ -66,7 +68,7 @@ struct lanczos_solver_t { return iters; } - decltype(auto) get_config(void) const { return config_; } + auto const& get_config(void) const { return config_; } private: eigen_solver_config_t config_; diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index c8a284fbb1..0e858ac90d 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -24,7 +24,9 @@ #include #include -#include +#include + +#include #include #include @@ -147,29 +149,26 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) { * @return error flag. 
 */
 template <typename vertex_t, typename edge_t, typename weight_t,
           typename ThrustExePolicy,
           typename EigenSolver = lanczos_solver_t<vertex_t, weight_t>,
-          typename ClusterSolver = KmeansSolver<vertex_t, weight_t>>
-int partition(
+          typename ClusterSolver = kmeans_solver_t<vertex_t, weight_t>>
+std::tuple<vertex_t, weight_t, vertex_t> partition(
   handle_t handle, ThrustExePolicy thrust_exec_policy,
   cugraph::experimental::GraphCSRView<vertex_t, edge_t, weight_t> const &graph,
-  vertex_t nParts, EigenSolver eigen_solver,
-  int maxIter_kmeans, weight_t tol_kmeans, vertex_t *__restrict__ parts,
-  weight_t *eigVals, weight_t *eigVecs) {
+  EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver,
+  vertex_t *__restrict__ parts, weight_t *eigVals, weight_t *eigVecs) {
   const weight_t zero{0.0};
   const weight_t one{1.0};
 
   auto cublas_h = handle.get_cublas_handle();
   auto stream = handle.get_stream();
 
-  int iters_eig_solver;
-  int iters_kmeans;
+  std::tuple<vertex_t, weight_t, vertex_t>
+    stats;  //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver
 
   edge_t i;
   edge_t n = graph.number_of_vertices;
 
-  // k-means residual
-  weight_t residual_kmeans;
-
   // -------------------------------------------------------
   // Spectral partitioner
   // -------------------------------------------------------
@@ -178,10 +179,11 @@ int partition(
   sparse_matrix_t<vertex_t, weight_t> A{graph};
   laplacian_matrix_t<vertex_t, weight_t> L{handle, graph};
 
   auto eigen_config = eigen_solver.get_config();
   auto nEigVecs = eigen_config.n_eigVecs;
 
   // Compute smallest eigenvalues and eigenvectors
-  iters_eig_solver =
-    eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);
+  stats.get<0>() = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);
@@ -226,14 +229,14 @@ int partition(
                            cudaMemcpyDeviceToDevice, stream));
   }
 
-  // Clean up
+  // Find partition with clustering
+  auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n,
+                                           nEigVecs, eigVecs, parts);
 
-  // eigVecs.dump(0, nEigVecs*n);
-  // Find partition with k-means clustering
-  RAFT_TRY(kmeans(n, nEigVecs, nParts, tol_kmeans, maxIter_kmeans, eigVecs,
-                  parts, residual_kmeans, iters_kmeans));
+  stats.get<1>() = pair_cluster.first;
+  stats.get<2>() = pair_cluster.second;
 
-  return 0;
+  return stats;
 }

From 8fb1c0425c644158017ebad5b9e8aa5b92645931 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer
Date: Thu, 11 Jun 2020 22:01:55 -0500
Subject: [PATCH 104/189] Fixed tuple.

---
 cpp/include/raft/spectral/partition.hpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp
index 0e858ac90d..ea5bcdd1d8 100644
--- a/cpp/include/raft/spectral/partition.hpp
+++ b/cpp/include/raft/spectral/partition.hpp
@@ -183,7 +183,8 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(
   auto nEigVecs = eigen_config.n_eigVecs;
 
   // Compute smallest eigenvalues and eigenvectors
-  stats.get<0>() = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);
+  std::get<0>(stats) =
+    eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);
@@ -236,8 +237,8 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(
   auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n,
                                            nEigVecs, eigVecs, parts);
 
-  stats.get<1>() = pair_cluster.first;
-  stats.get<2>() = pair_cluster.second;
+  std::get<1>(stats) = pair_cluster.first;
+  std::get<2>(stats) = pair_cluster.second;
 
   return stats;
 }

From 25756ecafbd566f89dfd46216672c2c105693158 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer
Date: Fri, 12 Jun 2020 16:59:13 -0500
Subject: [PATCH 105/189] Modularity Maximization refactor. More cleanup in
 matrix wrappers and partition.
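
The mv() overrides in matrix_wrappers are still TODO stubs; the intended
semantics, per the inline comments, is roughly the following sketch (not
final code, and the cusparse spmv path still needs to be wired in; x and y
are hypothetical device pointers):

    sparse_matrix_t<int, float> A{graph};
    laplacian_matrix_t<int, float> L{handle, graph};
    L.mv(1, x, 0, y);  // y = D*x - A*x
    modularity_matrix_t<int, float> B{handle, graph};
    B.mv(1, x, 0, y);  // y = A*x - (d'x / edge_sum)*d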
--- cpp/include/raft/spectral/matrix_wrappers.hpp | 216 +++++++++- .../raft/spectral/modularity_maximization.hpp | 383 ++++-------------- cpp/include/raft/spectral/partition.hpp | 184 ++------- 3 files changed, 320 insertions(+), 463 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 04fd8cc185..68cd829949 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -18,6 +18,7 @@ #include #include #include +#include // ========================================================= // Useful macros @@ -27,6 +28,153 @@ #define IDX(i, j, lda) ((i) + (j) * (lda)) namespace raft { + +namespace { + +template +static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, + ValueType_* obs) { + IndexType_ i, j, k, index, mm; + ValueType_ alpha, v, last; + bool valid; + // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension + + // compute alpha + mm = (((m + blockDim.x - 1) / blockDim.x) * + blockDim.x); // m in multiple of blockDim.x + alpha = 0.0; + // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, + // li, mn); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < mm; i += blockDim.x) { + // check if the thread is valid + valid = i < m; + + // get the value of the last thread + last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + + // if you are valid read the value from memory, otherwise set your value to 0 + alpha = (valid) ? obs[i + j * m] : 0.0; + alpha = alpha * alpha; + + // do prefix sum (of size warpSize=blockDim.x =< 32) + for (k = 1; k < blockDim.x; k *= 2) { + v = utils::shfl_up(alpha, k, blockDim.x); + if (threadIdx.x >= k) alpha += v; + } + // shift by last + alpha += last; + } + } + + // scale by alpha + alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + alpha = std::sqrt(alpha); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 + index = i + j * m; + obs[index] = obs[index] / alpha; + } + } +} + +template +IndexType_ next_pow2(IndexType_ n) { + IndexType_ v; + // Reference: + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float + v = n - 1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v + 1; +} + +template +cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_* obs) { + IndexType_ p2m; + dim3 nthreads, nblocks; + + // find next power of 2 + p2m = next_pow2(m); + // setup launch configuration + nthreads.x = max(2, min(p2m, 32)); + nthreads.y = 256 / nthreads.x; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = (n + nthreads.y - 1) / nthreads.y; + nblocks.z = 1; + // printf("m=%d(%d),n=%d,obs=%p, + // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); + + // launch scaling kernel (scale each column of obs by its norm) + scale_obs_kernel<<>>(m, n, obs); + CUDA_CHECK_LAST(); + + return cudaSuccess; +} + +template +void transform_eigen_matrix(handle_t handle, ThrustExePolicy thrust_exec_policy, + edge_t n, vertex_t nEigVecs, weight_t* eigVecs) { + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Whiten eigenvector matrix + for (auto i = 0; i < nEigVecs; ++i) { + weight_t mean, std; + + mean = thrust::reduce( + 
thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+      thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)));
+    CUDA_CHECK_LAST();
+    mean /= n;
+    thrust::transform(thrust_exec_policy,
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)),
+                      thrust::make_constant_iterator(mean),
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                      thrust::minus<weight_t>());
+    CUDA_CHECK_LAST();
+
+    CUBLAS_CHECK(
+      cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream));
+
+    std /= std::sqrt(static_cast<weight_t>(n));
+
+    thrust::transform(thrust_exec_policy,
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)),
+                      thrust::make_constant_iterator(std),
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                      thrust::divides<weight_t>());
+    CUDA_CHECK_LAST();
+  }
+
+  // Transpose eigenvector matrix
+  // TODO: in-place transpose
+  {
+    vector_t<weight_t> work(handle, nEigVecs * n);
+    const weight_t zero{0.0};  // host-side scaling constants for cublasgeam
+    const weight_t one{1.0};   // (host pointer mode is set just below)
+    CUBLAS_CHECK(
+      cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+
+    CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n,
+                            &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs,
+                            work.raw(), nEigVecs, stream));
+
+    CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(),
+                             nEigVecs * n * sizeof(weight_t),
+                             cudaMemcpyDeviceToDevice, stream));
+  }
+}
+
+}  // namespace
+
 namespace matrix {
 
 using size_type = int;  // for now; TODO: move it in appropriate header
@@ -109,7 +257,10 @@ struct sparse_matrix_t {
   //
   virtual void mv(value_type alpha, value_type const* __restrict__ x,
                   value_type beta, value_type* __restrict__ y) const {
-    //TODO: call cusparse::csrmv
+    //TODO:
+    //
+    //Cusparse::set_pointer_mode_host();
+    //cusparsecsrmv(...);
   }
 
   //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate
@@ -131,20 +282,53 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
                                                  nrows, nnz),
       diagonal_(raft_handle, nrows) {
     auto* v = diagonal_.raw();
-    //TODO: more work, here...
+    //TODO: more work, here:
+    //
+    // vector_t<value_type> ones(nrows);
+    // ones.fill(1.0);
+    // sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
   }
 
   laplacian_matrix_t(
    handle_t const& raft_handle,
   GraphCSRView const& csr_view)
     : sparse_matrix_t<index_type, value_type>(csr_view),
-      diagonal_(raft_handle, csr_view.number_of_vertices_) {}
+      diagonal_(raft_handle, csr_view.number_of_vertices_) {
+    //TODO: more work, here:
+    //
+    // vector_t<value_type> ones(csr_view.number_of_vertices_);
+    // ones.fill(1.0);
+    // sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
+  }
 
   // y = alpha*A*x + beta*y
   //
   void mv(value_type alpha, value_type const* __restrict__ x, value_type beta,
          value_type* __restrict__ y) const override {
-    //TODO: call cusparse::csrmv ... and more
+    //TODO: call cusparse::csrmv ... 
and more: + // + // if (beta == 0) + // CHECK_CUDA(cudaMemset(y, 0, (this->n) * sizeof(ValueType_))) + // else if (beta != 1) + // thrust::transform(thrust::device_pointer_cast(y), + // thrust::device_pointer_cast(y + this->n), + // thrust::make_constant_iterator(beta), + // thrust::device_pointer_cast(y), + // thrust::multiplies()); + + // // Apply diagonal matrix + // dim3 gridDim, blockDim; + // gridDim.x = min(((this->n) + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); + // gridDim.y = 1; + // gridDim.z = 1; + // blockDim.x = BLOCK_SIZE; + // blockDim.y = 1; + // blockDim.z = 1; + // diagmv<<s>>>(this->n, alpha, D.raw(), x, y); + // cudaCheckError(); + + // // Apply adjacency matrix + // sparse_matrix_t::mv(-alpha, x, 1, y); } vector_t diagonal_; @@ -159,20 +343,38 @@ struct modularity_matrix_t : laplacian_matrix_t { : laplacian_matrix_t( raft_handle, row_offsets, col_indices, values, nrows, nnz) { auto* v = laplacian_matrix_t::diagonal_.raw(); - //TODO: more work, here... + //TODO: more work, here: + // + // diag_nrm1_ = diagonal_.nrm1(); } modularity_matrix_t( handle_t const& raft_handle, GraphCSRView const& csr_view) - : laplacian_matrix_t(raft_handle, csr_view) {} + : laplacian_matrix_t(raft_handle, csr_view) { + //TODO: more work, here: + // + // diag_nrm1_ = diagonal_.nrm1(); + } // y = alpha*A*x + beta*y // void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, value_type* __restrict__ y) const override { - //TODO: call cusparse::csrmv ... and more + //TODO: call cusparse::csrmv ... and more: + // + // // y = A*x + // sparse_matrix_t::mv(alpha, x, 0, y); + // value_type dot_res; + // // gamma = d'*x + // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); + // // y = y -(gamma/edge_sum)*d + // Cublas::axpy(this->n, -(dot_res / this->edge_sum), D.raw(), 1, y, 1); } + + value_type get_diag_nrm1(void) const { return diag_nrm1_; } + + value_type diag_nrm1_; }; } // namespace matrix diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index bd90f3093a..6b42f783c9 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,9 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -//#ifdef NVGRAPH_PARTITION -#include "include/modularity_maximization.hxx" +#pragma once #include #include @@ -26,14 +25,10 @@ #include #include -#include "include/debug_macros.h" -#include "include/kmeans.hxx" -#include "include/lanczos.hxx" -#include "include/nvgraph_cublas.hxx" -#include "include/nvgraph_error.hxx" -#include "include/nvgraph_vector.hxx" -#include "include/sm_utils.h" -#include "include/spectral_matrix.hxx" +#include + +#include +#include //#define COLLECT_TIME_STATISTICS 1 //#undef COLLECT_TIME_STATISTICS @@ -47,8 +42,7 @@ #endif #ifdef COLLECT_TIME_STATISTICS -static double timer(void) -{ +static double timer(void) { struct timeval tv; cudaDeviceSynchronize(); gettimeofday(&tv, NULL); @@ -56,100 +50,10 @@ static double timer(void) } #endif -namespace nvgraph { - -// ========================================================= -// Useful macros -// ========================================================= - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) +namespace raft { -template -static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, ValueType_ *obs) -{ - IndexType_ i, j, k, index, mm; - ValueType_ alpha, v, last; - bool valid; - // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension - - // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x - alpha = 0.0; - // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, - // li, mn); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < mm; i += blockDim.x) { - // check if the thread is valid - valid = i < m; - - // get the value of the last thread - last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); - - // if you are valid read the value from memory, otherwise set your value to 0 - alpha = (valid) ? 
obs[i + j * m] : 0.0; - alpha = alpha * alpha; - - // do prefix sum (of size warpSize=blockDim.x =< 32) - for (k = 1; k < blockDim.x; k *= 2) { - v = utils::shfl_up(alpha, k, blockDim.x); - if (threadIdx.x >= k) alpha += v; - } - // shift by last - alpha += last; - } - } - - // scale by alpha - alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); - alpha = std::sqrt(alpha); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 - index = i + j * m; - obs[index] = obs[index] / alpha; - } - } -} - -template -IndexType_ next_pow2(IndexType_ n) -{ - IndexType_ v; - // Reference: - // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float - v = n - 1; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return v + 1; -} - -template -cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) -{ - IndexType_ p2m; - dim3 nthreads, nblocks; - - // find next power of 2 - p2m = next_pow2(m); - // setup launch configuration - nthreads.x = max(2, min(p2m, 32)); - nthreads.y = 256 / nthreads.x; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = (n + nthreads.y - 1) / nthreads.y; - nblocks.z = 1; - // printf("m=%d(%d),n=%d,obs=%p, - // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); - - // launch scaling kernel (scale each column of obs by its norm) - scale_obs_kernel<<>>(m, n, obs); - cudaCheckError(); - - return cudaSuccess; -} +using namespace matrix; +using namespace linalg; // ========================================================= // Spectral modularity_maximization @@ -168,129 +72,63 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) * @param tol_lanczos Convergence tolerance for Lanczos method. * @param maxIter_kmeans Maximum number of k-means iterations. * @param tol_kmeans Convergence tolerance for k-means algorithm. - * @param parts (Output, device memory, n entries) Cluster + * @param clusters (Output, device memory, n entries) Cluster * assignments. * @param iters_lanczos On exit, number of Lanczos iterations * performed. * @param iters_kmeans On exit, number of k-means iterations * performed. - * @return NVGRAPH error flag. + * @return error flag. 
*/ -template -NVGRAPH_ERROR modularity_maximization( - cugraph::experimental::GraphCSRView const &graph, - vertex_t nClusters, - vertex_t nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - weight_t tol_lanczos, - int maxIter_kmeans, - weight_t tol_kmeans, - vertex_t *__restrict__ clusters, - weight_t *eigVals, - weight_t *eigVecs, - int &iters_lanczos, - int &iters_kmeans) -{ - cudaStream_t stream = 0; +template , + typename ClusterSolver = kmeans_solver_t> +std::tuple modularity_maximization( + handle_t handle, ThrustExePolicy thrust_exec_policy, + GraphCSRView const &graph, + EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, + vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { const weight_t zero{0.0}; const weight_t one{1.0}; - edge_t i; - edge_t n = graph.number_of_vertices; + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + std::tuple + stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver - // k-means residual - weight_t residual_kmeans; + edge_t n = graph.number_of_vertices; // Compute eigenvectors of Modularity Matrix + // Initialize Modularity Matrix - CsrMatrix A(false, - false, - graph.number_of_vertices, - graph.number_of_vertices, - graph.number_of_edges, - 0, - graph.edge_data, - graph.offsets, - graph.indices); - ModularityMatrix B(A, graph.number_of_edges); - - // Compute smallest eigenvalues and eigenvectors - CHECK_NVGRAPH(computeLargestEigenvectors(B, - nEigVecs, - maxIter_lanczos, - restartIter_lanczos, - tol_lanczos, - false, - iters_lanczos, - eigVals, - eigVecs)); - - // eigVals.dump(0, nEigVecs); - // eigVecs.dump(0, nEigVecs); - // eigVecs.dump(n, nEigVecs); - // eigVecs.dump(2*n, nEigVecs); - // Whiten eigenvector matrix - for (i = 0; i < nEigVecs; ++i) { - weight_t mean, std; - mean = thrust::reduce(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); - cudaCheckError(); - mean /= n; - thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), - thrust::make_constant_iterator(mean), - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::minus()); - cudaCheckError(); - std = Cublas::nrm2(n, eigVecs + IDX(0, i, n), 1) / std::sqrt(static_cast(n)); - thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), - thrust::make_constant_iterator(std), - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::divides()); - cudaCheckError(); - } + sparse_matrix_t A{graph}; + modularity_matrix_t B{handle, graph}; - // Transpose eigenvector matrix - // TODO: in-place transpose - { - Vector work(nEigVecs * n, stream); - Cublas::set_pointer_mode_host(); - Cublas::geam(true, - false, - nEigVecs, - n, - &one, - eigVecs, - n, - &zero, - (weight_t *)NULL, - nEigVecs, - work.raw(), - nEigVecs); - CHECK_CUDA(cudaMemcpyAsync( - eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice)); - } + auto eigen_config = eigen_solver.get_config(); + auto nEigVecs = eigen_config.n_eigVecs; + + // Compute eigenvectors corresponding to largest eigenvalues + std::get<0>(stats) = + eigen_solver.solve_largest_eigenvectors(B, eigVals, eigVecs); + + // Whiten eigenvector matrix + transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); - // WARNING: notice that at this point
the matrix has already been transposed, so we are scaling + // notice that at this point the matrix has already been transposed, so we are scaling // columns scale_obs(nEigVecs, n, eigVecs); - cudaCheckError(); - - // eigVecs.dump(0, nEigVecs*n); - // Find partition with k-means clustering - CHECK_NVGRAPH(kmeans(n, - nEigVecs, - nClusters, - tol_kmeans, - maxIter_kmeans, - eigVecs, - clusters, - residual_kmeans, - iters_kmeans)); - - return NVGRAPH_OK; + CUDA_CHECK_LAST(); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, + nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; } //=================================================== // Analysis of graph partition @@ -307,9 +145,9 @@ struct equal_to_i_op { public: equal_to_i_op(IndexType_ _i) : i(_i) {} template - __host__ __device__ void operator()(Tuple_ t) - { - thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0; + __host__ __device__ void operator()(Tuple_ t) { + thrust::get<1>(t) = + (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0; } }; } // namespace @@ -318,39 +156,33 @@ struct equal_to_i_op { /** This function determines the modularity based on a graph and cluster assignments * @param G Weighted graph in CSR format * @param nClusters Number of clusters. - * @param parts (Input, device memory, n entries) Cluster assignments. + * @param clusters (Input, device memory, n entries) Cluster assignments. * @param modularity On exit, modularity */ template -NVGRAPH_ERROR analyzeModularity( - cugraph::experimental::GraphCSRView const &graph, - vertex_t nClusters, - const vertex_t *__restrict__ parts, - weight_t &modularity) -{ - cudaStream_t stream = 0; +void analyzeModularity(handle_t handle, ThrustExePolicy thrust_exec_policy, + GraphCSRView const &graph, + vertex_t nClusters, + const vertex_t *__restrict__ clusters, + weight_t &modularity) { edge_t i; edge_t n = graph.number_of_vertices; - weight_t partModularity, partSize; + weight_t partModularity, clustersize; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); // Device memory - Vector part_i(n, stream); - Vector Bx(n, stream); + vector_t part_i(handle, n); + Vector Bx(handle, n); // Initialize cuBLAS - Cublas::set_pointer_mode_host(); + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity - CsrMatrix A(false, - false, - graph.number_of_vertices, - graph.number_of_vertices, - graph.number_of_edges, - 0, - graph.edge_data, - graph.offsets, - graph.indices); - ModularityMatrix B(A, graph.number_of_edges); + sparse_matrix_t A{graph}; + modularity_matrix_t B{handle, graph}; // Initialize output modularity = 0; @@ -358,25 +190,30 @@ NVGRAPH_ERROR analyzeModularity( // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - thrust::for_each( - thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts), - thrust::device_pointer_cast(part_i.raw()))), - thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts + n), - thrust::device_pointer_cast(part_i.raw() + n))), - equal_to_i_op(i)); - cudaCheckError(); + thrust::for_each(thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple( + thrust::device_pointer_cast(clusters), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple( + 
thrust::device_pointer_cast(clusters + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(i)); + CUDA_CHECK_LAST(); // Compute size of ith partition - Cublas::dot(n, part_i.raw(), 1, part_i.raw(), 1, &partSize); - partSize = round(partSize); - if (partSize < 0.5) { + CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, + &clustersize, stream)); + + clustersize = round(clustersize); + if (clustersize < 0.5) { WARNING("empty partition"); continue; } // Compute modularity B.mv(1, part_i.raw(), 0, Bx.raw()); - Cublas::dot(n, Bx.raw(), 1, part_i.raw(), 1, &partModularity); + CUBLAS_CHECK(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, + &partModularity, stream)); // Record results modularity += partModularity; @@ -384,53 +221,7 @@ NVGRAPH_ERROR analyzeModularity( } // modularity = modularity/nClusters; // devide by nnz - modularity = modularity / B.getEdgeSum(); - // Clean up and return - - return NVGRAPH_OK; + modularity = modularity / B.get_diag_nrm1(); } -// ========================================================= -// Explicit instantiation -// ========================================================= -template NVGRAPH_ERROR modularity_maximization( - cugraph::experimental::GraphCSRView const &graph, - int nClusters, - int nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - float tol_lanczos, - int maxIter_kmeans, - float tol_kmeans, - int *__restrict__ parts, - float *eigVals, - float *eigVecs, - int &iters_lanczos, - int &iters_kmeans); -template NVGRAPH_ERROR modularity_maximization( - cugraph::experimental::GraphCSRView const &graph, - int nClusters, - int nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - double tol_lanczos, - int maxIter_kmeans, - double tol_kmeans, - int *__restrict__ parts, - double *eigVals, - double *eigVecs, - int &iters_lanczos, - int &iters_kmeans); -template NVGRAPH_ERROR analyzeModularity( - cugraph::experimental::GraphCSRView const &graph, - int nClusters, - const int *__restrict__ parts, - float &modularity); -template NVGRAPH_ERROR analyzeModularity( - cugraph::experimental::GraphCSRView const &graph, - int nClusters, - const int *__restrict__ parts, - double &modularity); - -} // namespace nvgraph -//#endif //NVGRAPH_PARTITION +} // namespace raft diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index ea5bcdd1d8..00b11f7740 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -28,100 +28,12 @@ #include #include -#include namespace raft { using namespace matrix; using namespace linalg; -template -static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, - ValueType_ *obs) { - IndexType_ i, j, k, index, mm; - ValueType_ alpha, v, last; - bool valid; - // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension - - // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * - blockDim.x); // m in multiple of blockDim.x - alpha = 0.0; - // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, - // li, mn); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; - j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < mm; i += blockDim.x) { - // check if the thread is valid - valid = i < m; - - // get the value of the last thread - last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); - - // if you are valid read the value from memory, otherwise set your value to 0 - alpha = (valid) ? 
obs[i + j * m] : 0.0; - alpha = alpha * alpha; - - // do prefix sum (of size warpSize=blockDim.x =< 32) - for (k = 1; k < blockDim.x; k *= 2) { - v = utils::shfl_up(alpha, k, blockDim.x); - if (threadIdx.x >= k) alpha += v; - } - // shift by last - alpha += last; - } - } - - // scale by alpha - alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); - alpha = std::sqrt(alpha); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; - j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 - index = i + j * m; - obs[index] = obs[index] / alpha; - } - } -} - -template -IndexType_ next_pow2(IndexType_ n) { - IndexType_ v; - // Reference: - // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float - v = n - 1; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return v + 1; -} - -template -cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) { - IndexType_ p2m; - dim3 nthreads, nblocks; - - // find next power of 2 - p2m = next_pow2(m); - // setup launch configuration - nthreads.x = max(2, min(p2m, 32)); - nthreads.y = 256 / nthreads.x; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = (n + nthreads.y - 1) / nthreads.y; - nblocks.z = 1; - // printf("m=%d(%d),n=%d,obs=%p, - // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); - - // launch scaling kernel (scale each column of obs by its norm) - scale_obs_kernel<<>>(m, n, obs); - CUDA_CHECK_LAST(); - - return cudaSuccess; -} - // ========================================================= // Spectral partitioner // ========================================================= @@ -132,7 +44,7 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) { * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) * * @param G Weighted graph in CSR format - * @param nParts Number of partitions. + * @param nClusters Number of partitions. * @param nEigVecs Number of eigenvectors to compute. * @param maxIter_lanczos Maximum number of Lanczos iterations. * @param restartIter_lanczos Maximum size of Lanczos system before * implicit restart. * @param tol_lanczos Convergence tolerance for Lanczos method. * @param maxIter_kmeans Maximum number of k-means iterations. * @param tol_kmeans Convergence tolerance for k-means algorithm. - * @param parts (Output, device memory, n entries) Partition + * @param clusters (Output, device memory, n entries) Partition * assignments. * @param iters_lanczos On exit, number of Lanczos iterations * performed. * @param iters_kmeans On exit, number of k-means iterations * performed. - * @return error flag. + * @return statistics: number of eigensolver iterations, cluster-solver residual, number of cluster-solver iterations.
*/ template > std::tuple partition( handle_t handle, ThrustExePolicy thrust_exec_policy, - cugraph::experimental::GraphCSRView const &graph, + GraphCSRView const &graph, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, - vertex_t *__restrict__ parts, weight_t *eigVals, weight_t *eigVecs) { + vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { const weight_t zero{0.0}; const weight_t one{1.0}; @@ -184,58 +96,14 @@ std::tuple partition( // Compute smallest eigenvalues and eigenvectors std::get<0>(stats) = - eigen_solver.solve_smallest_eigenvector(L, eigVals, eigVecs); + eigen_solver.solve_smallest_eigenvectors(L, eigVals, eigVecs); // Whiten eigenvector matrix - for (i = 0; i < nEigVecs; ++i) { - weight_t mean, std; + transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); - mean = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); - CUDA_CHECK_LAST(); - mean /= n; - thrust::transform(thrust_exec_policy, - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), - thrust::make_constant_iterator(mean), - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::minus()); - CUDA_CHECK_LAST(); - - CUBLAS_CHECK( - cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); - - std /= std::sqrt(static_cast(n)); - - thrust::transform(thrust_exec_policy, - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), - thrust::make_constant_iterator(std), - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::divides()); - CUDA_CHECK_LAST(); - } - - // Transpose eigenvector matrix - // TODO: in-place transpose - { - vector_t work(handle, nEigVecs * n); - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n, - &one, eigVecs, n, &zero, (weight_t *)NULL, nEigVecs, - work.raw(), nEigVecs, stream)); - - CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), - nEigVecs * n * sizeof(weight_t), - cudaMemcpyDeviceToDevice, stream)); - } - - // Find partition with clustering - auto pair_cluster = cluster_solver.solve(handle, t_thrust_exec_policy, n, - nEigVecs, eigVecs, parts); + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, + nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -272,26 +140,25 @@ struct equal_to_i_op { * Graph is assumed to be weighted and undirected. * * @param G Weighted graph in CSR format - * @param nParts Number of partitions. - * @param parts (Input, device memory, n entries) Partition + * @param nClusters Number of partitions. + * @param clusters (Input, device memory, n entries) Partition * assignments. * @param edgeCut On exit, weight of edges cut by partition. * @param cost On exit, partition cost function. * @return error flag. 
*/ template -int analyzePartition( - handle_t handle, ThrustExePolicy thrust_exec_policy, - cugraph::experimental::GraphCSRView const &graph, - vertex_t nParts, const vertex_t *__restrict__ parts, weight_t &edgeCut, - weight_t &cost) { +void analyzePartition(handle_t handle, ThrustExePolicy thrust_exec_policy, + GraphCSRView const &graph, + vertex_t nClusters, const vertex_t *__restrict__ clusters, + weight_t &edgeCut, weight_t &cost) { edge_t i; edge_t n = graph.number_of_vertices; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); - weight_t partEdgesCut, partSize; + weight_t partEdgesCut, clustersize; // Device memory vector_t part_i(handle, n); @@ -310,24 +177,24 @@ int analyzePartition( edgeCut = 0; // Iterate through partitions - for (i = 0; i < nParts; ++i) { + for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition thrust::for_each(thrust_exec_policy, thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(parts), + thrust::device_pointer_cast(clusters), thrust::device_pointer_cast(part_i.raw()))), thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(parts + n), + thrust::device_pointer_cast(clusters + n), thrust::device_pointer_cast(part_i.raw() + n))), equal_to_i_op(i)); CUDA_CHECK_LAST(); // Compute size of ith partition CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, - &partSize, stream)); + &clustersize, stream)); - partSize = round(partSize); - if (partSize < 0.5) { + clustersize = round(clustersize); + if (clustersize < 0.5) { WARNING("empty partition"); continue; } @@ -338,12 +205,9 @@ int analyzePartition( &partEdgesCut, stream)); // Record results - cost += partEdgesCut / partSize; + cost += partEdgesCut / clustersize; edgeCut += partEdgesCut / 2; } - - // Clean up and return - return 0; } } // namespace raft From 531bf2bdd74115b837772512d3f12b6ef4fb5639 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 12 Jun 2020 21:07:41 -0500 Subject: [PATCH 106/189] More refactoring in partition/modularity analysis. 
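
The per-cluster loops in analyzeModularity and analyzePartition shared the same three steps: build a 0/1 indicator vector for cluster i, take its dot product with itself to get the cluster size, and evaluate the quadratic form x_i' * B * x_i against the modularity or Laplacian operator. That common sequence now lives in a construct_indicator helper in the new spectral_util.hpp. As a minimal host-side sketch of the arithmetic (a dense row-major stand-in for the sparse operator B; all names below are illustrative, not library API):

    #include <cstddef>
    #include <vector>

    // Indicator statistics for cluster i: clustersize = x_i . x_i and
    // partStats = x_i' * B * x_i; returns false for an empty cluster,
    // mirroring the "empty partition" early-out in the device code.
    bool indicator_stats(std::size_t n, int i, std::vector<int> const& clusters,
                         std::vector<double> const& B,  // dense n*n, row-major
                         double& clustersize, double& partStats) {
      std::vector<double> x(n), Bx(n, 0.0);
      for (std::size_t v = 0; v < n; ++v) x[v] = (clusters[v] == i) ? 1.0 : 0.0;

      clustersize = 0.0;
      for (std::size_t v = 0; v < n; ++v) clustersize += x[v] * x[v];
      if (clustersize < 0.5) return false;  // empty partition

      for (std::size_t r = 0; r < n; ++r)
        for (std::size_t c = 0; c < n; ++c) Bx[r] += B[r * n + c] * x[c];

      partStats = 0.0;
      for (std::size_t v = 0; v < n; ++v) partStats += Bx[v] * x[v];
      return true;
    }

For the modularity path partStats accumulates into the modularity score; for the partition path it is the weight of edges cut, divided by the cluster size to form the cost.
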
--- cpp/include/raft/spectral/matrix_wrappers.hpp | 149 +----------- .../raft/spectral/modularity_maximization.hpp | 31 +-- cpp/include/raft/spectral/partition.hpp | 44 +--- cpp/include/raft/spectral/spectral_util.hpp | 230 ++++++++++++++++++ 4 files changed, 243 insertions(+), 211 deletions(-) create mode 100644 cpp/include/raft/spectral/spectral_util.hpp diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 68cd829949..779fbb9dc8 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -18,7 +18,7 @@ #include #include #include -#include +#include // ========================================================= // Useful macros @@ -28,153 +28,6 @@ #define IDX(i, j, lda) ((i) + (j) * (lda)) namespace raft { - -namespace { - -template -static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, - ValueType_* obs) { - IndexType_ i, j, k, index, mm; - ValueType_ alpha, v, last; - bool valid; - // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension - - // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * - blockDim.x); // m in multiple of blockDim.x - alpha = 0.0; - // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, - // li, mn); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; - j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < mm; i += blockDim.x) { - // check if the thread is valid - valid = i < m; - - // get the value of the last thread - last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); - - // if you are valid read the value from memory, otherwise set your value to 0 - alpha = (valid) ? obs[i + j * m] : 0.0; - alpha = alpha * alpha; - - // do prefix sum (of size warpSize=blockDim.x =< 32) - for (k = 1; k < blockDim.x; k *= 2) { - v = utils::shfl_up(alpha, k, blockDim.x); - if (threadIdx.x >= k) alpha += v; - } - // shift by last - alpha += last; - } - } - - // scale by alpha - alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); - alpha = std::sqrt(alpha); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; - j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 - index = i + j * m; - obs[index] = obs[index] / alpha; - } - } -} - -template -IndexType_ next_pow2(IndexType_ n) { - IndexType_ v; - // Reference: - // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float - v = n - 1; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return v + 1; -} - -template -cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_* obs) { - IndexType_ p2m; - dim3 nthreads, nblocks; - - // find next power of 2 - p2m = next_pow2(m); - // setup launch configuration - nthreads.x = max(2, min(p2m, 32)); - nthreads.y = 256 / nthreads.x; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = (n + nthreads.y - 1) / nthreads.y; - nblocks.z = 1; - // printf("m=%d(%d),n=%d,obs=%p, - // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); - - // launch scaling kernel (scale each column of obs by its norm) - scale_obs_kernel<<>>(m, n, obs); - CUDA_CHECK_LAST(); - - return cudaSuccess; -} - -template -void transform_eigen_matrix(handle_t handle, ThrustExePolicy thrust_exec_policy, - edge_t n, vertex_t nEigVecs, weight_t* eigVecs) { - auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); - - // Whiten 
eigenvector matrix - for (auto i = 0; i < nEigVecs; ++i) { - weight_t mean, std; - - mean = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); - CUDA_CHECK_LAST(); - mean /= n; - thrust::transform(thrust_exec_policy, - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), - thrust::make_constant_iterator(mean), - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::minus()); - CUDA_CHECK_LAST(); - - CUBLAS_CHECK( - cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); - - std /= std::sqrt(static_cast(n)); - - thrust::transform(thrust_exec_policy, - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), - thrust::make_constant_iterator(std), - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::divides()); - CUDA_CHECK_LAST(); - } - - // Transpose eigenvector matrix - // TODO: in-place transpose - { - vector_t work(handle, nEigVecs * n); - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n, - &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs, - work.raw(), nEigVecs, stream)); - - CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), - nEigVecs * n * sizeof(weight_t), - cudaMemcpyDeviceToDevice, stream)); - } -} - -} // namespace - namespace matrix { using size_type = int; // for now; TODO: move it in appropriate header diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 6b42f783c9..1e387c0606 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -29,6 +29,7 @@ #include #include +#include //#define COLLECT_TIME_STATISTICS 1 //#undef COLLECT_TIME_STATISTICS @@ -51,6 +52,7 @@ static double timer(void) { #endif namespace raft { +namespace spectral { using namespace matrix; using namespace linalg; @@ -163,7 +165,7 @@ template void analyzeModularity(handle_t handle, ThrustExePolicy thrust_exec_policy, GraphCSRView const &graph, vertex_t nClusters, - const vertex_t *__restrict__ clusters, + vertex_t const *__restrict__ clusters, weight_t &modularity) { edge_t i; edge_t n = graph.number_of_vertices; @@ -174,7 +176,7 @@ void analyzeModularity(handle_t handle, ThrustExePolicy thrust_exec_policy, // Device memory vector_t part_i(handle, n); - Vector Bx(handle, n); + vector_t Bx(handle, n); // Initialize cuBLAS CUBLAS_CHECK( @@ -189,32 +191,12 @@ void analyzeModularity(handle_t handle, ThrustExePolicy thrust_exec_policy, // Iterate through partitions for (i = 0; i < nClusters; ++i) { - // Construct indicator vector for ith partition - thrust::for_each(thrust_exec_policy, - thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(clusters), - thrust::device_pointer_cast(part_i.raw()))), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(clusters + n), - thrust::device_pointer_cast(part_i.raw() + n))), - equal_to_i_op(i)); - CUDA_CHECK_LAST(); - - // Compute size of ith partition - CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, - &clustersize, stream)); - - clustersize = round(clustersize); - if (clustersize < 0.5) { + if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, + partModularity, clusters, part_i, Bx, B)) {
WARNING("empty partition"); continue; } - // Compute modularity - B.mv(1, part_i.raw(), 0, Bx.raw()); - CUBLAS_CHECK(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, - &partModularity, stream)); - // Record results modularity += partModularity; // std::cout<< "partModularity " < #include +#include namespace raft { +namespace spectral { using namespace matrix; using namespace linalg; @@ -115,24 +117,6 @@ std::tuple partition( // Analysis of graph partition // ========================================================= -namespace { -/// Functor to generate indicator vectors -/** For use in Thrust transform - */ -template -struct equal_to_i_op { - const IndexType_ i; - - public: - equal_to_i_op(IndexType_ _i) : i(_i) {} - template - __host__ __device__ void operator()(Tuple_ t) { - thrust::get<1>(t) = - (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0; - } -}; -} // namespace - /// Compute cost function for partition /** This function determines the edges cut by a partition and a cost * function: @@ -179,35 +163,17 @@ void analyzePartition(handle_t handle, ThrustExePolicy thrust_exec_policy, // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - thrust::for_each(thrust_exec_policy, - thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(clusters), - thrust::device_pointer_cast(part_i.raw()))), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(clusters + n), - thrust::device_pointer_cast(part_i.raw() + n))), - equal_to_i_op(i)); - CUDA_CHECK_LAST(); - - // Compute size of ith partition - CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, - &clustersize, stream)); - - clustersize = round(clustersize); - if (clustersize < 0.5) { + if (!construct_indicator(handle, thrust_exec_policy, n, clustersize, + partEdgesCut, clusters, part_i, Lx, L)) { WARNING("empty partition"); continue; } - // Compute number of edges cut by ith partition - L.mv(1, part_i.raw(), 0, Lx.raw()); - CUBLAS_CHECK(cublasdot(cublas_h, n, Lx.raw(), 1, part_i.raw(), 1, - &partEdgesCut, stream)); - // Record results cost += partEdgesCut / clustersize; edgeCut += partEdgesCut / 2; } } +} // namespace spectral } // namespace raft diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp new file mode 100644 index 0000000000..a0c10284e3 --- /dev/null +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +namespace raft { +namespace spectral { + +template +static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, + ValueType_* obs) { + IndexType_ i, j, k, index, mm; + ValueType_ alpha, v, last; + bool valid; + // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension + + // compute alpha + mm = (((m + blockDim.x - 1) / blockDim.x) * + blockDim.x); // m in multiple of blockDim.x + alpha = 0.0; + // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, + // li, mn); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < mm; i += blockDim.x) { + // check if the thread is valid + valid = i < m; + + // get the value of the last thread + last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + + // if you are valid read the value from memory, otherwise set your value to 0 + alpha = (valid) ? obs[i + j * m] : 0.0; + alpha = alpha * alpha; + + // do prefix sum (of size warpSize=blockDim.x =< 32) + for (k = 1; k < blockDim.x; k *= 2) { + v = utils::shfl_up(alpha, k, blockDim.x); + if (threadIdx.x >= k) alpha += v; + } + // shift by last + alpha += last; + } + } + + // scale by alpha + alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + alpha = std::sqrt(alpha); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 + index = i + j * m; + obs[index] = obs[index] / alpha; + } + } +} + +template +IndexType_ next_pow2(IndexType_ n) { + IndexType_ v; + // Reference: + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float + v = n - 1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v + 1; +} + +template +cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_* obs) { + IndexType_ p2m; + dim3 nthreads, nblocks; + + // find next power of 2 + p2m = next_pow2(m); + // setup launch configuration + nthreads.x = max(2, min(p2m, 32)); + nthreads.y = 256 / nthreads.x; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = (n + nthreads.y - 1) / nthreads.y; + nblocks.z = 1; + // printf("m=%d(%d),n=%d,obs=%p, + // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); + + // launch scaling kernel (scale each column of obs by its norm) + scale_obs_kernel<<>>(m, n, obs); + CUDA_CHECK_LAST(); + + return cudaSuccess; +} + +template +void transform_eigen_matrix(handle_t handle, ThrustExePolicy thrust_exec_policy, + edge_t n, vertex_t nEigVecs, weight_t* eigVecs) { + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Whiten eigenvector matrix + for (auto i = 0; i < nEigVecs; ++i) { + weight_t mean, std; + + mean = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + CUDA_CHECK_LAST(); + mean /= n; + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(mean), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::minus()); + CUDA_CHECK_LAST(); + + CUBLAS_CHECK( + cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + + std /= std::sqrt(static_cast(n)); + + 
thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(std), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::divides()); + CUDA_CHECK_LAST(); + } + + // Transpose eigenvector matrix + // TODO: in-place transpose + { + const weight_t zero{0.0}, one{1.0}; + vector_t work(handle, nEigVecs * n); + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n, + &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs, + work.raw(), nEigVecs, stream)); + + CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), + nEigVecs * n * sizeof(weight_t), + cudaMemcpyDeviceToDevice, stream)); + } +} + +namespace { +/// Functor to generate indicator vectors +/** For use in Thrust transform + */ +template +struct equal_to_i_op { + const IndexType_ i; + + public: + equal_to_i_op(IndexType_ _i) : i(_i) {} + template + __host__ __device__ void operator()(Tuple_ t) { + thrust::get<1>(t) = + (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0; + } +}; +} // namespace + +// Construct indicator vector for ith partition +// +template +bool construct_indicator(handle_t handle, ThrustExePolicy thrust_exec_policy, + edge_t i, edge_t n, weight_t& clustersize, weight_t& partStats, + vertex_t const* __restrict__ clusters, + vector_t& part_i, vector_t& Bx, + laplacian_matrix_t const& B) { + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + thrust::for_each(thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple( + thrust::device_pointer_cast(clusters), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple( + thrust::device_pointer_cast(clusters + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(i)); + CUDA_CHECK_LAST(); + + // Compute size of ith partition + CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, + &clustersize, stream)); + + clustersize = round(clustersize); + if (clustersize < 0.5) { + return false; + } + + // Compute part stats + B.mv(1, part_i.raw(), 0, Bx.raw()); + CUBLAS_CHECK( + cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + + return true; +} + +} // namespace spectral +} // namespace raft From 1fb5ae2844a18eabc846aecd7f812e7a665f6f9b Mon Sep 17 00:00:00 2001 From: Ishika Roy Date: Mon, 15 Jun 2020 10:46:17 -0500 Subject: [PATCH 107/189] set to OrderedDict --- CHANGELOG.md | 1 + python/raft/dask/common/comms.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47c1ee0023..56a6439040 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ## Improvements - PR #13: Add RMM_INCLUDE and RMM_LIBRARY options to allow linking to non-conda RMM +- PR #22: Preserve order in comms workers for rank initialization ## Bug Fixes - PR #17: Make destructor inline to avoid redeclaration error diff --git a/python/raft/dask/common/comms.py b/python/raft/dask/common/comms.py index b49cb8d7b9..7f86c5421f 100644 --- a/python/raft/dask/common/comms.py +++ b/python/raft/dask/common/comms.py @@ -28,6 +28,7 @@ import time import uuid +from collections import OrderedDict class Comms: @@ -137,7 +138,7 @@ def init(self, workers=None): Unique collection of workers for initializing comms.
""" - self.worker_addresses = list(set( + self.worker_addresses = list(OrderedDict.fromkeys( self.client.scheduler_info()["workers"].keys() if workers is None else workers)) From 4fa1c76b3087c6f370e29f94302c4b82ee715182 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 15 Jun 2020 11:30:58 -0500 Subject: [PATCH 108/189] Removed MPI dependency (for now). --- cpp/include/raft/graph.hpp | 197 ++++++++++++++++--------------------- 1 file changed, 86 insertions(+), 111 deletions(-) diff --git a/cpp/include/raft/graph.hpp b/cpp/include/raft/graph.hpp index 8e72572764..089decc8ee 100644 --- a/cpp/include/raft/graph.hpp +++ b/cpp/include/raft/graph.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ #pragma once -#include +/// #include // TODO: clarify what must be done about `comm` #include #include #include @@ -55,7 +55,7 @@ template class GraphViewBase { public: WT *edge_data; ///< edge weight - Comm comm; + /// Comm comm; // TODO: clarify what must be done about `comm` GraphProperties prop; @@ -69,16 +69,14 @@ class GraphViewBase { * identifiers */ void get_vertex_identifiers(VT *identifiers) const; - void set_communicator(Comm &comm_) { comm = comm_; } + /// void set_communicator(Comm &comm_) { comm = comm_; } // TODO: see above GraphViewBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) : edge_data(edge_data_), - comm(), + /// comm(), // TODO: see above prop(), number_of_vertices(number_of_vertices_), - number_of_edges(number_of_edges_) - { - } + number_of_edges(number_of_edges_) {} bool has_data(void) const { return edge_data != nullptr; } }; @@ -126,13 +124,12 @@ class GraphCOOView : public GraphViewBase { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCOOView( - VT *src_indices_, VT *dst_indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) - : GraphViewBase(edge_data_, number_of_vertices_, number_of_edges_), + GraphCOOView(VT *src_indices_, VT *dst_indices_, WT *edge_data_, + VT number_of_vertices_, ET number_of_edges_) + : GraphViewBase(edge_data_, number_of_vertices_, + number_of_edges_), src_indices(src_indices_), - dst_indices(dst_indices_) - { - } + dst_indices(dst_indices_) {} }; /** @@ -187,13 +184,12 @@ class GraphCompressedSparseBaseView : public GraphViewBase { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCompressedSparseBaseView( - ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) - : GraphViewBase(edge_data_, number_of_vertices_, number_of_edges_), + GraphCompressedSparseBaseView(ET *offsets_, VT *indices_, WT *edge_data_, + VT number_of_vertices_, ET number_of_edges_) + : GraphViewBase(edge_data_, number_of_vertices_, + number_of_edges_), offsets{offsets_}, - indices{indices_} - { - } + indices{indices_} {} }; /** @@ -209,7 +205,9 @@ class GraphCSRView : public GraphCompressedSparseBaseView { /** * @brief Default constructor */ - GraphCSRView() : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, 0) {} + GraphCSRView() + : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, + 0) {} /** * @brief Wrap existing arrays representing adjacency lists in a Graph. 
@@ -226,11 +224,10 @@ class GraphCSRView : public GraphCompressedSparseBaseView { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCSRView( - ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + GraphCSRView(ET *offsets_, VT *indices_, WT *edge_data_, + VT number_of_vertices_, ET number_of_edges_) : GraphCompressedSparseBaseView( - offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) - { + offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) { } }; @@ -247,7 +244,9 @@ class GraphCSCView : public GraphCompressedSparseBaseView { /** * @brief Default constructor */ - GraphCSCView() : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, 0) {} + GraphCSCView() + : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, + 0) {} /** * @brief Wrap existing arrays representing transposed adjacency lists in a Graph. @@ -264,11 +263,10 @@ class GraphCSCView : public GraphCompressedSparseBaseView { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCSCView( - ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + GraphCSCView(ET *offsets_, VT *indices_, WT *edge_data_, + VT number_of_vertices_, ET number_of_edges_) : GraphCompressedSparseBaseView( - offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) - { + offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) { } }; @@ -323,30 +321,28 @@ class GraphCOO { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCOO(VT number_of_vertices, - ET number_of_edges, - bool has_data = false, - cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + GraphCOO( + VT number_of_vertices, ET number_of_edges, bool has_data = false, + cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) : number_of_vertices_(number_of_vertices), number_of_edges_(number_of_edges), src_indices_(sizeof(VT) * number_of_edges, stream, mr), dst_indices_(sizeof(VT) * number_of_edges, stream, mr), - edge_data_((has_data ? sizeof(WT) * number_of_edges : 0), stream, mr) - { - } + edge_data_((has_data ? 
sizeof(WT) * number_of_edges : 0), stream, mr) {} - GraphCOO(GraphCOOView const &graph, - cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + GraphCOO( + GraphCOOView const &graph, cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) : number_of_vertices_(graph.number_of_vertices), number_of_edges_(graph.number_of_edges), - src_indices_(graph.src_indices, graph.number_of_edges * sizeof(VT), stream, mr), - dst_indices_(graph.dst_indices, graph.number_of_edges * sizeof(VT), stream, mr) - { + src_indices_(graph.src_indices, graph.number_of_edges * sizeof(VT), + stream, mr), + dst_indices_(graph.dst_indices, graph.number_of_edges * sizeof(VT), + stream, mr) { if (graph.has_data()) { - edge_data_ = - rmm::device_buffer{graph.edge_data, graph.number_of_edges * sizeof(WT), stream, mr}; + edge_data_ = rmm::device_buffer{ + graph.edge_data, graph.number_of_edges * sizeof(WT), stream, mr}; } } @@ -356,24 +352,21 @@ class GraphCOO { VT *dst_indices(void) { return static_cast(dst_indices_.data()); } WT *edge_data(void) { return static_cast(edge_data_.data()); } - GraphCOOContents release() noexcept - { + GraphCOOContents release() noexcept { VT number_of_vertices = number_of_vertices_; - ET number_of_edges = number_of_edges_; - number_of_vertices_ = 0; - number_of_edges_ = 0; + ET number_of_edges = number_of_edges_; + number_of_vertices_ = 0; + number_of_edges_ = 0; return GraphCOOContents{ - number_of_vertices, - number_of_edges, + number_of_vertices, number_of_edges, std::make_unique(std::move(src_indices_)), std::make_unique(std::move(dst_indices_)), std::make_unique(std::move(edge_data_))}; } - GraphCOOView view(void) noexcept - { - return GraphCOOView( - src_indices(), dst_indices(), edge_data(), number_of_vertices_, number_of_edges_); + GraphCOOView view(void) noexcept { + return GraphCOOView(src_indices(), dst_indices(), edge_data(), + number_of_vertices_, number_of_edges_); } bool has_data(void) { return nullptr != edge_data_.data(); } @@ -420,27 +413,21 @@ class GraphCompressedSparseBase { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCompressedSparseBase(VT number_of_vertices, - ET number_of_edges, - bool has_data, - cudaStream_t stream, + GraphCompressedSparseBase(VT number_of_vertices, ET number_of_edges, + bool has_data, cudaStream_t stream, rmm::mr::device_memory_resource *mr) : number_of_vertices_(number_of_vertices), number_of_edges_(number_of_edges), offsets_(sizeof(ET) * (number_of_vertices + 1), stream, mr), indices_(sizeof(VT) * number_of_edges, stream, mr), - edge_data_((has_data ? sizeof(WT) * number_of_edges : 0), stream, mr) - { - } + edge_data_((has_data ? 
sizeof(WT) * number_of_edges : 0), stream, mr) {} GraphCompressedSparseBase(GraphSparseContents &&contents) : number_of_vertices_(contents.number_of_vertices), number_of_edges_(contents.number_of_edges), offsets_(std::move(*contents.offsets.release())), indices_(std::move(*contents.indices.release())), - edge_data_(std::move(*contents.edge_data.release())) - { - } + edge_data_(std::move(*contents.edge_data.release())) {} VT number_of_vertices(void) { return number_of_vertices_; } ET number_of_edges(void) { return number_of_edges_; } @@ -448,15 +435,13 @@ class GraphCompressedSparseBase { VT *indices(void) { return static_cast(indices_.data()); } WT *edge_data(void) { return static_cast(edge_data_.data()); } - GraphSparseContents release() noexcept - { + GraphSparseContents release() noexcept { VT number_of_vertices = number_of_vertices_; - ET number_of_edges = number_of_edges_; - number_of_vertices_ = 0; - number_of_edges_ = 0; + ET number_of_edges = number_of_edges_; + number_of_vertices_ = 0; + number_of_edges_ = 0; return GraphSparseContents{ - number_of_vertices, - number_of_edges, + number_of_vertices, number_of_edges, std::make_unique(std::move(offsets_)), std::make_unique(std::move(indices_)), std::make_unique(std::move(edge_data_))}; @@ -493,28 +478,23 @@ class GraphCSR : public GraphCompressedSparseBase { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCSR(VT number_of_vertices_, - ET number_of_edges_, - bool has_data_ = false, - cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + GraphCSR( + VT number_of_vertices_, ET number_of_edges_, bool has_data_ = false, + cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) : GraphCompressedSparseBase( - number_of_vertices_, number_of_edges_, has_data_, stream, mr) - { - } + number_of_vertices_, number_of_edges_, has_data_, stream, mr) {} GraphCSR(GraphSparseContents &&contents) - : GraphCompressedSparseBase(std::move(contents)) - { - } - - GraphCSRView view(void) noexcept - { - return GraphCSRView(GraphCompressedSparseBase::offsets(), - GraphCompressedSparseBase::indices(), - GraphCompressedSparseBase::edge_data(), - GraphCompressedSparseBase::number_of_vertices(), - GraphCompressedSparseBase::number_of_edges()); + : GraphCompressedSparseBase(std::move(contents)) {} + + GraphCSRView view(void) noexcept { + return GraphCSRView( + GraphCompressedSparseBase::offsets(), + GraphCompressedSparseBase::indices(), + GraphCompressedSparseBase::edge_data(), + GraphCompressedSparseBase::number_of_vertices(), + GraphCompressedSparseBase::number_of_edges()); } }; @@ -546,28 +526,23 @@ class GraphCSC : public GraphCompressedSparseBase { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCSC(VT number_of_vertices_, - ET number_of_edges_, - bool has_data_ = false, - cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + GraphCSC( + VT number_of_vertices_, ET number_of_edges_, bool has_data_ = false, + cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) : GraphCompressedSparseBase( - number_of_vertices_, number_of_edges_, has_data_, stream, mr) - { - } + number_of_vertices_, number_of_edges_, has_data_, stream, mr) {} GraphCSC(GraphSparseContents &&contents) - : 
GraphCompressedSparseBase(contents) - { - } - - GraphCSCView view(void) noexcept - { - return GraphCSCView(GraphCompressedSparseBase::offsets(), - GraphCompressedSparseBase::indices(), - GraphCompressedSparseBase::edge_data(), - GraphCompressedSparseBase::number_of_vertices(), - GraphCompressedSparseBase::number_of_edges()); + : GraphCompressedSparseBase(contents) {} + + GraphCSCView view(void) noexcept { + return GraphCSCView( + GraphCompressedSparseBase::offsets(), + GraphCompressedSparseBase::indices(), + GraphCompressedSparseBase::edge_data(), + GraphCompressedSparseBase::number_of_vertices(), + GraphCompressedSparseBase::number_of_edges()); } }; From 10496dedbe4839026a25c8f2f476d15add79aa0d Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 15 Jun 2020 11:42:44 -0500 Subject: [PATCH 109/189] Fixed sparse matrix cnstr. --- cpp/include/raft/spectral/matrix_wrappers.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 779fbb9dc8..0f3a7d1e4c 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -97,11 +97,11 @@ struct sparse_matrix_t { sparse_matrix_t( GraphCSRView const& csr_view) - : row_offsets_(csr_view.offsets_), - col_indices_(csr_view.indices_), - values_(csr_view.edge_data_), - nrows_(csr_view.number_of_vertices_), - nnz_(csr_view.number_of_edges_) {} + : row_offsets_(csr_view.offsets), + col_indices_(csr_view.indices), + values_(csr_view.edge_data), + nrows_(csr_view.number_of_vertices), + nnz_(csr_view.number_of_edges) {} virtual ~sparse_matrix_t(void) = default; // virtual because used as base for following matrix types From 38e5ca9b53a01df67c97ba347b87dfdbdbc27ccb Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 15 Jun 2020 11:52:41 -0500 Subject: [PATCH 110/189] Added test for spectral matrix functionality. Compilation checker for now. 
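
The new test only exercises construction (null pointers, an empty graph view); the numeric behavior sketched in the laplacian_matrix_t TODOs, namely that its diagonal is the weighted degree vector A * ones, is not covered yet. A host-side sketch of that check on hand-built CSR arrays for a 3-vertex triangle graph (data invented for illustration; the wrappers themselves expect device pointers):

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> ro = {0, 2, 4, 6};        // row offsets, nrows + 1 entries
      std::vector<int> ci = {1, 2, 0, 2, 0, 1};  // column indices, nnz entries
      std::vector<double> vs(6, 1.0);            // unit edge weights

      int nrows = 3;
      std::vector<double> degree(nrows, 0.0);
      for (int r = 0; r < nrows; ++r)
        for (int e = ro[r]; e < ro[r + 1]; ++e)
          degree[r] += vs[e];  // row sum == (A * ones)[r]

      for (int r = 0; r < nrows; ++r)
        printf("deg[%d] = %g\n", r, degree[r]);  // expect 2 2 2 for a triangle
      return 0;
    }
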
--- cpp/CMakeLists.txt | 3 +- cpp/include/raft/spectral/matrix_wrappers.hpp | 6 +- cpp/test/spectral_matrix.cpp | 60 +++++++++++++++++++ 3 files changed, 65 insertions(+), 4 deletions(-) create mode 100644 cpp/test/spectral_matrix.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 947d0318cb..de7f3e0e34 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -205,7 +205,8 @@ if(BUILD_RAFT_TESTS) test/handle.cpp test/mr/device/buffer.cpp test/mr/host/buffer.cpp - test/test.cpp) + test/test.cpp + test/spectral_matrix.cpp) target_include_directories(test_raft PRIVATE diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 0f3a7d1e4c..74dbd38be6 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -87,8 +87,8 @@ class vector_t { template struct sparse_matrix_t { sparse_matrix_t(index_type const* row_offsets, index_type const* col_indices, - value_type const* values, index_type const nnz, - index_type const nrows) + value_type const* values, index_type const nrows, + index_type const nnz) : row_offsets_(row_offsets), col_indices_(col_indices), values_(values), @@ -146,7 +146,7 @@ struct laplacian_matrix_t : sparse_matrix_t { handle_t const& raft_handle, GraphCSRView const& csr_view) : sparse_matrix_t(csr_view), - diagonal_(raft_handle, csr_view.number_of_vertices_) { + diagonal_(raft_handle, csr_view.number_of_vertices) { //TODO: more work, here: // // vector_t ones(csr_view.number_of_vertices_); diff --git a/cpp/test/spectral_matrix.cpp b/cpp/test/spectral_matrix.cpp new file mode 100644 index 0000000000..24fd31875e --- /dev/null +++ b/cpp/test/spectral_matrix.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include + +namespace raft { + +TEST(Raft, SpectralMatrices) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + // ASSERT_EQ(nullptr, h.get_stream()); + // ASSERT_NE(nullptr, h.get_cublas_handle()); + // ASSERT_NE(nullptr, h.get_cusolver_dn_handle()); + // ASSERT_NE(nullptr, h.get_cusolver_sp_handle()); + // ASSERT_NE(nullptr, h.get_cusparse_handle()); + + int const sz = 10; + vector_t d_v{h, sz}; + + GraphCSRView empty_graph; + + index_type* ro{nullptr}; + index_type* ci{nullptr}; + value_type* vs{nullptr}; + index_type nnz = 0; + index_type nrows = 0; + sparse_matrix_t sm1{ro, ci, vs, nrows, nnz}; + sparse_matrix_t sm2{empty_graph}; + + laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; + laplacian_matrix_t lm2{h, empty_graph}; + + modularity_matrix_t mm1{h, ro, ci, vs, nrows, nnz}; + modularity_matrix_t mm2{h, empty_graph}; +} + +} // namespace raft From 86dc155bb1c694e557afb211c31f3849cbc2e246 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 15 Jun 2020 12:08:17 -0500 Subject: [PATCH 111/189] Heart-beat tests for spectral matrices. --- cpp/test/spectral_matrix.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/test/spectral_matrix.cpp b/cpp/test/spectral_matrix.cpp index 24fd31875e..46d753550f 100644 --- a/cpp/test/spectral_matrix.cpp +++ b/cpp/test/spectral_matrix.cpp @@ -31,11 +31,6 @@ TEST(Raft, SpectralMatrices) { handle_t h; ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); - // ASSERT_EQ(nullptr, h.get_stream()); - // ASSERT_NE(nullptr, h.get_cublas_handle()); - // ASSERT_NE(nullptr, h.get_cusolver_dn_handle()); - // ASSERT_NE(nullptr, h.get_cusolver_sp_handle()); - // ASSERT_NE(nullptr, h.get_cusparse_handle()); int const sz = 10; vector_t d_v{h, sz}; @@ -49,12 +44,18 @@ TEST(Raft, SpectralMatrices) { index_type nrows = 0; sparse_matrix_t sm1{ro, ci, vs, nrows, nnz}; sparse_matrix_t sm2{empty_graph}; + ASSERT_EQ(nullptr, sm1.row_offsets_); + ASSERT_EQ(nullptr, sm2.row_offsets_); laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; laplacian_matrix_t lm2{h, empty_graph}; + ASSERT_EQ(nullptr, lm1.diagonal_.raw()); + ASSERT_EQ(nullptr, lm2.diagonal_.raw()); modularity_matrix_t mm1{h, ro, ci, vs, nrows, nnz}; modularity_matrix_t mm2{h, empty_graph}; + ASSERT_EQ(nullptr, mm1.diagonal_.raw()); + ASSERT_EQ(nullptr, mm2.diagonal_.raw()); } } // namespace raft From 554554397d44a91fc50ef398e1ccf3f00123d01f Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 15 Jun 2020 16:04:55 -0500 Subject: [PATCH 112/189] Fixed lapack dependencies on dense cusolver. 
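
Most of the lanczos.hpp changes below are plumbing (const handle references, the renamed nrows_ member, qualified std::max/std::min), but for orientation: performLanczosIteration applies the standard three-term recurrence w = A v_j; alpha_j = v_j . w; w -= alpha_j v_j + beta_{j-1} v_{j-1}; beta_j = ||w||; v_{j+1} = w / beta_j, using cuBLAS on device. A dense host-side sketch of two such steps on a toy symmetric matrix (illustrative data only, not the library's API):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
      const int n = 3;
      double A[3][3] = {{2, 1, 0}, {1, 2, 1}, {0, 1, 2}};  // symmetric toy matrix
      std::vector<double> v_prev(n, 0.0), v(n, 0.0), w(n, 0.0);
      v[0] = 1.0;  // unit starting vector
      double beta_prev = 0.0;

      for (int j = 0; j < 2; ++j) {
        for (int r = 0; r < n; ++r) {  // w = A * v_j
          w[r] = 0.0;
          for (int c = 0; c < n; ++c) w[r] += A[r][c] * v[c];
        }
        double alpha = 0.0;  // alpha_j = v_j . w
        for (int r = 0; r < n; ++r) alpha += v[r] * w[r];
        for (int r = 0; r < n; ++r) w[r] -= alpha * v[r] + beta_prev * v_prev[r];
        double beta = 0.0;  // beta_j = ||w||
        for (int r = 0; r < n; ++r) beta += w[r] * w[r];
        beta = std::sqrt(beta);
        printf("alpha_%d = %g, beta_%d = %g\n", j, alpha, j, beta);
        for (int r = 0; r < n; ++r) {  // v_{j+1} = w / beta_j
          v_prev[r] = v[r];
          v[r] = w[r] / beta;
        }
        beta_prev = beta;
      }
      return 0;
    }

The alpha/beta pairs form the tridiagonal Lanczos system whose eigenpairs approximate those of A; the implicit-restart logic in lanczosRestart trims that system back down when it exceeds restartIter.
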
--- cpp/CMakeLists.txt | 3 +- cpp/include/raft/spectral/lanczos.hpp | 34 +- cpp/include/raft/spectral/lapack.hpp | 538 +++++++++----------------- cpp/test/spectral_solvers.cpp | 61 +++ 4 files changed, 272 insertions(+), 364 deletions(-) create mode 100644 cpp/test/spectral_solvers.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index de7f3e0e34..879b214e62 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -206,7 +206,8 @@ if(BUILD_RAFT_TESTS) test/mr/device/buffer.cpp test/mr/host/buffer.cpp test/test.cpp - test/spectral_matrix.cpp) + test/spectral_matrix.cpp + test/spectral_solvers.cpp) target_include_directories(test_raft PRIVATE diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index c4ab61b78e..e9682f5c28 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -28,6 +28,7 @@ #include #include #include +#include #include namespace raft { @@ -69,7 +70,7 @@ namespace { */ template int performLanczosIteration( - handle_t handle, sparse_matrix_t const *A, + handle_t const &handle, sparse_matrix_t const *A, IndexType_ *iter, IndexType_ maxIter, ValueType_ shift, ValueType_ tol, bool reorthogonalize, ValueType_ *__restrict__ alpha_host, ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, @@ -82,13 +83,14 @@ int performLanczosIteration( const ValueType_ one = 1; const ValueType_ negOne = -1; const ValueType_ zero = 0; + ValueType_ alpha; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); RAFT_EXPECT(A != nullptr, "Null matrix pointer."); - IndexType_ n = A->nrows; + IndexType_ n = A->nrows_; // ------------------------------------------------------- // Compute second Lanczos vector @@ -108,7 +110,7 @@ int performLanczosIteration( lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); - auto alpha = -alpha_host[0]; + alpha = -alpha_host[0]; CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, @@ -443,7 +445,7 @@ static int francisQRIteration(IndexType_ n, ValueType_ shift1, */ template static int lanczosRestart( - handle_t handle, IndexType_ n, IndexType_ iter, IndexType_ iter_new, + handle_t const &handle, IndexType_ n, IndexType_ iter, IndexType_ iter_new, ValueType_ *shiftUpper, ValueType_ *shiftLower, ValueType_ *__restrict__ alpha_host, ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ V_host, ValueType_ *__restrict__ work_host, @@ -500,16 +502,16 @@ static int lanczosRestart( *shiftUpper = ritzVals_host[iter - 1]; *shiftLower = ritzVals_host[iter_new]; } else { - *shiftUpper = max(*shiftUpper, ritzVals_host[iter - 1]); - *shiftLower = min(*shiftLower, ritzVals_host[iter_new]); + *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - 1]); + *shiftLower = std::min(*shiftLower, ritzVals_host[iter_new]); } } else { if (*shiftLower > *shiftUpper) { *shiftUpper = ritzVals_host[iter - iter_new - 1]; *shiftLower = ritzVals_host[0]; } else { - *shiftUpper = max(*shiftUpper, ritzVals_host[iter - iter_new - 1]); - *shiftLower = min(*shiftLower, ritzVals_host[0]); + *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - iter_new - 1]); + *shiftLower = std::min(*shiftLower, ritzVals_host[0]); } } @@ -617,7 +619,7 @@ static int lanczosRestart( */ template int computeSmallestEigenvectors( - handle_t handle, sparse_matrix_t const *A, + handle_t const &handle, sparse_matrix_t const *A, IndexType_ nEigVecs, 
IndexType_ maxIter, IndexType_ restartIter, ValueType_ tol, bool reorthogonalize, IndexType_ *effIter, IndexType_ *totalIter, ValueType_ *shift, ValueType_ *__restrict__ alpha_host, @@ -633,7 +635,7 @@ int computeSmallestEigenvectors( const ValueType_ zero = 0; // Matrix dimension - IndexType_ n = A->nrows; + IndexType_ n = A->nrows_; // Shift for implicit restart ValueType_ shiftUpper; @@ -851,13 +853,13 @@ int computeSmallestEigenvectors( */ template int computeSmallestEigenvectors( - handle_t handle, sparse_matrix_t const &A, + handle_t const &handle, sparse_matrix_t const &A, IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, ValueType_ tol, bool reorthogonalize, IndexType_ &iter, ValueType_ *__restrict__ eigVals_dev, ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { // Matrix dimension - IndexType_ n = A.nrows; + IndexType_ n = A.nrows_; // Check that parameters are valid RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); @@ -936,7 +938,7 @@ int computeSmallestEigenvectors( */ template int computeLargestEigenvectors( - handle_t handle, sparse_matrix_t const *A, + handle_t const &handle, sparse_matrix_t const *A, IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, ValueType_ tol, bool reorthogonalize, IndexType_ *effIter, IndexType_ *totalIter, ValueType_ *__restrict__ alpha_host, @@ -952,7 +954,7 @@ int computeLargestEigenvectors( const ValueType_ zero = 0; // Matrix dimension - IndexType_ n = A->nrows; + IndexType_ n = A->nrows_; // Lanczos iteration counters IndexType_ maxIter_curr = restartIter; // Maximum size of Lanczos system @@ -1170,7 +1172,7 @@ int computeLargestEigenvectors( * @return error flag. */ template -int computeLargestEigenvectors(handle_t handle, +int computeLargestEigenvectors(handle_t const &handle, sparse_matrix_t const &A, IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, ValueType_ tol, @@ -1179,7 +1181,7 @@ int computeLargestEigenvectors(handle_t handle, ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed = 123456) { // Matrix dimension - IndexType_ n = A.nrows; + IndexType_ n = A.nrows_; // Check that parameters are valid RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index d86343990d..0dab3d57b2 100644 --- a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -13,12 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - + #pragma once +#include -#include -#include #include +#include +#include //for now; TODO: check if/where this `define` should be; // @@ -26,388 +27,249 @@ namespace raft { -#define lapackCheckError(status) \ - { \ - if (status < 0) { \ - std::stringstream ss; \ - ss << "Lapack error: argument number " << -status << " had an illegal value."; \ - RAFT_FAIL(ss.str()); \ - } else if (status > 0) \ - RAFT_FAIL("Lapack error: internal error."); \ +#define lapackCheckError(status) \ + { \ + if (status < 0) { \ + std::stringstream ss; \ + ss << "Lapack error: argument number " << -status \ + << " had an illegal value."; \ + RAFT_FAIL(ss.str()); \ + } else if (status > 0) \ + RAFT_FAIL("Lapack error: internal error."); \ } - -extern "C" void sgeqrf_( - int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info); -extern "C" void dgeqrf_( - int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info); -extern "C" void sormqr_(char *side, - char *trans, - int *m, - int *n, - int *k, - float *a, - int *lda, - const float *tau, - float *c, - int *ldc, - float *work, - int *lwork, - int *info); -extern "C" void dormqr_(char *side, - char *trans, - int *m, - int *n, - int *k, - double *a, - int *lda, - const double *tau, - double *c, - int *ldc, - double *work, - int *lwork, - int *info); -extern "C" int dgeev_(char *jobvl, - char *jobvr, - int *n, - double *a, - int *lda, - double *wr, - double *wi, - double *vl, - int *ldvl, - double *vr, - int *ldvr, - double *work, - int *lwork, - int *info); - -extern "C" int sgeev_(char *jobvl, - char *jobvr, - int *n, - float *a, - int *lda, - float *wr, - float *wi, - float *vl, - int *ldvl, - float *vr, - int *ldvr, - float *work, - int *lwork, - int *info); - +extern "C" void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, + float *work, int *lwork, int *info); +extern "C" void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, + double *work, int *lwork, int *info); +extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k, + float *a, int *lda, const float *tau, float *c, + int *ldc, float *work, int *lwork, int *info); +extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k, + double *a, int *lda, const double *tau, double *c, + int *ldc, double *work, int *lwork, int *info); +extern "C" int dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, + double *wr, double *wi, double *vl, int *ldvl, double *vr, + int *ldvr, double *work, int *lwork, int *info); + +extern "C" int sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, + float *wr, float *wi, float *vl, int *ldvl, float *vr, + int *ldvr, float *work, int *lwork, int *info); + +extern "C" cusolverStatus_t cusolverDnSgemmHost( + cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, + const float *alpha, const float *A, int lda, const float *B, int ldb, + const float *beta, float *C, int ldc); + +extern "C" cusolverStatus_t cusolverDnDgemmHost( + cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, + const double *alpha, const double *A, int lda, const double *B, int ldb, + const double *beta, double *C, int ldc); + +extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float *d, float *e, + int *info); + +extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double *d, double *e, + int *info); + +extern "C" cusolverStatus_t cusolverDnSsteqrHost(const signed char *compz, + int n, float *d, float *e, + float *z, int ldz, float *work, + int *info); 
+ +extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char *compz, + int n, double *d, double *e, + double *z, int ldz, + double *work, int *info); template -class Lapack -{ -private: - Lapack(); - ~Lapack(); -public: - static void check_lapack_enabled(); - - static void gemm(bool transa, bool transb, int m, int n, int k, T alpha, const T * A, int lda, const T * B, int ldb, T beta, T * C, int ldc); - - // special QR for lanczos - static void sterf(int n, T * d, T * e); - static void steqr(char compz, int n, T * d, T * e, T * z, int ldz, T * work); - - // QR - // computes the QR factorization of a general matrix - static void geqrf (int m, int n, T *a, int lda, T *tau, T *work, int *lwork); - // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. - //static void orgqr( int m, int n, int k, T* a, int lda, const T* tau, T* work, int* lwork ); - // multiply C by implicit Q - static void ormqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); - //static void unmqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); - //static void qrf (int n, T *H, T *Q, T *R); - - //static void hseqr (T* Q, T* R, T* eigenvalues,T* eigenvectors, int dim, int ldh, int ldq); - static void geev(T* A, T* eigenvalues, int dim, int lda); - static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr); - static void geev(T* A, T* eigenvalues_r, T* eigenvalues_i, T* eigenvectors_r, T* eigenvectors_i, int dim, int lda, int ldvr); - -private: - static void lapack_gemm(const char transa, - const char transb, - int m, - int n, - int k, - float alpha, - const float *a, - int lda, - const float *b, - int ldb, - float beta, - float *c, - int ldc) - { - cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnSgemmHost( - cublas_transa, cublas_transb, m, n, k, &alpha, (float *)a, lda, (float *)b, ldb, &beta, c, ldc); +class Lapack { + private: + Lapack(); + ~Lapack(); + + public: + static void check_lapack_enabled(); + + static void gemm(bool transa, bool transb, int m, int n, int k, T alpha, + const T *A, int lda, const T *B, int ldb, T beta, T *C, + int ldc); + + // special QR for lanczos + static void sterf(int n, T *d, T *e); + static void steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work); + + // QR + // computes the QR factorization of a general matrix + static void geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork); + // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. 
+ //static void orgqr( int m, int n, int k, T* a, int lda, const T* tau, T* work, int* lwork ); + // multiply C by implicit Q + static void ormqr(bool right_side, bool transq, int m, int n, int k, T *a, + int lda, T *tau, T *c, int ldc, T *work, int *lwork); + //static void unmqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); + //static void qrf (int n, T *H, T *Q, T *R); + + //static void hseqr (T* Q, T* R, T* eigenvalues,T* eigenvectors, int dim, int ldh, int ldq); + static void geev(T *A, T *eigenvalues, int dim, int lda); + static void geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, + int ldvr); + static void geev(T *A, T *eigenvalues_r, T *eigenvalues_i, T *eigenvectors_r, + T *eigenvectors_i, int dim, int lda, int ldvr); + + private: + static void lapack_gemm(const char transa, const char transb, int m, int n, + int k, float alpha, const float *a, int lda, + const float *b, int ldb, float beta, float *c, + int ldc) { + cublasOperation_t cublas_transa = + (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = + (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnSgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, + (float *)a, lda, (float *)b, ldb, &beta, c, ldc); } - static void lapack_gemm(const signed char transa, - const signed char transb, - int m, - int n, - int k, - double alpha, - const double *a, - int lda, - const double *b, - int ldb, - double beta, - double *c, - int ldc) - { - cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnDgemmHost(cublas_transa, - cublas_transb, - m, - n, - k, - &alpha, - (double *)a, - lda, - (double *)b, - ldb, - &beta, - c, - ldc); + static void lapack_gemm(const signed char transa, const signed char transb, + int m, int n, int k, double alpha, const double *a, + int lda, const double *b, int ldb, double beta, + double *c, int ldc) { + cublasOperation_t cublas_transa = + (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = + (transb == 'N') ? 
CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnDgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, + (double *)a, lda, (double *)b, ldb, &beta, c, ldc); } - - static void lapack_sterf(int n, float *d, float *e, int *info) - { + static void lapack_sterf(int n, float *d, float *e, int *info) { cusolverDnSsterfHost(n, d, e, info); } - static void lapack_sterf(int n, double *d, double *e, int *info) - { + static void lapack_sterf(int n, double *d, double *e, int *info) { cusolverDnDsterfHost(n, d, e, info); } - static void void lapack_steqr(const signed char compz, int n, float *d, float *e, float *z, int ldz, float *work, int *info) - { + static void lapack_steqr(const signed char compz, int n, float *d, float *e, + float *z, int ldz, float *work, int *info) { cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info); } - static void lapack_steqr(const signed char compz, int n, double *d, double *e, double *z, int ldz, double *work, int *info) - { + static void lapack_steqr(const signed char compz, int n, double *d, double *e, + double *z, int ldz, double *work, int *info) { cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info); } - static void lapack_geqrf(int m, int n, float *a, int lda, float *tau, float *work, int *lwork, int *info) - { + static void lapack_geqrf(int m, int n, float *a, int lda, float *tau, + float *work, int *lwork, int *info) { sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); } - - static void lapack_geqrf(int m, int n, double *a, int lda, double *tau, double *work, int *lwork, int *info) - { + + static void lapack_geqrf(int m, int n, double *a, int lda, double *tau, + double *work, int *lwork, int *info) { dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); } - static void lapack_ormqr(char side, - char trans, - int m, - int n, - int k, - float *a, - int lda, - float *tau, - float *c, - int ldc, - float *work, - int *lwork, - int *info) - { - sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); + static void lapack_ormqr(char side, char trans, int m, int n, int k, float *a, + int lda, float *tau, float *c, int ldc, float *work, + int *lwork, int *info) { + sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, + info); } - - static void lapack_ormqr(char side, - char trans, - int m, - int n, - int k, - double *a, - int lda, - double *tau, - double *c, - int ldc, - double *work, - int *lwork, - int *info) - { - dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); + + static void lapack_ormqr(char side, char trans, int m, int n, int k, + double *a, int lda, double *tau, double *c, int ldc, + double *work, int *lwork, int *info) { + dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, + info); } - static int lapack_geev_dispatch(char *jobvl, - char *jobvr, - int *n, - double *a, - int *lda, - double *wr, - double *wi, - double *vl, - int *ldvl, - double *vr, - int *ldvr, - double *work, - int *lwork, - int *info) - { - return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); + static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, double *a, + int *lda, double *wr, double *wi, double *vl, + int *ldvl, double *vr, int *ldvr, + double *work, int *lwork, int *info) { + return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, + lwork, info); } - static int lapack_geev_dispatch(char *jobvl, - char *jobvr, - int *n, - float *a, - int *lda, - float *wr, - float *wi, - float *vl, - int *ldvl, - float *vr, - int *ldvr, - float *work, - int 
*lwork, - int *info) - { - return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); + static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, float *a, + int *lda, float *wr, float *wi, float *vl, + int *ldvl, float *vr, int *ldvr, float *work, + int *lwork, int *info) { + return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, + lwork, info); } // real eigenvalues - static - void lapack_geev(T *A, T *eigenvalues, int dim, int lda) - { + static void lapack_geev(T *A, T *eigenvalues, int dim, int lda) { char job = 'N'; std::vector WI(dim); - int ldv = 1; - T *vl = 0; + int ldv = 1; + T *vl = 0; int work_size = 6 * dim; std::vector work(work_size); int info; - lapack_geev_dispatch(&job, - &job, - &dim, - A, - &lda, - eigenvalues, - WI.data(), - vl, - &ldv, - vl, - &ldv, - work.data(), - &work_size, - &info); + lapack_geev_dispatch(&job, &job, &dim, A, &lda, eigenvalues, WI.data(), vl, + &ldv, vl, &ldv, work.data(), &work_size, &info); lapackCheckError(info); } - + // real eigenpairs - static - void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, int ldvr) - { + static void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim, + int lda, int ldvr) { char jobvl = 'N'; char jobvr = 'V'; std::vector WI(dim); int work_size = 6 * dim; - T *vl = 0; - int ldvl = 1; + T *vl = 0; + int ldvl = 1; std::vector work(work_size); int info; - lapack_geev_dispatch(&jobvl, - &jobvr, - &dim, - A, - &lda, - eigenvalues, - WI.data(), - vl, - &ldvl, - eigenvectors, - &ldvr, - work.data(), - &work_size, - &info); + lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues, WI.data(), + vl, &ldvl, eigenvectors, &ldvr, work.data(), + &work_size, &info); lapackCheckError(info); } - + // complex eigenpairs - static - void lapack_geev(T *A, - T *eigenvalues_r, - T *eigenvalues_i, - T *eigenvectors_r, - T *eigenvectors_i, - int dim, - int lda, - int ldvr) - { - char jobvl = 'N'; - char jobvr = 'V'; + static void lapack_geev(T *A, T *eigenvalues_r, T *eigenvalues_i, + T *eigenvectors_r, T *eigenvectors_i, int dim, + int lda, int ldvr) { + char jobvl = 'N'; + char jobvr = 'V'; int work_size = 8 * dim; - int ldvl = 1; + int ldvl = 1; std::vector work(work_size); int info; - lapack_geev_dispatch(&jobvl, - &jobvr, - &dim, - A, - &lda, - eigenvalues_r, - eigenvalues_i, - 0, - &ldvl, - eigenvectors_r, - &ldvr, - work.data(), - &work_size, - &info); + lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues_r, + eigenvalues_i, 0, &ldvl, eigenvectors_r, &ldvr, + work.data(), &work_size, &info); lapackCheckError(info); } - }; template -void Lapack::check_lapack_enabled() -{ +void Lapack::check_lapack_enabled() { #ifndef USE_LAPACK RAFT_FAIL("Error: LAPACK not enabled."); #endif } template -void Lapack::gemm(bool transa, - bool transb, - int m, - int n, - int k, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) -{ +void Lapack::gemm(bool transa, bool transb, int m, int n, int k, T alpha, + const T *A, int lda, const T *B, int ldb, T beta, T *C, + int ldc) { // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK const char transA_char = transa ? 'T' : 'N'; const char transB_char = transb ? 
'T' : 'N'; - lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); //#endif } template -void Lapack::sterf(int n, T *d, T *e) -{ +void Lapack::sterf(int n, T *d, T *e) { // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK int info; @@ -417,8 +279,7 @@ void Lapack::sterf(int n, T *d, T *e) } template -void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) -{ +void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) { // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK int info; @@ -428,8 +289,8 @@ void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) } template -void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork) -{ +void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, + int *lwork) { check_lapack_enabled(); #ifdef USE_LAPACK int info; @@ -438,22 +299,11 @@ void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork) #endif } template -void Lapack::ormqr(bool right_side, - bool transq, - int m, - int n, - int k, - T *a, - int lda, - T *tau, - T *c, - int ldc, - T *work, - int *lwork) -{ +void Lapack::ormqr(bool right_side, bool transq, int m, int n, int k, T *a, + int lda, T *tau, T *c, int ldc, T *work, int *lwork) { check_lapack_enabled(); #ifdef USE_LAPACK - char side = right_side ? 'R' : 'L'; + char side = right_side ? 'R' : 'L'; char trans = transq ? 'T' : 'N'; int info; lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); @@ -463,8 +313,7 @@ void Lapack::ormqr(bool right_side, // real eigenvalues template -void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) -{ +void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) { check_lapack_enabled(); #ifdef USE_LAPACK lapack_geev(A, eigenvalues, dim, lda); @@ -472,8 +321,8 @@ void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) } // real eigenpairs template -void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, int ldvr) -{ +void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, + int ldvr) { check_lapack_enabled(); #ifdef USE_LAPACK lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr); @@ -481,19 +330,14 @@ void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, in } // complex eigenpairs template -void Lapack::geev(T *A, - T *eigenvalues_r, - T *eigenvalues_i, - T *eigenvectors_r, - T *eigenvectors_i, - int dim, - int lda, - int ldvr) -{ +void Lapack::geev(T *A, T *eigenvalues_r, T *eigenvalues_i, + T *eigenvectors_r, T *eigenvectors_i, int dim, int lda, + int ldvr) { check_lapack_enabled(); #ifdef USE_LAPACK - lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, dim, lda, ldvr); + lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, + dim, lda, ldvr); #endif } - -} // namespace raft + +} // namespace raft diff --git a/cpp/test/spectral_solvers.cpp b/cpp/test/spectral_solvers.cpp new file mode 100644 index 0000000000..c1bc9738ae --- /dev/null +++ b/cpp/test/spectral_solvers.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +//#include +#include + +namespace raft { + +TEST(Raft, SpectralSolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + index_type* ro{nullptr}; + index_type* ci{nullptr}; + value_type* vs{nullptr}; + index_type nnz = 0; + index_type nrows = 0; + sparse_matrix_t<index_type, value_type> sm1{ro, ci, vs, nrows, nnz}; + ASSERT_EQ(nullptr, sm1.row_offsets_); + + laplacian_matrix_t<index_type, value_type> lm1{h, ro, ci, vs, nrows, nnz}; + ASSERT_EQ(nullptr, lm1.diagonal_.raw()); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + index_type iter; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + unsigned long long seed{100110021003}; + computeSmallestEigenvectors(h, lm1, neigvs, maxiter, restart_iter, tol, + reorthog, iter, eigvals, eigvecs, seed); +} + +} // namespace raft From ab4e8a66083b6c4c59d406f5a4e3062a321bcef8 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 15 Jun 2020 17:18:00 -0500 Subject: [PATCH 113/189] Added largest eigenvector heart-beat test. --- cpp/test/spectral_solvers.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/test/spectral_solvers.cpp b/cpp/test/spectral_solvers.cpp index c1bc9738ae..068d61e9c1 100644 --- a/cpp/test/spectral_solvers.cpp +++ b/cpp/test/spectral_solvers.cpp @@ -56,6 +56,9 @@ TEST(Raft, SpectralSolvers) { unsigned long long seed{100110021003}; computeSmallestEigenvectors(h, lm1, neigvs, maxiter, restart_iter, tol, reorthog, iter, eigvals, eigvecs, seed); + + computeLargestEigenvectors(h, lm1, neigvs, maxiter, restart_iter, tol, + reorthog, iter, eigvals, eigvecs, seed); } } // namespace raft From 058aae701cbd0381f95834e91d07669e08534a91 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 15 Jun 2020 18:49:55 -0500 Subject: [PATCH 114/189] Heart-beat for kmeans.
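The heart-beat test added below (spectral_solvers.cu) drives the reworked kmeans entry point with null device buffers, so it only checks that everything compiles and dispatches. For orientation, a minimal sketch of the intended call with real inputs; it assumes obs is a d-by-n observation matrix resident in device memory, and it must live in a CUDA translation unit because of the thrust::cuda::par policy and the kernels behind it:

#include <thrust/system/cuda/execution_policy.h>

#include <raft/handle.hpp>
#include <raft/spectral/kmeans.hpp>

// d_obs: d-by-n observations (device); d_codes: n cluster ids (device, out)
void kmeans_sketch(raft::handle_t const& h, int n, int d, int k,
                   double const* d_obs, int* d_codes) {
  auto stream = h.get_stream();
  double residual{0};
  int iters{0};
  raft::kmeans(h, thrust::cuda::par.on(stream), n, d, k, 1.0e-4 /*tol*/,
               100 /*maxiter*/, d_obs, d_codes, residual, iters,
               123456ull /*seed*/);
}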
--- cpp/CMakeLists.txt | 3 +- cpp/include/raft/spectral/kmeans.hpp | 59 ++++---- cpp/include/raft/spectral/sm_utils.hpp | 180 +++++++++++++++++++++---- cpp/test/spectral_solvers.cpp | 1 - cpp/test/spectral_solvers.cu | 54 ++++++++ 5 files changed, 238 insertions(+), 59 deletions(-) create mode 100644 cpp/test/spectral_solvers.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 879b214e62..d96f936a3d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -207,7 +207,8 @@ if(BUILD_RAFT_TESTS) test/mr/host/buffer.cpp test/test.cpp test/spectral_matrix.cpp - test/spectral_solvers.cpp) + test/spectral_solvers.cpp + test/spectral_solvers.cu) target_include_directories(test_raft PRIVATE diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index f57a4c1be5..444bf2491a 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -113,7 +114,7 @@ static __global__ void computeDistances( // Write result to global memory if (threadIdx.x == 0) - atomicFPAdd(dists + IDX(gidz, gidy, n), dist_private); + utils::atomicFPAdd(dists + IDX(gidz, gidy, n), dist_private); // Move to another observation vector gidz += blockDim.z * gridDim.z; @@ -325,7 +326,7 @@ static __global__ void divideCentroids( * @return Zero if successful. Otherwise non-zero. */ template -static int chooseNewCentroid(handle_t handle, +static int chooseNewCentroid(handle_t const& handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, IndexType_ d, IndexType_ k, ValueType_ rand, const ValueType_* __restrict__ obs, @@ -334,7 +335,7 @@ static int chooseNewCentroid(handle_t handle, // Cumulative sum of distances ValueType_* distsCumSum = dists + n; // Residual sum of squares - ValueType_ distsSum; + ValueType_ distsSum{0}; // Observation vector that is chosen as new centroid IndexType_ obsIndex; @@ -391,7 +392,7 @@ static int chooseNewCentroid(handle_t handle, */ template static int initializeCentroids( - handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, + handle_t const& handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, ValueType_* __restrict__ centroids, IndexType_* __restrict__ codes, IndexType_* __restrict__ clusterSizes, ValueType_* __restrict__ dists, @@ -443,10 +444,10 @@ static int initializeCentroids( CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_), stream)); computeDistances<<>>( n, d, 1, obs, centroids, dists); - cudaCheckError() + CUDA_CHECK_LAST(); - // Choose remaining centroids - for (i = 1; i < k; ++i) { + // Choose remaining centroids + for (i = 1; i < k; ++i) { // Choose ith centroid if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) @@ -497,14 +498,12 @@ static int initializeCentroids( * @return Zero if successful. Otherwise non-zero. 
*/ template -static int assignCentroids(handle_t handle, ThrustExePolicy thrust_exec_policy, - IndexType_ n, IndexType_ d, IndexType_ k, - const ValueType_* __restrict__ obs, - const ValueType_* __restrict__ centroids, - ValueType_* __restrict__ dists, - IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes, - ValueType_* residual_host) { +static int assignCentroids( + handle_t const& handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, + IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, + const ValueType_* __restrict__ centroids, ValueType_* __restrict__ dists, + IndexType_* __restrict__ codes, IndexType_* __restrict__ clusterSizes, + ValueType_* residual_host) { // CUDA grid dimensions dim3 blockDim, gridDim; @@ -565,8 +564,9 @@ static int assignCentroids(handle_t handle, ThrustExePolicy thrust_exec_policy, * @return Zero if successful. Otherwise non-zero. */ template -static int updateCentroids(handle_t handle, ThrustExePolicy thrust_exec_policy, - IndexType_ n, IndexType_ d, IndexType_ k, +static int updateCentroids(handle_t const& handle, + ThrustExePolicy thrust_exec_policy, IndexType_ n, + IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, const IndexType_* __restrict__ codes, const IndexType_* __restrict__ clusterSizes, @@ -601,8 +601,8 @@ static int updateCentroids(handle_t handle, ThrustExePolicy thrust_exec_policy, thrust::sequence(thrust_exec_policy, rows, rows + d * n); CUDA_CHECK_LAST(); thrust::transform(thrust_exec_policy, rows, rows + d * n, - make_constant_iterator(n), rows, - modulus()); + thrust::make_constant_iterator(n), rows, + thrust::modulus()); CUDA_CHECK_LAST(); thrust::gather(thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); @@ -612,8 +612,8 @@ static int updateCentroids(handle_t handle, ThrustExePolicy thrust_exec_policy, thrust::sequence(thrust_exec_policy, rows, rows + d * n); CUDA_CHECK_LAST(); thrust::transform(thrust_exec_policy, rows, rows + d * n, - make_constant_iterator(n), rows, - divides()); + thrust::make_constant_iterator(n), rows, + thrust::divides()); CUDA_CHECK_LAST(); // Sort and reduce to add observation vectors in same cluster @@ -680,9 +680,10 @@ namespace raft { * @return error flag. 
*/ template -int kmeans(handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, - IndexType_ d, IndexType_ k, ValueType_ tol, IndexType_ maxiter, - const ValueType_* __restrict__ obs, IndexType_* __restrict__ codes, +int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, + IndexType_ n, IndexType_ d, IndexType_ k, ValueType_ tol, + IndexType_ maxiter, const ValueType_* __restrict__ obs, + IndexType_* __restrict__ codes, IndexType_* __restrict__ clusterSizes, ValueType_* __restrict__ centroids, ValueType_* __restrict__ work, IndexType_* __restrict__ work_int, ValueType_* residual_host, @@ -843,11 +844,11 @@ int kmeans(handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, * @return error flag */ template -int kmeans(handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, - IndexType_ d, IndexType_ k, ValueType_ tol, IndexType_ maxiter, - const ValueType_* __restrict__ obs, IndexType_* __restrict__ codes, - ValueType_& residual, IndexType_& iters, - unsigned long long seed = 123456) { +int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, + IndexType_ n, IndexType_ d, IndexType_ k, ValueType_ tol, + IndexType_ maxiter, const ValueType_* __restrict__ obs, + IndexType_* __restrict__ codes, ValueType_& residual, + IndexType_& iters, unsigned long long seed = 123456) { using namespace matrix; // Check that parameters are valid diff --git a/cpp/include/raft/spectral/sm_utils.hpp b/cpp/include/raft/spectral/sm_utils.hpp index 25d6e2e358..3c1c1e4484 100644 --- a/cpp/include/raft/spectral/sm_utils.hpp +++ b/cpp/include/raft/spectral/sm_utils.hpp @@ -102,15 +102,15 @@ static __device__ __forceinline__ double shfl(double r, int lane, int mask = DEFAULT_MASK) { #if __CUDA_ARCH__ >= 300 #ifdef USE_CG - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_sync(mask, a.x, lane, bound); a.y = __shfl_sync(mask, a.y, lane, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #else - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl(a.x, lane, bound); a.y = __shfl(a.y, lane, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #endif #else return 0.0; @@ -122,15 +122,15 @@ static __device__ __forceinline__ long long shfl(long long r, int lane, int mask = DEFAULT_MASK) { #if __CUDA_ARCH__ >= 300 #ifdef USE_CG - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_sync(mask, a.x, lane, bound); a.y = __shfl_sync(mask, a.y, lane, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #else - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl(a.x, lane, bound); a.y = __shfl(a.y, lane, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #endif #else return 0.0; @@ -170,15 +170,15 @@ static __device__ __forceinline__ double shfl_down(double r, int offset, int mask = DEFAULT_MASK) { #if __CUDA_ARCH__ >= 300 #ifdef USE_CG - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_down_sync(mask, a.x, offset, bound); a.y = __shfl_down_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #else - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_down(a.x, offset, bound); a.y = __shfl_down(a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #endif #else return 0.0; @@ -190,15 +190,15 @@ static __device__ __forceinline__ long long shfl_down(long long 
r, int offset, int mask = DEFAULT_MASK) { #if __CUDA_ARCH__ >= 300 #ifdef USE_CG - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_down_sync(mask, a.x, offset, bound); a.y = __shfl_down_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #else - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_down(a.x, offset, bound); a.y = __shfl_down(a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #endif #else return 0.0; @@ -211,15 +211,15 @@ static __device__ __forceinline__ uint64_t shfl_down(uint64_t r, int offset, int mask = DEFAULT_MASK) { #if __CUDA_ARCH__ >= 300 #ifdef USE_CG - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_down_sync(mask, a.x, offset, bound); a.y = __shfl_down_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #else - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_down(mask, a.x, offset, bound); a.y = __shfl_down(mask, a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #endif #else return 0.0; @@ -258,15 +258,15 @@ static __device__ __forceinline__ double shfl_up(double r, int offset, int mask = DEFAULT_MASK) { #if __CUDA_ARCH__ >= 300 #ifdef USE_CG - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_up_sync(mask, a.x, offset, bound); a.y = __shfl_up_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #else - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_up(a.x, offset, bound); a.y = __shfl_up(a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #endif #else return 0.0; @@ -278,20 +278,144 @@ static __device__ __forceinline__ long long shfl_up(long long r, int offset, int mask = DEFAULT_MASK) { #if __CUDA_ARCH__ >= 300 #ifdef USE_CG - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_up_sync(mask, a.x, offset, bound); a.y = __shfl_up_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #else - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_up(a.x, offset, bound); a.y = __shfl_up(a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #endif #else return 0.0; #endif } + +static __inline__ __device__ double atomicFPAdd(double *addr, double val) { +// atomicAdd for double starts with sm_60 +#if __CUDA_ARCH__ >= 600 + return atomicAdd(addr, val); +#else + unsigned long long old = __double_as_longlong(addr[0]), assumed; + + do { + assumed = old; + old = atomicCAS((unsigned long long *)addr, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + + return old; +#endif +} + +// atomicAdd for float starts with sm_20 +static __inline__ __device__ float atomicFPAdd(float *addr, float val) { + return atomicAdd(addr, val); +} + +static __inline__ __device__ double atomicFPMin(double *addr, double val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong(min(val, assumed)))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ 
__device__ float atomicFPMin(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int(min(val, assumed)))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +static __inline__ __device__ double atomicFPMax(double *addr, double val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong(max(val, assumed)))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPMax(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int(max(val, assumed)))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +static __inline__ __device__ double atomicFPOr(double *addr, double val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong((bool)val | (bool)assumed))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPOr(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int((bool)val | (bool)assumed))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +static __inline__ __device__ double atomicFPLog(double *addr, double val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong(-log(exp(-val) + exp(-assumed))))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPLog(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = + int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int(-logf(expf(-val) + expf(-assumed))))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + } // namespace utils } // namespace raft diff --git a/cpp/test/spectral_solvers.cpp b/cpp/test/spectral_solvers.cpp index 068d61e9c1..a2c329e0c8 100644 --- a/cpp/test/spectral_solvers.cpp +++ b/cpp/test/spectral_solvers.cpp @@ -19,7 +19,6 @@ #include #include -//#include #include namespace raft { diff --git a/cpp/test/spectral_solvers.cu b/cpp/test/spectral_solvers.cu new file mode 100644 index 0000000000..410781369a --- /dev/null +++ b/cpp/test/spectral_solvers.cu @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +namespace raft { + +TEST(Raft, ClusterSolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + index_type maxiter{100}; + value_type tol{1.0e-10}; + index_type iter; + value_type* eigvecs{nullptr}; + unsigned long long seed{100110021003}; + + auto stream = h.get_stream(); + //thrust::cuda::par.on(stream); + + index_type n{100}; + index_type d{10}; + index_type k{5}; + index_type* codes{nullptr}; + value_type residual; + + kmeans(h, thrust::cuda::par.on(stream), n, d, k, tol, maxiter, eigvecs, codes, + residual, iter, seed); +} + +} // namespace raft From 6d9e392bf65693bc5388ea75eea0fa0aea380bbe Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 16 Jun 2020 09:50:29 -0400 Subject: [PATCH 115/189] fix unused location_prefix in error handling macro --- cpp/include/raft/cudart_utils.h | 1 - cpp/include/raft/error.hpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index f9d99987f1..2ca23ba539 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -71,7 +71,6 @@ struct cuda_error : public raft::exception { * deterministic execution for debugging asynchronous CUDA execution. It should * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an * asynchronous kernel launch. - * */ #ifndef NDEBUG #define CHECK_CUDA(stream) CUDA_TRY(cudaStreamSynchronize(stream)); diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 3801792fbc..bcca3f6f9c 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -118,7 +118,7 @@ struct logic_error : public raft::exception { #define SET_ERROR_MSG(msg, location_prefix, fmt, ...) 
\ do { \ char err_msg[2048]; /* NOLINT */ \ - std::snprintf(err_msg, sizeof(err_msg), "RAFT failure at %s", __FILE__); \ + std::snprintf(err_msg, sizeof(err_msg), location_prefix); \ msg += err_msg; \ std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, \ __LINE__); \ From 4ebc0af11244db29c0fbcb11e93d5ec5fba18e58 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 16 Jun 2020 09:51:13 -0400 Subject: [PATCH 116/189] remove NCCL_CHECK (replaced with NCCL_TRY) --- cpp/include/raft/comms/std_comms.hpp | 31 +++++++++++++--------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 7304f3bd4e..ddc5d8192a 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -76,9 +76,6 @@ struct nccl_error : public raft::exception { } \ } while (0); -/** FIXME: temporary alias for cuML compatibility */ -#define NCCL_CHECK(call) NCCL_TRY(call) - #define NCCL_CHECK_NO_THROW(call) \ do { \ ncclResult_t status = call; \ @@ -349,29 +346,29 @@ class std_comms : public comms_iface { void allreduce(const void *sendbuff, void *recvbuff, size_t count, datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_CHECK(ncclAllReduce(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count, + get_nccl_datatype(datatype), get_nccl_op(op), + nccl_comm_, stream)); } void bcast(void *buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const { - NCCL_CHECK(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), - root, nccl_comm_, stream)); + NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), + root, nccl_comm_, stream)); } void reduce(const void *sendbuff, void *recvbuff, size_t count, datatype_t datatype, op_t op, int root, cudaStream_t stream) const { - NCCL_CHECK(ncclReduce(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), get_nccl_op(op), root, - nccl_comm_, stream)); + NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, + get_nccl_datatype(datatype), get_nccl_op(op), root, + nccl_comm_, stream)); } void allgather(const void *sendbuff, void *recvbuff, size_t sendcount, datatype_t datatype, cudaStream_t stream) const { - NCCL_CHECK(ncclAllGather(sendbuff, recvbuff, sendcount, - get_nccl_datatype(datatype), nccl_comm_, stream)); + NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount, + get_nccl_datatype(datatype), nccl_comm_, stream)); } void allgatherv(const void *sendbuf, void *recvbuf, const size_t recvcounts[], @@ -381,7 +378,7 @@ class std_comms : public comms_iface { //Listing 1 on page 4. 
for (int root = 0; root < num_ranks_; ++root) { size_t dtype_size = get_datatype_size(datatype); - NCCL_CHECK(ncclBroadcast( + NCCL_TRY(ncclBroadcast( sendbuf, static_cast(recvbuf) + displs[root] * dtype_size, recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_, stream)); @@ -390,9 +387,9 @@ class std_comms : public comms_iface { void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount, datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_CHECK(ncclReduceScatter(sendbuff, recvbuff, recvcount, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount, + get_nccl_datatype(datatype), get_nccl_op(op), + nccl_comm_, stream)); } status_t sync_stream(cudaStream_t stream) const { From 851b401383d195537061d35d7d25d03922b96d61 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 16 Jun 2020 09:53:40 -0400 Subject: [PATCH 117/189] clang-format --- cpp/include/raft/error.hpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index bcca3f6f9c..0b001b01b2 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -115,16 +115,16 @@ struct logic_error : public raft::exception { if (!(check)) THROW(fmt, ##__VA_ARGS__); \ } while (0) -#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) \ - do { \ - char err_msg[2048]; /* NOLINT */ \ - std::snprintf(err_msg, sizeof(err_msg), location_prefix); \ - msg += err_msg; \ - std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, \ - __LINE__); \ - msg += err_msg; \ - std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__); \ - msg += err_msg; \ +#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) 
\ + do { \ + char err_msg[2048]; /* NOLINT */ \ + std::snprintf(err_msg, sizeof(err_msg), location_prefix); \ + msg += err_msg; \ + std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, \ + __LINE__); \ + msg += err_msg; \ + std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__); \ + msg += err_msg; \ } while (0) /** From 07a51a406263b85a248be73bebf30a92d4e727d5 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 16 Jun 2020 09:56:33 -0400 Subject: [PATCH 118/189] another clang format --- cpp/include/raft/comms/std_comms.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index ddc5d8192a..3528c148df 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -353,16 +353,15 @@ class std_comms : public comms_iface { void bcast(void *buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const { - NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), - root, nccl_comm_, stream)); + NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, + nccl_comm_, stream)); } void reduce(const void *sendbuff, void *recvbuff, size_t count, datatype_t datatype, op_t op, int root, cudaStream_t stream) const { - NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), get_nccl_op(op), root, - nccl_comm_, stream)); + NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype), + get_nccl_op(op), root, nccl_comm_, stream)); } void allgather(const void *sendbuff, void *recvbuff, size_t sendcount, From 610348943e5094bd4d1aafe91faa2f6e1b452db4 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 16 Jun 2020 14:34:30 -0500 Subject: [PATCH 119/189] Adding higher level solvers to test: cluster solver interface. --- cpp/include/raft/spectral/cluster_solvers.hpp | 8 ++++---- cpp/test/spectral_solvers.cu | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index cd0963506f..0b8999c6a0 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -38,11 +38,11 @@ struct cluster_solver_config_t { template struct kmeans_solver_t { - explicit kmeans_solver_t( - cluster_solver_config_t const& config) + explicit kmeans_solver_t(cluster_solver_config_t const& config) : config_(config) {} - template + template std::pair solve( handle_t handle, thrust_exe_policy_t t_exe_policy, size_type_t n_obs_vecs, size_type_t dim, value_type_t const* __restrict__ obs, @@ -58,6 +58,6 @@ struct kmeans_solver_t { auto const& get_config(void) const { return config_; } private: - cluster_solver_config_t config_; + cluster_solver_config_t config_; }; } // namespace raft diff --git a/cpp/test/spectral_solvers.cu b/cpp/test/spectral_solvers.cu index 410781369a..971950410c 100644 --- a/cpp/test/spectral_solvers.cu +++ b/cpp/test/spectral_solvers.cu @@ -19,7 +19,7 @@ #include #include -#include +#include namespace raft { From 7be516eb57a5c3e24035d8794a2d420416cdcc85 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 16 Jun 2020 14:50:16 -0500 Subject: [PATCH 120/189] Adding higher level solvers to test: eigen solver interface. 
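The solver object introduced here owns an eigen_solver_config_t and forwards to computeSmallestEigenvectors / computeLargestEigenvectors. A minimal sketch of the resulting call shape, assuming the <index, value> = <int, double> instantiation the updated test uses and a caller-supplied Laplacian plus device buffers:

#include <raft/handle.hpp>
#include <raft/spectral/eigen_solvers.hpp>

void eigen_solver_sketch(
  raft::handle_t const& h,
  raft::matrix::laplacian_matrix_t<int, double> const& lap,
  double* eigvals, double* eigvecs) {
  raft::eigen_solver_config_t<int, double> cfg{
    10 /*n_eigVecs*/, 100 /*maxIter*/, 10 /*restartIter*/, 1.0e-10 /*tol*/,
    true /*reorthogonalize*/, 100110021003ull /*seed*/};
  raft::lanczos_solver_t<int, double> solver{cfg};
  solver.solve_smallest_eigenvectors(h, lap, eigvals, eigvecs);
  solver.solve_largest_eigenvectors(h, lap, eigvals, eigvecs);
}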
--- cpp/include/raft/spectral/eigen_solvers.hpp | 12 +++++++----- cpp/test/spectral_solvers.cpp | 14 +++++++++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index 9c1258c432..21d2f83dbb 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -40,12 +40,13 @@ struct eigen_solver_config_t { template struct lanczos_solver_t { - explicit lanczos_solver_t( - eigen_solver_config_t const& config) + explicit lanczos_solver_t(eigen_solver_config_t const& config) : config_(config) {} index_type_t solve_smallest_eigenvectors( - handle_t handle, sparse_matrix_t const& A, + handle_t const& handle, + sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) const { index_type_t iters{}; @@ -57,7 +58,8 @@ struct lanczos_solver_t { } index_type_t solve_largest_eigenvectors( - handle_t handle, sparse_matrix_t const& A, + handle_t const& handle, + sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) const { index_type_t iters{}; @@ -71,6 +73,6 @@ struct lanczos_solver_t { auto const& get_config(void) const { return config_; } private: - eigen_solver_config_t config_; + eigen_solver_config_t config_; }; } // namespace raft diff --git a/cpp/test/spectral_solvers.cpp b/cpp/test/spectral_solvers.cpp index a2c329e0c8..549c2e66d7 100644 --- a/cpp/test/spectral_solvers.cpp +++ b/cpp/test/spectral_solvers.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include namespace raft { @@ -53,11 +53,15 @@ TEST(Raft, SpectralSolvers) { value_type* eigvals{nullptr}; value_type* eigvecs{nullptr}; unsigned long long seed{100110021003}; - computeSmallestEigenvectors(h, lm1, neigvs, maxiter, restart_iter, tol, - reorthog, iter, eigvals, eigvecs, seed); - computeLargestEigenvectors(h, lm1, neigvs, maxiter, restart_iter, tol, - reorthog, iter, eigvals, eigvecs, seed); + eigen_solver_config_t cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + + lanczos_solver_t eig_solver{cfg}; + + eig_solver.solve_smallest_eigenvectors(h, lm1, eigvals, eigvecs); + + eig_solver.solve_largest_eigenvectors(h, lm1, eigvals, eigvecs); } } // namespace raft From 9dee0b370bf7d3487507f0f6c5f6bc2af8e176a7 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 16 Jun 2020 15:02:32 -0500 Subject: [PATCH 121/189] Adding higher level solvers to test: cluster solver interface (fixed). 
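With the handle now passed by const reference, the cluster solver mirrors the eigen-solver shape: a config aggregate plus a solve() that returns a (residual, iterations) pair. A minimal sketch for the <int, double> instantiation used in the updated test; device buffers are assumed allocated, and the file compiled as CUDA:

#include <thrust/system/cuda/execution_policy.h>

#include <raft/handle.hpp>
#include <raft/spectral/cluster_solvers.hpp>

// d_obs: d-by-n observations (device); d_codes: n cluster ids (device, out)
void cluster_solver_sketch(raft::handle_t const& h, int n, int d,
                           double const* d_obs, int* d_codes) {
  raft::cluster_solver_config_t<int, double> cfg{
    5 /*n_clusters*/, 100 /*maxIter*/, 1.0e-10 /*tol*/, 100110021003ull};
  raft::kmeans_solver_t<int, double> solver{cfg};
  auto stream = h.get_stream();
  auto pair_ret =
    solver.solve(h, thrust::cuda::par.on(stream), n, d, d_obs, d_codes);
  // pair_ret.first: residual; pair_ret.second: iteration count
}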
--- cpp/include/raft/spectral/cluster_solvers.hpp | 5 +++-- cpp/test/spectral_solvers.cu | 11 ++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 0b8999c6a0..08579e22bd 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -44,8 +44,9 @@ struct kmeans_solver_t { template std::pair solve( - handle_t handle, thrust_exe_policy_t t_exe_policy, size_type_t n_obs_vecs, - size_type_t dim, value_type_t const* __restrict__ obs, + handle_t const& handle, thrust_exe_policy_t t_exe_policy, + size_type_t n_obs_vecs, size_type_t dim, + value_type_t const* __restrict__ obs, index_type_t* __restrict__ codes) const { value_type_t residual{}; index_type_t iters{}; diff --git a/cpp/test/spectral_solvers.cu b/cpp/test/spectral_solvers.cu index 971950410c..8c0d94b9e5 100644 --- a/cpp/test/spectral_solvers.cu +++ b/cpp/test/spectral_solvers.cu @@ -34,21 +34,22 @@ TEST(Raft, ClusterSolvers) { index_type maxiter{100}; value_type tol{1.0e-10}; - index_type iter; value_type* eigvecs{nullptr}; unsigned long long seed{100110021003}; auto stream = h.get_stream(); - //thrust::cuda::par.on(stream); index_type n{100}; index_type d{10}; index_type k{5}; index_type* codes{nullptr}; - value_type residual; - kmeans(h, thrust::cuda::par.on(stream), n, d, k, tol, maxiter, eigvecs, codes, - residual, iter, seed); + cluster_solver_config_t cfg{k, maxiter, tol, seed}; + + kmeans_solver_t cluster_solver{cfg}; + + auto pair_ret = + cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, eigvecs, codes); } } // namespace raft From 1d4c05daf19c696275a3cbfb9819f25c4eac31d7 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 16 Jun 2020 16:59:50 -0500 Subject: [PATCH 122/189] Higher level API. 
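partition() composes the pieces end to end: build the Laplacian from a GraphCSRView, run the eigen solver, transform the embedding (transform_eigen_matrix), then hand the eigenvector matrix to the cluster solver. A heart-beat-style sketch of the composed call; the GraphCSRView template order is assumed to be <vertex, edge, weight>, and real inputs would replace the caller-supplied buffers:

#include <thrust/system/cuda/execution_policy.h>

#include <raft/handle.hpp>
#include <raft/spectral/partition.hpp>

void partition_sketch(raft::handle_t const& h,
                      raft::GraphCSRView<int, int, double> const& graph,
                      int* clusters, double* eigvals, double* eigvecs) {
  raft::eigen_solver_config_t<int, double> eig_cfg{
    2 /*n_eigVecs*/, 100, 10, 1.0e-10, true, 1234};
  raft::lanczos_solver_t<int, double> eig_solver{eig_cfg};
  raft::cluster_solver_config_t<int, double> cluster_cfg{2 /*k*/, 100,
                                                         1.0e-10, 1234};
  raft::kmeans_solver_t<int, double> cluster_solver{cluster_cfg};
  auto stream = h.get_stream();
  // returns a tuple of solver statistics (eigensolver iterations among them)
  auto stats = raft::partition(h, thrust::cuda::par.on(stream), graph,
                               eig_solver, cluster_solver, clusters, eigvals,
                               eigvecs);
}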
--- cpp/CMakeLists.txt | 4 ++-- .../raft/spectral/modularity_maximization.hpp | 2 +- cpp/include/raft/spectral/partition.hpp | 12 +++++++----- cpp/include/raft/spectral/spectral_util.hpp | 11 ++++++++--- cpp/test/{spectral_solvers.cu => cluster_solvers.cu} | 0 cpp/test/{spectral_solvers.cpp => eigen_solvers.cu} | 3 +-- 6 files changed, 19 insertions(+), 13 deletions(-) rename cpp/test/{spectral_solvers.cu => cluster_solvers.cu} (100%) rename cpp/test/{spectral_solvers.cpp => eigen_solvers.cu} (96%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d96f936a3d..a9c0375de6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -207,8 +207,8 @@ if(BUILD_RAFT_TESTS) test/mr/host/buffer.cpp test/test.cpp test/spectral_matrix.cpp - test/spectral_solvers.cpp - test/spectral_solvers.cu) + test/eigen_solvers.cu + test/cluster_solvers.cu) target_include_directories(test_raft PRIVATE diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 1e387c0606..0480287936 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -191,7 +191,7 @@ void analyzeModularity(handle_t handle, ThrustExePolicy thrust_exec_policy, // Iterate through partitions for (i = 0; i < nClusters; ++i) { - if (!construct_indicator(handle, thrust_exec_policy, n, clustersize, + if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { WARNING("empty partition"); continue; diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 7c886ff282..98ca84fbb9 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -67,7 +67,7 @@ template , typename ClusterSolver = kmeans_solver_t> std::tuple partition( - handle_t handle, ThrustExePolicy thrust_exec_policy, + handle_t const &handle, ThrustExePolicy thrust_exec_policy, GraphCSRView const &graph, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { @@ -94,7 +94,7 @@ std::tuple partition( laplacian_matrix_t L{handle, graph}; auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_configs.n_eigVecs; + auto nEigVecs = eigen_config.n_eigVecs; // Compute smallest eigenvalues and eigenvectors std::get<0>(stats) = @@ -131,8 +131,10 @@ std::tuple partition( * @param cost On exit, partition cost function. * @return error flag. 
*/ -template -void analyzePartition(handle_t handle, ThrustExePolicy thrust_exec_policy, +template +void analyzePartition(handle_t const &handle, + ThrustExePolicy thrust_exec_policy, GraphCSRView const &graph, vertex_t nClusters, const vertex_t *__restrict__ clusters, weight_t &edgeCut, weight_t &cost) { @@ -163,7 +165,7 @@ void analyzePartition(handle_t handle, ThrustExePolicy thrust_exec_policy, // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - if (!construct_indicator(handle, thrust_exec_policy, n, clustersize, + if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { WARNING("empty partition"); continue; diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index a0c10284e3..7789247445 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -122,6 +122,9 @@ void transform_eigen_matrix(handle_t handle, ThrustExePolicy thrust_exec_policy, auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + const weight_t zero{0.0}; + const weight_t one{1.0}; + // Whiten eigenvector matrix for (auto i = 0; i < nEigVecs; ++i) { weight_t mean, std; @@ -190,9 +193,11 @@ struct equal_to_i_op { // Construct indicator vector for ith partition // -template +template bool construct_indicator(handle_t handle, ThrustExePolicy thrust_exec_policy, - edge_t n, weight_t& clustersize, weight_t& partStats, + edge_t index, edge_t n, weight_t& clustersize, + weight_t& partStats, vertex_t const* __restrict__ clusters, vector_t& part_i, vector_t& Bx, laplacian_matrix_t const& B) { @@ -206,7 +211,7 @@ bool construct_indicator(handle_t handle, ThrustExePolicy thrust_exec_policy, thrust::make_zip_iterator(thrust::make_tuple( thrust::device_pointer_cast(clusters + n), thrust::device_pointer_cast(part_i.raw() + n))), - equal_to_i_op(i)); + equal_to_i_op(index)); CUDA_CHECK_LAST(); // Compute size of ith partition diff --git a/cpp/test/spectral_solvers.cu b/cpp/test/cluster_solvers.cu similarity index 100% rename from cpp/test/spectral_solvers.cu rename to cpp/test/cluster_solvers.cu diff --git a/cpp/test/spectral_solvers.cpp b/cpp/test/eigen_solvers.cu similarity index 96% rename from cpp/test/spectral_solvers.cpp rename to cpp/test/eigen_solvers.cu index 549c2e66d7..506635ec46 100644 --- a/cpp/test/spectral_solvers.cpp +++ b/cpp/test/eigen_solvers.cu @@ -19,7 +19,7 @@ #include #include -#include +#include namespace raft { @@ -49,7 +49,6 @@ TEST(Raft, SpectralSolvers) { value_type tol{1.0e-10}; bool reorthog{true}; - index_type iter; value_type* eigvals{nullptr}; value_type* eigvecs{nullptr}; unsigned long long seed{100110021003}; From 4480dd33e391466fc1002cc26706f2e6dbe6c1fa Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 16 Jun 2020 17:47:22 -0500 Subject: [PATCH 123/189] Fixes in higher level API. --- cpp/include/raft/spectral/partition.hpp | 11 ++---- cpp/include/raft/spectral/spectral_util.hpp | 11 +++--- cpp/test/eigen_solvers.cu | 40 ++++++++++++++++++++- 3 files changed, 48 insertions(+), 14 deletions(-) diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 98ca84fbb9..2b6c54f49a 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -63,24 +63,19 @@ using namespace linalg; * @return statistics: number of eigensolver iterations, . 
*/ template , - typename ClusterSolver = kmeans_solver_t> + typename ThrustExePolicy, typename EigenSolver, + typename ClusterSolver> std::tuple partition( handle_t const &handle, ThrustExePolicy thrust_exec_policy, GraphCSRView const &graph, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { - const weight_t zero{0.0}; - const weight_t one{1.0}; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); std::tuple stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver - edge_t i; edge_t n = graph.number_of_vertices; // ------------------------------------------------------- @@ -98,7 +93,7 @@ std::tuple partition( // Compute smallest eigenvalues and eigenvectors std::get<0>(stats) = - eigen_solver.solve_smallest_eigenvectors(L, eigVals, eigVecs); + eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); // Whiten eigenvector matrix transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 7789247445..2cc38cbbf1 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -117,8 +117,9 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_* obs) { template -void transform_eigen_matrix(handle_t handle, ThrustExePolicy thrust_exec_policy, - edge_t n, vertex_t nEigVecs, weight_t* eigVecs) { +void transform_eigen_matrix(handle_t const& handle, + ThrustExePolicy thrust_exec_policy, edge_t n, + vertex_t nEigVecs, weight_t* eigVecs) { auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -195,9 +196,9 @@ struct equal_to_i_op { // template -bool construct_indicator(handle_t handle, ThrustExePolicy thrust_exec_policy, - edge_t index, edge_t n, weight_t& clustersize, - weight_t& partStats, +bool construct_indicator(handle_t const& handle, + ThrustExePolicy thrust_exec_policy, edge_t index, + edge_t n, weight_t& clustersize, weight_t& partStats, vertex_t const* __restrict__ clusters, vector_t& part_i, vector_t& Bx, laplacian_matrix_t const& B) { diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index 506635ec46..604afdc76a 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -23,7 +23,7 @@ namespace raft { -TEST(Raft, SpectralSolvers) { +TEST(Raft, EigenSolvers) { using namespace matrix; using index_type = int; using value_type = double; @@ -63,4 +63,42 @@ TEST(Raft, SpectralSolvers) { eig_solver.solve_largest_eigenvectors(h, lm1, eigvals, eigvecs); } +TEST(Raft, SpectralSolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + index_type* clusters{nullptr}; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + unsigned long long seed{100110021003}; + + eigen_solver_config_t eig_cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + lanczos_solver_t eig_solver{eig_cfg}; + + index_type k{5}; + + cluster_solver_config_t clust_cfg{k, maxiter, tol, + seed}; + kmeans_solver_t cluster_solver{clust_cfg}; + + auto stream = h.get_stream(); + GraphCSRView empty_graph; + auto 
t_exe_p = thrust::cuda::par.on(stream); + auto tuple_ret = + spectral::partition(h, t_exe_p, empty_graph, eig_solver, cluster_solver, + clusters, eigvals, eigvecs); +} + } // namespace raft From 282c4c9f034224961b7603da29c3800842ebce99 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 16 Jun 2020 19:46:09 -0500 Subject: [PATCH 124/189] Added / Fixed higher level modularity API and test. --- cpp/include/raft/spectral/cluster_solvers.hpp | 2 + cpp/include/raft/spectral/eigen_solvers.hpp | 4 ++ cpp/include/raft/spectral/error_temp.hpp | 10 +++- .../raft/spectral/modularity_maximization.hpp | 42 +++++--------- cpp/include/raft/spectral/partition.hpp | 6 ++ cpp/test/cluster_solvers.cu | 57 +++++++++++++++++-- cpp/test/eigen_solvers.cu | 23 ++++++-- 7 files changed, 104 insertions(+), 40 deletions(-) diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 08579e22bd..b19237d1a8 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -48,6 +48,8 @@ struct kmeans_solver_t { size_type_t n_obs_vecs, size_type_t dim, value_type_t const* __restrict__ obs, index_type_t* __restrict__ codes) const { + RAFT_EXPECT(obs != nullptr, "Null obs buffer."); + RAFT_EXPECT(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; RAFT_TRY(kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index 21d2f83dbb..97114661c5 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -49,6 +49,8 @@ struct lanczos_solver_t { sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) const { + RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; RAFT_TRY(computeSmallestEigenvectors( handle, A, config_.n_eigVecs, config_.maxIter, config_.restartIter, @@ -62,6 +64,8 @@ struct lanczos_solver_t { sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) const { + RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; RAFT_TRY(computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, config_.restartIter, diff --git a/cpp/include/raft/spectral/error_temp.hpp b/cpp/include/raft/spectral/error_temp.hpp index 82beb75640..3fa5a38f5f 100644 --- a/cpp/include/raft/spectral/error_temp.hpp +++ b/cpp/include/raft/spectral/error_temp.hpp @@ -1,10 +1,15 @@ #pragma once +#include +#include + #define STRINGIFY_DETAIL(x) #x #define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) - -#define RAFT_EXPECT(cond, reason) +///#define RAFT_EXPECT(cond, reason) +inline void RAFT_EXPECT(bool cond, std::string const& reason) { + if (!cond) throw std::runtime_error(reason.c_str()); +} #define RAFT_TRY(error_expression) @@ -28,4 +33,3 @@ #else // DEBUG #define WARNING(message) #endif - diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 0480287936..e406772666 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -83,16 +83,16 @@ using namespace linalg; * @return error flag. 
*/ template , - typename ClusterSolver = kmeans_solver_t> + typename ThrustExePolicy, typename EigenSolver, + typename ClusterSolver> std::tuple modularity_maximization( - handle_t handle, ThrustExePolicy thrust_exec_policy, + handle_t const &handle, ThrustExePolicy thrust_exec_policy, GraphCSRView const &graph, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { - const weight_t zero{0.0}; - const weight_t one{1.0}; + RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -109,11 +109,11 @@ std::tuple modularity_maximization( modularity_matrix_t B{handle, graph}; auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_configs.n_eigVecs; + auto nEigVecs = eigen_config.n_eigVecs; // Compute eigenvectors corresponding to largest eigenvalues std::get<0>(stats) = - eigen_solver.solve_largest_eigenvectors(B, eigVals, eigVecs); + eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); // Whiten eigenvector matrix transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); @@ -136,24 +136,6 @@ std::tuple modularity_maximization( // Analysis of graph partition // ========================================================= -namespace { -/// Functor to generate indicator vectors -/** For use in Thrust transform - */ -template -struct equal_to_i_op { - const IndexType_ i; - - public: - equal_to_i_op(IndexType_ _i) : i(_i) {} - template - __host__ __device__ void operator()(Tuple_ t) { - thrust::get<1>(t) = - (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0; - } -}; -} // namespace - /// Compute modularity /** This function determines the modularity based on a graph and cluster assignments * @param G Weighted graph in CSR format @@ -161,12 +143,16 @@ struct equal_to_i_op { * @param clusters (Input, device memory, n entries) Cluster assignments. 
* @param modularity On exit, modularity */ -template -void analyzeModularity(handle_t handle, ThrustExePolicy thrust_exec_policy, +template +void analyzeModularity(handle_t const &handle, + ThrustExePolicy thrust_exec_policy, GraphCSRView const &graph, vertex_t nClusters, vertex_t const *__restrict__ clusters, weight_t &modularity) { + RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); + edge_t i; edge_t n = graph.number_of_vertices; weight_t partModularity, clustersize; diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 2b6c54f49a..1b768ca4c4 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -70,6 +70,10 @@ std::tuple partition( GraphCSRView const &graph, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { + RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); + auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -133,6 +137,8 @@ void analyzePartition(handle_t const &handle, GraphCSRView const &graph, vertex_t nClusters, const vertex_t *__restrict__ clusters, weight_t &edgeCut, weight_t &cost) { + RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); + edge_t i; edge_t n = graph.number_of_vertices; diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index 8c0d94b9e5..d3d6a04312 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -19,7 +19,7 @@ #include #include -#include +#include namespace raft { @@ -34,7 +34,6 @@ TEST(Raft, ClusterSolvers) { index_type maxiter{100}; value_type tol{1.0e-10}; - value_type* eigvecs{nullptr}; unsigned long long seed{100110021003}; auto stream = h.get_stream(); @@ -42,14 +41,64 @@ TEST(Raft, ClusterSolvers) { index_type n{100}; index_type d{10}; index_type k{5}; + + //nullptr expected to trigger exceptions: + // + value_type* eigvecs{nullptr}; index_type* codes{nullptr}; cluster_solver_config_t cfg{k, maxiter, tol, seed}; kmeans_solver_t cluster_solver{cfg}; - auto pair_ret = - cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, eigvecs, codes); + EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, + eigvecs, codes)); +} + +TEST(Raft, ModularitySolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + //nullptr expected to trigger exceptions: + // + index_type* clusters{nullptr}; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + + unsigned long long seed{100110021003}; + + eigen_solver_config_t eig_cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + lanczos_solver_t eig_solver{eig_cfg}; + + index_type k{5}; + + cluster_solver_config_t clust_cfg{k, maxiter, tol, + seed}; + kmeans_solver_t cluster_solver{clust_cfg}; + + auto stream = h.get_stream(); + GraphCSRView empty_graph; + auto t_exe_p = thrust::cuda::par.on(stream); + + EXPECT_ANY_THROW(spectral::modularity_maximization( + h, t_exe_p, empty_graph, eig_solver, cluster_solver, clusters, eigvals, + eigvecs)); + + value_type modularity{0}; + EXPECT_ANY_THROW(spectral::analyzeModularity(h, 
t_exe_p, empty_graph, k, + clusters, modularity)); } } // namespace raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index 604afdc76a..d31aba896e 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -49,6 +49,8 @@ TEST(Raft, EigenSolvers) { value_type tol{1.0e-10}; bool reorthog{true}; + //nullptr expected to trigger exceptions: + // value_type* eigvals{nullptr}; value_type* eigvecs{nullptr}; unsigned long long seed{100110021003}; @@ -58,9 +60,11 @@ TEST(Raft, EigenSolvers) { lanczos_solver_t eig_solver{cfg}; - eig_solver.solve_smallest_eigenvectors(h, lm1, eigvals, eigvecs); + EXPECT_ANY_THROW( + eig_solver.solve_smallest_eigenvectors(h, lm1, eigvals, eigvecs)); - eig_solver.solve_largest_eigenvectors(h, lm1, eigvals, eigvecs); + EXPECT_ANY_THROW( + eig_solver.solve_largest_eigenvectors(h, lm1, eigvals, eigvecs)); } TEST(Raft, SpectralSolvers) { @@ -78,9 +82,12 @@ TEST(Raft, SpectralSolvers) { value_type tol{1.0e-10}; bool reorthog{true}; + //nullptr expected to trigger exceptions: + // index_type* clusters{nullptr}; value_type* eigvals{nullptr}; value_type* eigvecs{nullptr}; + unsigned long long seed{100110021003}; eigen_solver_config_t eig_cfg{ @@ -96,9 +103,15 @@ TEST(Raft, SpectralSolvers) { auto stream = h.get_stream(); GraphCSRView empty_graph; auto t_exe_p = thrust::cuda::par.on(stream); - auto tuple_ret = - spectral::partition(h, t_exe_p, empty_graph, eig_solver, cluster_solver, - clusters, eigvals, eigvecs); + + EXPECT_ANY_THROW(spectral::partition(h, t_exe_p, empty_graph, eig_solver, + cluster_solver, clusters, eigvals, + eigvecs)); + + value_type edgeCut{0}; + value_type cost{0}; + EXPECT_ANY_THROW(spectral::analyzePartition(h, t_exe_p, empty_graph, k, + clusters, edgeCut, cost)); } } // namespace raft From b67755cdeb92fda36797121c2d9efd0e6c3ec857 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Tue, 16 Jun 2020 20:06:53 -0700 Subject: [PATCH 125/189] TEST test for get_internal_streams --- cpp/test/handle.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp index 2c5280199d..5f6f3ceece 100644 --- a/cpp/test/handle.cpp +++ b/cpp/test/handle.cpp @@ -43,4 +43,10 @@ TEST(Raft, Handle) { CUDA_CHECK(cudaStreamDestroy(stream)); } +TEST(Raft, GetInternalStreams) { + handle_t h(4); + auto streams = h.get_internal_streams(); + ASSERT_EQ(4U, streams.size()); +} + } // namespace raft From 1367c11d22120fa895247df2a771560c78ce4f57 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Tue, 16 Jun 2020 20:07:12 -0700 Subject: [PATCH 126/189] FIX bug in handle_t::get_internal_streams --- cpp/include/raft/handle.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 01d6d7a83f..9af2e916bd 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -128,7 +128,7 @@ class handle_t { cudaStream_t get_internal_stream(int sid) const { return streams_[sid]; } int get_num_internal_streams() const { return num_streams_; } std::vector get_internal_streams() const { - std::vector int_streams_vec(num_streams_); + std::vector int_streams_vec; for (auto s : streams_) { int_streams_vec.push_back(s); } From dbaec944b4bb07fd911d0d7a7f4a2d3b109aa9dc Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Tue, 16 Jun 2020 20:53:36 -0700 Subject: [PATCH 127/189] DOC update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d447381e9..09164d0706 100644 --- a/CHANGELOG.md +++ 
b/CHANGELOG.md @@ -10,6 +10,7 @@ ## Bug Fixes - PR #17: Make destructor inline to avoid redeclaration error +- PR #25: Fix bug in handle_t::get_internal_streams # RAFT 0.14.0 (Date TBD) From 6211f8d778150dccd2ae1cdaf4303b2be5a8e837 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 17 Jun 2020 17:53:36 -0500 Subject: [PATCH 128/189] Addressed CUDA 11 API changes in cusparse (csrmv, csrmm). --- cpp/include/raft/sparse/cusparse_wrappers.h | 57 +++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 865f93843d..56c3fc8dbc 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -176,6 +176,33 @@ inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, cscColPtrB, cscRowIndB, beta, C, ldc); } /** @} */ + +#if __CUDACC_VER_MAJOR__ > 10 +/** + * @defgroup Csrmv cusparse SpMV operations + * @{ + */ +inline cusparseStatus_t cusparsespmv_buffersize( + cusparseHandle_t handle, cusparseOperation_t opA, const void* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const void* beta, const cusparseDnVecDescr_t vecY, cudaDataType computeType, + cusparseSpMVAlg_t alg, size_t* bufferSize, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, + computeType, alg, bufferSize); +} + +inline cusparseStatus_t cusparsespmv( + cusparseHandle_t handle, cusparseOperation_t opA, const void* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const void* beta, const cusparseDnVecDescr_t vecY, cudaDataType computeType, + cusparseSpMVAlg_t alg, void* externalBuffer, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, computeType, + alg, externalBuffer); +} +/** @} */ +#else /** * @defgroup Csrmv cusparse csrmv operations * @{ @@ -207,7 +234,36 @@ inline cusparseStatus_t cusparsecsrmv( csrRowPtr, csrColInd, x, beta, y); } /** @} */ +#endif + +#if __CUDACC_VER_MAJOR__ > 10 +/** + * @defgroup Csrmm cusparse csrmm operations + * @{ + */ +inline cusparseStatus_t cusparsespmm_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const void* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const void* beta, cusparseDnMatDescr_t matC, + cudaDataType computeType, cusparseSpMMAlg_t alg, size_t* bufferSize, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, + matC, computeType, alg, bufferSize); +} +inline cusparseStatus_t cusparsespmm( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const void* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const void* beta, cusparseDnMatDescr_t matC, + cudaDataType computeType, cusparseSpMMAlg_t alg, void* externalBuffer, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, + computeType, alg, externalBuffer); +} +/** @} */ +#else /** * @defgroup Csrmm cusparse csrmm operations * @{ @@ -241,6 +297,7 @@ inline cusparseStatus_t cusparsecsrmm( csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } /** @} */ +#endif /** * @defgroup csr2coo cusparse CSR to COO converter 
methods From 4097a72536e1b75b89c36467f9125b26727ff9ab Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 17 Jun 2020 19:19:56 -0500 Subject: [PATCH 129/189] Fixes in CUDA 11 cusparse interface. --- cpp/include/raft/sparse/cusparse_wrappers.h | 52 +++++++++++++++++---- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 56c3fc8dbc..97a7823d86 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -192,13 +192,31 @@ inline cusparseStatus_t cusparsespmv_buffersize( computeType, alg, bufferSize); } +template +cusparseStatus_t cusparsespmv(cusparseHandle_t handle, cusparseOperation_t opA, + const T* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, const T* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, T* externalBuffer, + cudaStream_t stream); +template <> inline cusparseStatus_t cusparsespmv( - cusparseHandle_t handle, cusparseOperation_t opA, const void* alpha, + cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const void* beta, const cusparseDnVecDescr_t vecY, cudaDataType computeType, - cusparseSpMVAlg_t alg, void* externalBuffer, cudaStream_t stream) { + const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + float* externalBuffer, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, + alg, externalBuffer); +} +template <> +inline cusparseStatus_t cusparsespmv( + cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + double* externalBuffer, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, computeType, + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, externalBuffer); } /** @} */ @@ -251,16 +269,32 @@ inline cusparseStatus_t cusparsespmm_bufferSize( return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, alg, bufferSize); } - +template inline cusparseStatus_t cusparsespmm( cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const void* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const void* beta, cusparseDnMatDescr_t matC, - cudaDataType computeType, cusparseSpMMAlg_t alg, void* externalBuffer, + const T* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, T* externalBuffer, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsespmm( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const float* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, float* externalBuffer, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, + CUDA_R_32F, alg, externalBuffer); +} +template <> +inline cusparseStatus_t cusparsespmm( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, 
+ const double* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const double* beta, + cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, double* externalBuffer, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, - computeType, alg, externalBuffer); + CUDA_R_64F, alg, externalBuffer); } /** @} */ #else From c4130c99b09a268ef5181e3b08ffa017e123a8c4 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 18 Jun 2020 10:51:03 -0500 Subject: [PATCH 130/189] Added raft handle to sparse_matrix cnstr. necessary for mv() memf calls of cusparse and allocation. --- cpp/include/raft/spectral/matrix_wrappers.hpp | 38 +++++++++++-------- .../raft/spectral/modularity_maximization.hpp | 4 +- cpp/include/raft/spectral/partition.hpp | 4 +- cpp/test/eigen_solvers.cu | 2 +- cpp/test/spectral_matrix.cpp | 4 +- 5 files changed, 30 insertions(+), 22 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 74dbd38be6..232d6a5cc6 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -86,18 +86,21 @@ class vector_t { template struct sparse_matrix_t { - sparse_matrix_t(index_type const* row_offsets, index_type const* col_indices, - value_type const* values, index_type const nrows, - index_type const nnz) - : row_offsets_(row_offsets), + sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const nnz) + : handle_(raft_handle), + row_offsets_(row_offsets), col_indices_(col_indices), values_(values), nrows_(nrows), nnz_(nnz) {} sparse_matrix_t( + handle_t const& raft_handle, GraphCSRView const& csr_view) - : row_offsets_(csr_view.offsets), + : handle_(raft_handle), + row_offsets_(csr_view.offsets), col_indices_(csr_view.indices), values_(csr_view.edge_data), nrows_(csr_view.number_of_vertices), @@ -109,7 +112,8 @@ struct sparse_matrix_t { // y = alpha*A*x + beta*y // virtual void mv(value_type alpha, value_type const* __restrict__ x, - value_type beta, value_type* __restrict__ y) const { + value_type beta, value_type* __restrict__ y, + bool transpose = false, bool symmetric = false) const { //TODO: // //Cusparse::set_pointer_mode_host(); @@ -118,10 +122,10 @@ struct sparse_matrix_t { //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate + handle_t const& handle_; index_type const* row_offsets_; index_type const* col_indices_; - value_type const* - values_; // TODO: const-ness of this is debatable; cusparse primitives may not accept it... 
+ value_type const* values_; index_type const nrows_; index_type const nnz_; }; @@ -131,8 +135,8 @@ struct laplacian_matrix_t : sparse_matrix_t { laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) - : sparse_matrix_t(row_offsets, col_indices, values, - nrows, nnz), + : sparse_matrix_t(raft_handle, row_offsets, + col_indices, values, nrows, nnz), diagonal_(raft_handle, nrows) { auto* v = diagonal_.raw(); //TODO: more work, here: @@ -145,7 +149,7 @@ struct laplacian_matrix_t : sparse_matrix_t { laplacian_matrix_t( handle_t const& raft_handle, GraphCSRView const& csr_view) - : sparse_matrix_t(csr_view), + : sparse_matrix_t(raft_handle, csr_view), diagonal_(raft_handle, csr_view.number_of_vertices) { //TODO: more work, here: // @@ -157,7 +161,8 @@ struct laplacian_matrix_t : sparse_matrix_t { // y = alpha*A*x + beta*y // void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, - value_type* __restrict__ y) const override { + value_type* __restrict__ y, bool transpose = false, + bool symmetric = false) const override { //TODO: call cusparse::csrmv ... and more: // // if (beta == 0) @@ -213,7 +218,8 @@ struct modularity_matrix_t : laplacian_matrix_t { // y = alpha*A*x + beta*y // void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, - value_type* __restrict__ y) const override { + value_type* __restrict__ y, bool transpose = false, + bool symmetric = false) const override { //TODO: call cusparse::csrmv ... and more: // // // y = A*x @@ -225,9 +231,11 @@ struct modularity_matrix_t : laplacian_matrix_t { // Cublas::axpy(this->n, -(dot_res / this->edge_sum), D.raw(), 1, y, 1); } - value_type get_diag_nrm1(void) const { return diag_nrm1_; } + value_type get_diag_nrm1(void) const { + return diag_nrm1_; // TODO: replace w/ diag_.nrm1() + } - value_type diag_nrm1_; + value_type diag_nrm1_; // TODO: remove }; } // namespace matrix diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index e406772666..6ab1b16659 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -105,7 +105,7 @@ std::tuple modularity_maximization( // Compute eigenvectors of Modularity Matrix // Initialize Modularity Matrix - sparse_matrix_t A{graph}; + sparse_matrix_t A{handle, graph}; modularity_matrix_t B{handle, graph}; auto eigen_config = eigen_solver.get_config(); @@ -169,7 +169,7 @@ void analyzeModularity(handle_t const &handle, cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity - sparse_matrix_t A{graph}; + sparse_matrix_t A{handle, graph}; modularity_matrix_t B{handle, graph}; // Initialize output diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 1b768ca4c4..6cc2744e96 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -89,7 +89,7 @@ std::tuple partition( // Compute eigenvectors of Laplacian // Initialize Laplacian - sparse_matrix_t A{graph}; + sparse_matrix_t A{handle, graph}; laplacian_matrix_t L{handle, graph}; auto eigen_config = eigen_solver.get_config(); @@ -156,7 +156,7 @@ void analyzePartition(handle_t const &handle, cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Laplacian - sparse_matrix_t A{graph}; + sparse_matrix_t A{handle, 
graph}; laplacian_matrix_t L{handle, graph}; // Initialize output diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index d31aba896e..87bf74bdde 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -37,7 +37,7 @@ TEST(Raft, EigenSolvers) { value_type* vs{nullptr}; index_type nnz = 0; index_type nrows = 0; - sparse_matrix_t sm1{ro, ci, vs, nrows, nnz}; + sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; ASSERT_EQ(nullptr, sm1.row_offsets_); laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; diff --git a/cpp/test/spectral_matrix.cpp b/cpp/test/spectral_matrix.cpp index 46d753550f..30346d5da5 100644 --- a/cpp/test/spectral_matrix.cpp +++ b/cpp/test/spectral_matrix.cpp @@ -42,8 +42,8 @@ TEST(Raft, SpectralMatrices) { value_type* vs{nullptr}; index_type nnz = 0; index_type nrows = 0; - sparse_matrix_t sm1{ro, ci, vs, nrows, nnz}; - sparse_matrix_t sm2{empty_graph}; + sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; + sparse_matrix_t sm2{h, empty_graph}; ASSERT_EQ(nullptr, sm1.row_offsets_); ASSERT_EQ(nullptr, sm2.row_offsets_); From 87c49845f692d167e333cb481358d7770e4d0edd Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 18 Jun 2020 12:02:21 -0500 Subject: [PATCH 131/189] Sparse MV forking: pre-CUDA 11 step. --- cpp/include/raft/spectral/matrix_wrappers.hpp | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 232d6a5cc6..8f9eab64bc 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -114,10 +114,32 @@ struct sparse_matrix_t { virtual void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, value_type* __restrict__ y, bool transpose = false, bool symmetric = false) const { - //TODO: - // - //Cusparse::set_pointer_mode_host(); - //cusparsecsrmv(...); + + using namespace sparse; + + auto cusparse_h = handle_.get_cusparse_handle(); + auto stream = handle_.get_stream(); +#if __CUDACC_VER_MAJOR__ > 10 +#else + CUSPARSE_CHECK( + cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); + + cusparseOperation_t trans = + transpose ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;//non-transpose + cusparseMatDescr_t descr = 0; + CUSPARSE_CHECK(cusparseCreateMatDescr(&descr)); + if (symmetric) { + CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_SYMMETRIC)); + } else { + CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + } + CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, nrows_, nnz_, + &alpha, descr, values_, + row_offsets_, col_indices_, + x, &beta, y, stream)); + CUSPARSE_CHECK(cusparseDestroyMatDescr(descr)); +#endif } //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate From c5f5ab58e712aeb9530586fda80399ef2e17e3f8 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 18 Jun 2020 14:23:44 -0400 Subject: [PATCH 132/189] add missing parenthesis --- cpp/include/raft/error.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 0b001b01b2..4955556eed 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -137,7 +137,7 @@ struct logic_error : public raft::exception { */ #define RAFT_EXPECTS(cond, fmt, ...) 
\ do { \ - if (!cond) { \ + if (!(cond)) { \ std::string msg{}; \ SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \ throw raft::logic_error(msg); \ From 105f6268f9c41c12f30a8c7483caafaa53b0b79b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 18 Jun 2020 14:27:46 -0400 Subject: [PATCH 133/189] add change log --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09164d0706..58477e868f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ ## Bug Fixes - PR #17: Make destructor inline to avoid redeclaration error - PR #25: Fix bug in handle_t::get_internal_streams +- PR #26: Fix bug in RAFT_EXPECTS (add parentheses surrounding cond) # RAFT 0.14.0 (Date TBD) From 2225214fcbd2efffa13d80403cb16a4428998770 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 18 Jun 2020 17:31:55 -0500 Subject: [PATCH 134/189] Added CUDA 11 path for SpMV calls. Step 2. --- cpp/include/raft/sparse/cusparse_wrappers.h | 123 ++++++++++++++++-- cpp/include/raft/spectral/matrix_wrappers.hpp | 55 ++++++-- 2 files changed, 161 insertions(+), 17 deletions(-) diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 97a7823d86..347cd7fa59 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -178,18 +178,108 @@ inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, /** @} */ #if __CUDACC_VER_MAJOR__ > 10 +/** + * @defgroup cusparse Create CSR operations + * @{ + */ +template +cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, int64_t nnz, + IndexT* csrRowOffsets, IndexT* csrColInd, + ValueT* csrValues); +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int32_t* csrRowOffsets, + int32_t* csrColInd, + float* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F); +} +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int32_t* csrRowOffsets, + int32_t* csrColInd, + double* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); +} +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int64_t* csrRowOffsets, + int64_t* csrColInd, + float* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F); +} +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int64_t* csrRowOffsets, + int64_t* csrColInd, + double* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); +} +/** @} */ +/** + * @defgroup cusparse CreateDnVec operations + * @{ + */ +template +cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, + int64_t size, T* values); +template <> +inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* 
dnVecDescr, + int64_t size, float* values) { + return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_32F); +} +template <> +inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, + int64_t size, double* values) { + return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_64F); +} +/** @} */ + /** * @defgroup Csrmv cusparse SpMV operations * @{ */ +template +cusparseStatus_t cusparsespmv_buffersize( + cusparseHandle_t handle, cusparseOperation_t opA, const T* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const T* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + size_t* bufferSize, cudaStream_t stream); +template <> inline cusparseStatus_t cusparsespmv_buffersize( - cusparseHandle_t handle, cusparseOperation_t opA, const void* alpha, + cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const void* beta, const cusparseDnVecDescr_t vecY, cudaDataType computeType, - cusparseSpMVAlg_t alg, size_t* bufferSize, cudaStream_t stream) { + const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + size_t* bufferSize, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, - computeType, alg, bufferSize); + CUDA_R_32F, alg, bufferSize); +} +template <> +inline cusparseStatus_t cusparsespmv_buffersize( + cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + size_t* bufferSize, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, + CUDA_R_64F, alg, bufferSize); } template @@ -259,15 +349,32 @@ inline cusparseStatus_t cusparsecsrmv( * @defgroup Csrmm cusparse csrmm operations * @{ */ +template +cusparseStatus_t cusparsespmm_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const T* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsespmm_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const float* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, + matC, CUDA_R_32F, alg, bufferSize); +} +template <> inline cusparseStatus_t cusparsespmm_bufferSize( cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const void* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const void* beta, cusparseDnMatDescr_t matC, - cudaDataType computeType, cusparseSpMMAlg_t alg, size_t* bufferSize, + const double* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const double* beta, + cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, - matC, 
computeType, alg, bufferSize); + matC, CUDA_R_64F, alg, bufferSize); } template inline cusparseStatus_t cusparsespmm( diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 8f9eab64bc..e7bb8c9d52 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -114,18 +114,56 @@ struct sparse_matrix_t { virtual void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, value_type* __restrict__ y, bool transpose = false, bool symmetric = false) const { - using namespace sparse; - + auto cusparse_h = handle_.get_cusparse_handle(); auto stream = handle_.get_stream(); + + cusparseOperation_t trans = + transpose ? CUSPARSE_OPERATION_TRANSPOSE : // transpose + CUSPARSE_OPERATION_NON_TRANSPOSE; //non-transpose + #if __CUDACC_VER_MAJOR__ > 10 + + //create descriptors: + // + cusparseSpMatDescr_t matA; + CUSPARSE_CHECK(cusparsecreatecsr(&matA, nrows_, nrows_, nnz_, row_offsets_, + col_indices_, values_)); + + cusparseDnVecDescr_t vecX; + CUSPARSE_CHECK(cusparsecreatednvec(&vecX, nrows_, + x)); // TODO: const-cast down?! + + cusparseDnVecDescr_t vecY; + CUSPARSE_CHECK(cusparsecreatednvec(&vecY, nrows_, y)); + + //get (scratch) external device buffer size: + // + size_t bufferSize; + CUSPARSE_CHECK(cusparsespmv_buffersize(cusparse_h, opA, &alpha, matA, vecX, + &beta, vecY, alg, &bufferSize, + stream)); + + //allocate external buffer: + // + vector_t external_buffer(handle_, bufferSize); + + //finally perform SpMV: + // + CUSPARSE_CHECK(cusparsespmv(cusparse_h, trans, &alpha, matA, vecX, &beta, + vecY, CUSPARSE_CSRMV_ALG1, + external_buffer.raw(), stream)); + + //free descriptors: + //(TODO: maybe wrap them in a RAII struct?) + // + CUSPARSE_CHECK(cusparseDestroyDnVec(vecY)); + CUSPARSE_CHECK(cusparseDestroyDnVec(vecX)); + CUSPARSE_CHECK(cusparseDestroySpMat(matA)); #else CUSPARSE_CHECK( - cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); - - cusparseOperation_t trans = - transpose ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;//non-transpose + cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); cusparseMatDescr_t descr = 0; CUSPARSE_CHECK(cusparseCreateMatDescr(&descr)); if (symmetric) { @@ -135,9 +173,8 @@ struct sparse_matrix_t { } CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, nrows_, nnz_, - &alpha, descr, values_, - row_offsets_, col_indices_, - x, &beta, y, stream)); + &alpha, descr, values_, row_offsets_, + col_indices_, x, &beta, y, stream)); CUSPARSE_CHECK(cusparseDestroyMatDescr(descr)); #endif } From 7415edaf9bc27772b11d429fd48a1031a448eb78 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 18 Jun 2020 17:44:53 -0500 Subject: [PATCH 135/189] Removed constness of some parameters in SpMV, because CUDA 11 requires it. 
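Context for the signature change: CUDA 11's generic API builds descriptors via cusparseCreateDnVec(), whose values parameter is a non-const void* even when SpMV only reads the vector. A minimal sketch of the constraint (error handling elided):

    #include <cusparse.h>

    // With `double const* x` this would require a const_cast, because
    // cusparseCreateDnVec() accepts `void* values` (non-const):
    void make_x_descriptor(cusparseDnVecDescr_t* vecX, int64_t n, double* x) {
      cusparseCreateDnVec(vecX, n, x, CUDA_R_64F);
    }

Propagating the non-const `x` up through mv() keeps the wrappers free of const_casts at every call site.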
--- cpp/include/raft/spectral/matrix_wrappers.hpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index e7bb8c9d52..0c55ef7294 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -110,10 +110,13 @@ struct sparse_matrix_t { default; // virtual because used as base for following matrix types // y = alpha*A*x + beta*y + //(Note: removed const-ness of x, because CUDA 11 SpMV + // descriptor creation works with non-const, and const-casting + // down is dangerous) // - virtual void mv(value_type alpha, value_type const* __restrict__ x, - value_type beta, value_type* __restrict__ y, - bool transpose = false, bool symmetric = false) const { + virtual void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + value_type* __restrict__ y, bool transpose = false, + bool symmetric = false) const { using namespace sparse; auto cusparse_h = handle_.get_cusparse_handle(); @@ -132,8 +135,7 @@ struct sparse_matrix_t { col_indices_, values_)); cusparseDnVecDescr_t vecX; - CUSPARSE_CHECK(cusparsecreatednvec(&vecX, nrows_, - x)); // TODO: const-cast down?! + CUSPARSE_CHECK(cusparsecreatednvec(&vecX, nrows_, x)); cusparseDnVecDescr_t vecY; CUSPARSE_CHECK(cusparsecreatednvec(&vecY, nrows_, y)); @@ -219,7 +221,7 @@ struct laplacian_matrix_t : sparse_matrix_t { // y = alpha*A*x + beta*y // - void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, + void mv(value_type alpha, value_type* __restrict__ x, value_type beta, value_type* __restrict__ y, bool transpose = false, bool symmetric = false) const override { //TODO: call cusparse::csrmv ... and more: @@ -276,7 +278,7 @@ struct modularity_matrix_t : laplacian_matrix_t { // y = alpha*A*x + beta*y // - void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, + void mv(value_type alpha, value_type* __restrict__ x, value_type beta, value_type* __restrict__ y, bool transpose = false, bool symmetric = false) const override { //TODO: call cusparse::csrmv ... and more: From 6114c0f8f899d5259ff77712f1f1f3ff7a78e549 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 18 Jun 2020 20:30:50 -0500 Subject: [PATCH 136/189] Added some pre-conditions in mv(). --- cpp/include/raft/spectral/matrix_wrappers.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 0c55ef7294..f2a64d3c17 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -119,6 +119,9 @@ struct sparse_matrix_t { bool symmetric = false) const { using namespace sparse; + RAFT_EXPECT(x != nullptr, "Null x buffer."); + RAFT_EXPECT(y != nullptr, "Null y buffer."); + auto cusparse_h = handle_.get_cusparse_handle(); auto stream = handle_.get_stream(); From 761cacde85ae0957997ad8898613ef75d558a38d Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 19 Jun 2020 09:20:12 -0500 Subject: [PATCH 137/189] Fixed curand dependencies. 
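Besides linking libcurand, this adds float/double overloads so templated callers can draw normal random values through one name. A call-pattern sketch (assuming the helpers are reachable via raft/spectral/lanczos.hpp and its detail namespace; n is subject to cuRAND's usual constraints for normal generation):

    #include <raft/spectral/lanczos.hpp>

    // Overload resolution selects curandGenerateNormal (float) or
    // curandGenerateNormalDouble (double):
    template <typename ValueType_>
    void init_random_vector(curandGenerator_t gen, ValueType_* v, size_t n) {
      raft::detail::curandGenerateNormalX(gen, v, n, /*mean=*/ValueType_{0},
                                          /*stddev=*/ValueType_{1});
    }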
--- cpp/CMakeLists.txt | 1 + cpp/include/raft/spectral/error_temp.hpp | 4 ++-- cpp/include/raft/spectral/lanczos.hpp | 26 ++++++++++++++++-------- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a9c0375de6..03b0222e97 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -185,6 +185,7 @@ set(RAFT_LINK_LIBRARIES ${CUDA_cusolver_LIBRARY} ${CUDA_CUDART_LIBRARY} ${CUDA_cusparse_LIBRARY} + ${CUDA_curand_LIBRARY} rmm) set(RAFT_LINK_DIRECTORIES "") diff --git a/cpp/include/raft/spectral/error_temp.hpp b/cpp/include/raft/spectral/error_temp.hpp index 3fa5a38f5f..7d525ae5f1 100644 --- a/cpp/include/raft/spectral/error_temp.hpp +++ b/cpp/include/raft/spectral/error_temp.hpp @@ -11,13 +11,13 @@ inline void RAFT_EXPECT(bool cond, std::string const& reason) { if (!cond) throw std::runtime_error(reason.c_str()); } -#define RAFT_TRY(error_expression) +#define RAFT_TRY(expression) (expression) //assume RAFT_FAIL() can take a std::string `reason` // #define RAFT_FAIL(reason) -#define CUDA_TRY(call) +#define CUDA_TRY(call) (call) #define CUDA_CHECK_LAST() diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index e9682f5c28..c15b7ade0d 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -36,7 +36,19 @@ namespace raft { using namespace matrix; using namespace linalg; -namespace { +namespace detail { + +// curandGeneratorNormalX +inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, + float *outputPtr, size_t n, + float mean, float stddev) { + return curandGenerateNormal(generator, outputPtr, n, mean, stddev); +} +inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, + double *outputPtr, size_t n, + double mean, double stddev) { + return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); +} // ========================================================= // Helper functions @@ -565,7 +577,7 @@ static int lanczosRestart( return 0; } -} // namespace +} // namespace detail // ========================================================= // Eigensolver @@ -626,9 +638,7 @@ int computeSmallestEigenvectors( ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, ValueType_ *__restrict__ work_dev, ValueType_ *__restrict__ eigVals_dev, ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed) { - // ------------------------------------------------------- - // Variable declaration - // ------------------------------------------------------- + using namespace detail; // Useful constants const ValueType_ one = 1; @@ -858,6 +868,8 @@ int computeSmallestEigenvectors( ValueType_ tol, bool reorthogonalize, IndexType_ &iter, ValueType_ *__restrict__ eigVals_dev, ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { + using namespace detail; + // Matrix dimension IndexType_ n = A.nrows_; @@ -945,9 +957,7 @@ int computeLargestEigenvectors( ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, ValueType_ *__restrict__ work_dev, ValueType_ *__restrict__ eigVals_dev, ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed) { - // ------------------------------------------------------- - // Variable declaration - // ------------------------------------------------------- + using namespace detail; // Useful constants const ValueType_ one = 1; From ca240beefbcc9150fc464012d914a88d530c3aba Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 19 Jun 2020 
11:33:32 -0500 Subject: [PATCH 138/189] More CUDA 11 debt. --- cpp/CMakeLists.txt | 2 +- cpp/include/raft/spectral/matrix_wrappers.hpp | 1 + cpp/include/raft/spectral/sm_utils.hpp | 14 ++++++++++++++ .../{spectral_matrix.cpp => spectral_matrix.cu} | 0 4 files changed, 16 insertions(+), 1 deletion(-) rename cpp/test/{spectral_matrix.cpp => spectral_matrix.cu} (100%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 03b0222e97..928161ffbd 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -207,7 +207,7 @@ if(BUILD_RAFT_TESTS) test/mr/device/buffer.cpp test/mr/host/buffer.cpp test/test.cpp - test/spectral_matrix.cpp + test/spectral_matrix.cu test/eigen_solvers.cu test/cluster_solvers.cu) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index f2a64d3c17..4ff0bbf9dd 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -19,6 +19,7 @@ #include #include #include +#include // ========================================================= // Useful macros diff --git a/cpp/include/raft/spectral/sm_utils.hpp b/cpp/include/raft/spectral/sm_utils.hpp index 3c1c1e4484..34eeec16bd 100644 --- a/cpp/include/raft/spectral/sm_utils.hpp +++ b/cpp/include/raft/spectral/sm_utils.hpp @@ -416,6 +416,20 @@ static __inline__ __device__ float atomicFPLog(float *addr, float val) { return old; } +// Apply diagonal matrix to vector: +// +template +static __global__ void diagmv(IndexType_ n, ValueType_ alpha, + const ValueType_ *__restrict__ D, + const ValueType_ *__restrict__ x, + ValueType_ *__restrict__ y) { + IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + y[i] += alpha * D[i] * x[i]; + i += blockDim.x * gridDim.x; + } +} + } // namespace utils } // namespace raft diff --git a/cpp/test/spectral_matrix.cpp b/cpp/test/spectral_matrix.cu similarity index 100% rename from cpp/test/spectral_matrix.cpp rename to cpp/test/spectral_matrix.cu From b40b70267644cc68591e7a64084978f58ea0d5e6 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 19 Jun 2020 16:01:53 -0500 Subject: [PATCH 139/189] Added correct version of L1 norm. --- cpp/include/raft/spectral/matrix_wrappers.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 4ff0bbf9dd..612b8ef65e 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -21,6 +21,8 @@ #include #include +#include + // ========================================================= // Useful macros // ========================================================= @@ -83,6 +85,16 @@ class vector_t { size_type size(void) const { return size_; } value_type* raw(void) { return buffer_; } + + template + value_type nrm1(ThrustExecPolicy t_exe_pol) const { + return thrust::reduce(t_exe_pol, buffer_, buffer_ + size_, value_type{0}, + [] __device__(auto left, auto right) { + auto abs_left = left > 0 ? left : -left; + auto abs_right = right > 0 ? right : -right; + return abs_left + abs_right; + }); + } }; template From 2dad70dd4be2fb750f08e3b8f94c21388ed00ecc Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 19 Jun 2020 19:27:03 -0500 Subject: [PATCH 140/189] Fixes for matrix wrappers and tests. 
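The main functional change: laplacian_matrix_t now materializes its diagonal (the degree vector) with a single SpMV against an all-ones vector, using the new vector_t::fill(), instead of the earlier TODO stub. The pattern, as it appears in the constructor hunks below (template arguments restored here for readability):

    // D = A * 1 : row sums (vertex degrees) land in diagonal_.
    vector_t<value_type> ones{raft_handle, nrows};
    ones.fill(thrust_exec_policy, 1.0);
    sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0,
                                                diagonal_.raw());

analyzeModularity correspondingly normalizes by diagonal_.nrm1(thrust_exec_policy) in place of the removed cached get_diag_nrm1().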
--- cpp/include/raft/spectral/matrix_wrappers.hpp | 61 +++++++++---------- .../raft/spectral/modularity_maximization.hpp | 6 +- cpp/include/raft/spectral/partition.hpp | 4 +- cpp/test/eigen_solvers.cu | 10 +-- cpp/test/spectral_matrix.cu | 33 +++++++--- 5 files changed, 64 insertions(+), 50 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 612b8ef65e..e1eaf237c3 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -21,6 +21,7 @@ #include #include +#include #include // ========================================================= @@ -95,6 +96,11 @@ class vector_t { return abs_left + abs_right; }); } + + template + void fill(ThrustExecPolicy t_exe_pol, value_type value) { + thrust::fill_n(t_exe_pol, buffer_, size_, value); + } }; template @@ -209,30 +215,31 @@ struct sparse_matrix_t { template struct laplacian_matrix_t : sparse_matrix_t { - laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, + template + laplacian_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, + index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) : sparse_matrix_t(raft_handle, row_offsets, col_indices, values, nrows, nnz), diagonal_(raft_handle, nrows) { - auto* v = diagonal_.raw(); - //TODO: more work, here: - // - // vector_t ones(nrows); - // ones.fill(1.0); - // sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); + vector_t ones{raft_handle, nrows}; + ones.fill(thrust_exec_policy, 1.0); + sparse_matrix_t::mv(1, ones.raw(), 0, + diagonal_.raw()); } + template laplacian_matrix_t( - handle_t const& raft_handle, + handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, GraphCSRView const& csr_view) : sparse_matrix_t(raft_handle, csr_view), diagonal_(raft_handle, csr_view.number_of_vertices) { - //TODO: more work, here: - // - // vector_t ones(csr_view.number_of_vertices_); - // ones.fill(1.0); - // sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); + vector_t ones{raft_handle, csr_view.number_of_vertices}; + ones.fill(thrust_exec_policy, 1.0); + sparse_matrix_t::mv(1, ones.raw(), 0, + diagonal_.raw()); } // y = alpha*A*x + beta*y @@ -242,6 +249,8 @@ struct laplacian_matrix_t : sparse_matrix_t { bool symmetric = false) const override { //TODO: call cusparse::csrmv ... 
and more: // + // // scales y by beta: + // // // if (beta == 0) // CHECK_CUDA(cudaMemset(y, 0, (this->n) * sizeof(ValueType_))) // else if (beta != 1) @@ -271,26 +280,22 @@ struct laplacian_matrix_t : sparse_matrix_t { template struct modularity_matrix_t : laplacian_matrix_t { + template modularity_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) : laplacian_matrix_t( - raft_handle, row_offsets, col_indices, values, nrows, nnz) { - auto* v = laplacian_matrix_t::diagonal_.raw(); - //TODO: more work, here: - // - // diag_nrm1_ = diagonal_.nrm1(); - } + raft_handle, thrust_exec_policy, row_offsets, col_indices, values, + nrows, nnz) {} + template modularity_matrix_t( - handle_t const& raft_handle, + handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, GraphCSRView const& csr_view) - : laplacian_matrix_t(raft_handle, csr_view) { - //TODO: more work, here: - // - // diag_nrm1_ = diagonal_.nrm1(); - } + : laplacian_matrix_t( + raft_handle, thrust_exec_policy, csr_view) {} // y = alpha*A*x + beta*y // @@ -307,12 +312,6 @@ struct modularity_matrix_t : laplacian_matrix_t { // // y = y -(gamma/edge_sum)*d // Cublas::axpy(this->n, -(dot_res / this->edge_sum), D.raw(), 1, y, 1); } - - value_type get_diag_nrm1(void) const { - return diag_nrm1_; // TODO: replace w/ diag_.nrm1() - } - - value_type diag_nrm1_; // TODO: remove }; } // namespace matrix diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 6ab1b16659..8e198e515f 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -106,7 +106,7 @@ std::tuple modularity_maximization( // Initialize Modularity Matrix sparse_matrix_t A{handle, graph}; - modularity_matrix_t B{handle, graph}; + modularity_matrix_t B{handle, thrust_exec_policy, graph}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -170,7 +170,7 @@ void analyzeModularity(handle_t const &handle, // Initialize Modularity sparse_matrix_t A{handle, graph}; - modularity_matrix_t B{handle, graph}; + modularity_matrix_t B{handle, thrust_exec_policy, graph}; // Initialize output modularity = 0; @@ -189,7 +189,7 @@ void analyzeModularity(handle_t const &handle, } // modularity = modularity/nClusters; // devide by nnz - modularity = modularity / B.get_diag_nrm1(); + modularity = modularity / B.diagonal_.nrm1(thrust_exec_policy); } } // namespace spectral diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 6cc2744e96..746bb54b60 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -90,7 +90,7 @@ std::tuple partition( // Initialize Laplacian sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, graph}; + laplacian_matrix_t L{handle, thrust_exec_policy, graph}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -157,7 +157,7 @@ void analyzePartition(handle_t const &handle, // Initialize Laplacian sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, graph}; + laplacian_matrix_t L{handle, thrust_exec_policy, graph}; // Initialize output cost = 0; diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index 87bf74bdde..b96bda0b34 100644 --- a/cpp/test/eigen_solvers.cu +++ 
b/cpp/test/eigen_solvers.cu @@ -37,12 +37,12 @@ TEST(Raft, EigenSolvers) { value_type* vs{nullptr}; index_type nnz = 0; index_type nrows = 0; + auto stream = h.get_stream(); + auto t_exe_pol = thrust::cuda::par.on(stream); + sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; ASSERT_EQ(nullptr, sm1.row_offsets_); - laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; - ASSERT_EQ(nullptr, lm1.diagonal_.raw()); - index_type neigvs{10}; index_type maxiter{100}; index_type restart_iter{10}; @@ -61,10 +61,10 @@ TEST(Raft, EigenSolvers) { lanczos_solver_t eig_solver{cfg}; EXPECT_ANY_THROW( - eig_solver.solve_smallest_eigenvectors(h, lm1, eigvals, eigvecs)); + eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); EXPECT_ANY_THROW( - eig_solver.solve_largest_eigenvectors(h, lm1, eigvals, eigvecs)); + eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); } TEST(Raft, SpectralSolvers) { diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu index 30346d5da5..1052911c4f 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/spectral_matrix.cu @@ -47,15 +47,30 @@ TEST(Raft, SpectralMatrices) { ASSERT_EQ(nullptr, sm1.row_offsets_); ASSERT_EQ(nullptr, sm2.row_offsets_); - laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; - laplacian_matrix_t lm2{h, empty_graph}; - ASSERT_EQ(nullptr, lm1.diagonal_.raw()); - ASSERT_EQ(nullptr, lm2.diagonal_.raw()); - - modularity_matrix_t mm1{h, ro, ci, vs, nrows, nnz}; - modularity_matrix_t mm2{h, empty_graph}; - ASSERT_EQ(nullptr, mm1.diagonal_.raw()); - ASSERT_EQ(nullptr, mm2.diagonal_.raw()); + auto stream = h.get_stream(); + auto t_exe_pol = thrust::cuda::par.on(stream); + + auto cnstr_lm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { + laplacian_matrix_t lm1{h, t_exe_pol, ro, ci, + vs, nrows, nnz}; + }; + EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args + + auto cnstr_lm2 = [&h, t_exe_pol, &empty_graph](void) { + laplacian_matrix_t lm2{h, t_exe_pol, empty_graph}; + }; + EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args + + auto cnstr_mm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { + modularity_matrix_t mm1{h, t_exe_pol, ro, ci, + vs, nrows, nnz}; + }; + EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args + + auto cnstr_mm2 = [&h, t_exe_pol, &empty_graph](void) { + modularity_matrix_t mm2{h, t_exe_pol, empty_graph}; + }; + EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr ptr args } } // namespace raft From 3f5ec592a551b3311eb3ed87f0f783cabfe08d80 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 19 Jun 2020 20:50:51 -0500 Subject: [PATCH 141/189] Fixed mv() for laplacian matrix.
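The graph Laplacian is L = D - A (degree diagonal minus adjacency), so y = alpha*L*x + beta*y is assembled in three steps rather than forming L explicitly. Roughly, the mv() below computes:

    y  = beta * y          // cudaMemsetAsync when beta == 0, cublas scal otherwise
    y += alpha * D * x     // utils::diagmv kernel over the cached diagonal_
    y += -alpha * A * x    // delegated to sparse_matrix_t::mv(-alpha, x, 1, y)

The launch grid is capped at 65535 blocks and diagmv uses a grid-stride loop, so any n is covered.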
--- cpp/include/raft/spectral/matrix_wrappers.hpp | 63 +++++++++++-------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index e1eaf237c3..286fe5e6d2 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -24,6 +25,8 @@ #include #include +#include + // ========================================================= // Useful macros // ========================================================= @@ -87,6 +90,8 @@ class vector_t { value_type* raw(void) { return buffer_; } + value_type const* raw(void) const { return buffer_; } + template value_type nrm1(ThrustExecPolicy t_exe_pol) const { return thrust::reduce(t_exe_pol, buffer_, buffer_ + size_, value_type{0}, @@ -203,6 +208,8 @@ struct sparse_matrix_t { #endif } + handle_t const& get_handle(void) const { return handle_; } + //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate handle_t const& handle_; @@ -247,32 +254,38 @@ struct laplacian_matrix_t : sparse_matrix_t { void mv(value_type alpha, value_type* __restrict__ x, value_type beta, value_type* __restrict__ y, bool transpose = false, bool symmetric = false) const override { - //TODO: call cusparse::csrmv ... and more: + constexpr int BLOCK_SIZE = 1024; + auto n = sparse_matrix_t::nrows_; + + auto cublas_h = + sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = + sparse_matrix_t::get_handle().get_stream(); + + // scales y by beta: + // + if (beta == 0) { + CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream)); + } else if (beta != 1) { + CUBLAS_CHECK(linalg::cublasscal(cublas_h, n, &beta, y, 1, stream)); + } + + // Apply diagonal matrix + // + dim3 gridDim, blockDim; + gridDim.x = std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); + gridDim.y = 1; + gridDim.z = 1; + blockDim.x = BLOCK_SIZE; + blockDim.y = 1; + blockDim.z = 1; + utils::diagmv<<>>(n, alpha, diagonal_.raw(), + x, y); + CUDA_CHECK_LAST(); + + // Apply adjacency matrix // - // // scales y by beta: - // // - // if (beta == 0) - // CHECK_CUDA(cudaMemset(y, 0, (this->n) * sizeof(ValueType_))) - // else if (beta != 1) - // thrust::transform(thrust::device_pointer_cast(y), - // thrust::device_pointer_cast(y + this->n), - // thrust::make_constant_iterator(beta), - // thrust::device_pointer_cast(y), - // thrust::multiplies()); - - // // Apply diagonal matrix - // dim3 gridDim, blockDim; - // gridDim.x = min(((this->n) + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - // gridDim.y = 1; - // gridDim.z = 1; - // blockDim.x = BLOCK_SIZE; - // blockDim.y = 1; - // blockDim.z = 1; - // diagmv<<s>>>(this->n, alpha, D.raw(), x, y); - // cudaCheckError(); - - // // Apply adjacency matrix - // sparse_matrix_t::mv(-alpha, x, 1, y); + sparse_matrix_t::mv(-alpha, x, 1, y); } vector_t diagonal_; From ad36433dda9f2eaadf48d8dad52a7911ff7ac0b1 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 19 Jun 2020 21:13:10 -0500 Subject: [PATCH 142/189] Fixed mv() for modularity matrix. 
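With d the degree vector and edge_sum_ = ||d||_1 (twice the number of edges, cached at construction via the new nrm1()), the modularity matrix B = A - d*d^T/edge_sum_ is applied without ever materializing the rank-1 term:

    y     = alpha * A * x             // reuse sparse_matrix_t::mv
    gamma = d' * x                    // cublas dot against the cached diagonal_
    y    += -(gamma / edge_sum_) * d  // rank-1 correction via cublas axpy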
--- cpp/include/raft/spectral/matrix_wrappers.hpp | 45 ++++++++++++++----- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 286fe5e6d2..22c6416c4e 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -301,30 +301,55 @@ struct modularity_matrix_t : laplacian_matrix_t { index_type const nrows, index_type const nnz) : laplacian_matrix_t( raft_handle, thrust_exec_policy, row_offsets, col_indices, values, - nrows, nnz) {} + nrows, nnz) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( + thrust_exec_policy); + } template modularity_matrix_t( handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, GraphCSRView const& csr_view) - : laplacian_matrix_t( - raft_handle, thrust_exec_policy, csr_view) {} + : laplacian_matrix_t(raft_handle, + thrust_exec_policy, csr_view) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( + thrust_exec_policy); + } // y = alpha*A*x + beta*y // void mv(value_type alpha, value_type* __restrict__ x, value_type beta, value_type* __restrict__ y, bool transpose = false, bool symmetric = false) const override { - //TODO: call cusparse::csrmv ... and more: + auto n = sparse_matrix_t::nrows_; + + auto cublas_h = + sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = + sparse_matrix_t::get_handle().get_stream(); + + // y = A*x + // + sparse_matrix_t::mv(alpha, x, 0, y); + value_type dot_res; + + // gamma = d'*x // - // // y = A*x - // sparse_matrix_t::mv(alpha, x, 0, y); - // value_type dot_res; - // // gamma = d'*x // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); - // // y = y -(gamma/edge_sum)*d - // Cublas::axpy(this->n, -(dot_res / this->edge_sum), D.raw(), 1, y, 1); + CUBLAS_CHECK(linalg::cublasdot( + cublas_h, n, laplacian_matrix_t::diagonal_.raw(), + 1, x, 1, &dot_res, stream)); + + // y = y -(gamma/edge_sum)*d + // + value_type gamma_ = -dot_res / edge_sum_; + CUBLAS_CHECK(linalg::cublasaxpy( + cublas_h, n, &gamma_, + laplacian_matrix_t::diagonal_.raw(), 1, y, 1, + stream)); } + + value_type edge_sum_; }; } // namespace matrix From 8ea55828df1550b15395eab3192a296714bbe771 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 22 Jun 2020 09:30:52 -0500 Subject: [PATCH 143/189] Updated CHANGELOG.md. --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d9a391369f..457312f06b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # RAFT 0.15.0 (Date TBD) ## New Features +- PR #12: Spectral clustering. ## Improvements From 5474e99a1916902b1875143823baec6035226210 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 22 Jun 2020 18:38:52 -0500 Subject: [PATCH 144/189] Integrated raft error control from outer PR. 
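The stop-gap macros from error_temp.hpp (RAFT_EXPECT, RAFT_TRY, CUDA_CHECK_LAST) are retired in favor of the project-wide error machinery: RAFT_EXPECTS(cond, msg) for precondition checks that throw on failure, and CHECK_CUDA(stream) after kernel launches and Thrust calls. The recurring pattern in this patch (kernel name hypothetical):

    RAFT_EXPECTS(obs != nullptr, "Null obs buffer.");  // throws instead of silently continuing
    some_kernel<<<grid, block, 0, stream>>>(/*...*/);
    CHECK_CUDA(stream);                                // surfaces async launch errors

What remains of error_temp.hpp is renamed to warn_dbg.hpp, keeping only the WARNING/COUT/CERR debug helpers.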
--- cpp/include/raft/spectral/cluster_solvers.hpp | 10 +-- cpp/include/raft/spectral/eigen_solvers.hpp | 24 +++--- cpp/include/raft/spectral/kmeans.hpp | 57 ++++++++------- cpp/include/raft/spectral/lanczos.hpp | 73 ++++++++++--------- cpp/include/raft/spectral/lapack.hpp | 4 +- cpp/include/raft/spectral/matrix_wrappers.hpp | 8 +- .../raft/spectral/modularity_maximization.hpp | 10 +-- cpp/include/raft/spectral/partition.hpp | 8 +- cpp/include/raft/spectral/spectral_util.hpp | 11 ++- .../spectral/{error_temp.hpp => warn_dbg.hpp} | 18 +---- 10 files changed, 107 insertions(+), 116 deletions(-) rename cpp/include/raft/spectral/{error_temp.hpp => warn_dbg.hpp} (65%) diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index b19237d1a8..922ae7cfab 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -48,13 +48,13 @@ struct kmeans_solver_t { size_type_t n_obs_vecs, size_type_t dim, value_type_t const* __restrict__ obs, index_type_t* __restrict__ codes) const { - RAFT_EXPECT(obs != nullptr, "Null obs buffer."); - RAFT_EXPECT(codes != nullptr, "Null codes buffer."); + RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); + RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; - RAFT_TRY(kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, - config_.tol, config_.maxIter, obs, codes, residual, iters, - config_.seed)); + kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, + config_.tol, config_.maxIter, obs, codes, residual, iters, + config_.seed); return std::make_pair(residual, iters); } diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index 97114661c5..056189dcba 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -49,13 +49,13 @@ struct lanczos_solver_t { sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) const { - RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - RAFT_TRY(computeSmallestEigenvectors( - handle, A, config_.n_eigVecs, config_.maxIter, config_.restartIter, - config_.tol, config_.reorthogonalize, iters, eigVals, eigVecs, - config_.seed)); + computeSmallestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, + config_.restartIter, config_.tol, + config_.reorthogonalize, iters, eigVals, + eigVecs, config_.seed); return iters; } @@ -64,13 +64,13 @@ struct lanczos_solver_t { sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) const { - RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - RAFT_TRY(computeLargestEigenvectors(handle, A, config_.n_eigVecs, - config_.maxIter, config_.restartIter, - config_.tol, config_.reorthogonalize, - iters, eigVals, eigVecs, config_.seed)); + computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, + config_.restartIter, config_.tol, + config_.reorthogonalize, iters, eigVals, eigVecs, + config_.seed); return iters; } diff --git 
a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 444bf2491a..db85e25dea 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -29,11 +29,12 @@ #include #include +#include #include #include -#include #include #include +#include namespace { @@ -346,7 +347,7 @@ static int chooseNewCentroid(handle_t const& handle, thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), thrust::device_pointer_cast(distsCumSum)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); CUDA_TRY(cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(ValueType_), cudaMemcpyDeviceToHost)); @@ -357,7 +358,7 @@ static int chooseNewCentroid(handle_t const& handle, thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - thrust::device_pointer_cast(distsCumSum)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); obsIndex = max(obsIndex, 0); obsIndex = min(obsIndex, n - 1); @@ -435,7 +436,7 @@ static int initializeCentroids( // Choose first centroid thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), obs, dists, centroids)) WARNING("error in k-means++ (could not pick centroid)"); @@ -444,7 +445,7 @@ static int initializeCentroids( CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_), stream)); computeDistances<<>>( n, d, 1, obs, centroids, dists); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Choose remaining centroids for (i = 1; i < k; ++i) { @@ -457,19 +458,19 @@ static int initializeCentroids( CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(ValueType_), stream)); computeDistances<<>>( n, d, 1, obs, centroids + IDX(0, i, d), dists + n); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Recompute minimum distances minDistances2<<>>(n, dists, dists + n, codes, i); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } // Compute cluster sizes CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); computeClusterSizes<<>>(n, k, codes, clusterSizes); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); return 0; } @@ -520,7 +521,7 @@ static int assignCentroids( gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); computeDistances<<>>(n, d, k, obs, centroids, dists); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Find centroid closest to each observation vector CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); @@ -532,7 +533,7 @@ static int assignCentroids( gridDim.z = 1; minDistances<<>>(n, k, dists, codes, clusterSizes); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Compute residual sum of squares *residual_host = @@ -599,31 +600,31 @@ static int updateCentroids(handle_t const& handle, // Cluster assigned to each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); thrust::transform(thrust_exec_policy, rows, rows + d * n, thrust::make_constant_iterator(n), rows, thrust::modulus()); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); thrust::gather(thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Row associated with each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); 
thrust::transform(thrust_exec_policy, rows, rows + d * n, thrust::make_constant_iterator(n), rows, thrust::divides()); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Sort and reduce to add observation vectors in same cluster thrust::stable_sort_by_key(thrust_exec_policy, codes_copy, codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); thrust::reduce_by_key(thrust_exec_policy, rows, rows + d * n, obs_copy, codes_copy, // Output to codes_copy is ignored thrust::device_pointer_cast(centroids)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Divide sums by cluster size to get centroid matrix blockDim.x = WARP_SIZE; @@ -634,7 +635,7 @@ static int updateCentroids(handle_t const& handle, gridDim.z = 1; divideCentroids<<>>(d, k, clusterSizes, centroids); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); return 0; } @@ -728,20 +729,20 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(ValueType_), stream)); computeDistances<<>>(n, d, 1, obs, centroids, work); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); *residual_host = thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); return 0; } if (n <= k) { thrust::sequence(thrust_exec_policy, thrust::device_pointer_cast(codes), thrust::device_pointer_cast(codes + n)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); if (n < k) CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, @@ -802,7 +803,7 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), thrust::device_pointer_cast(clusterSizes + k), 0) - thrust::device_pointer_cast(clusterSizes)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } // Check for convergence @@ -852,11 +853,11 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, using namespace matrix; // Check that parameters are valid - RAFT_EXPECT(n > 0, "invalid parameter (n<1)"); - RAFT_EXPECT(d > 0, "invalid parameter (d<1)"); - RAFT_EXPECT(k > 0, "invalid parameter (k<1)"); - RAFT_EXPECT(tol > 0, "invalid parameter (tol<=0)"); - RAFT_EXPECT(maxiter >= 0, "invalid parameter (maxiter<0)"); + RAFT_EXPECTS(n > 0, "invalid parameter (n<1)"); + RAFT_EXPECTS(d > 0, "invalid parameter (d<1)"); + RAFT_EXPECTS(k > 0, "invalid parameter (k<1)"); + RAFT_EXPECTS(tol > 0, "invalid parameter (tol<=0)"); + RAFT_EXPECTS(maxiter >= 0, "invalid parameter (maxiter<0)"); // Allocate memory vector_t clusterSizes(handle, k); diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 6a4a016e4f..8aa615c25d 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -25,11 +25,12 @@ #include #include +#include #include #include -#include #include #include +#include namespace raft { @@ -100,7 +101,7 @@ int performLanczosIteration( auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); - RAFT_EXPECT(A != nullptr, "Null matrix pointer."); + RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); IndexType_ n = A->nrows_; @@ -672,11 +673,12 @@ int computeSmallestEigenvectors( // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, 
"Invalid number of eigenvectors."); - RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); - RAFT_EXPECT(tol > 0, "Invalid tolerance."); - RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); - RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -696,8 +698,8 @@ int computeSmallestEigenvectors( work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, - stream)); // ????? TODO: check / remove + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue to determine shift @@ -706,16 +708,15 @@ int computeSmallestEigenvectors( // Random number generator curandGenerator_t randGen; // Initialize random number generator - CUDA_TRY(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); + curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); // FIXME: This is hard coded, which is good for unit testing... // but should really be a parameter so it could be // "random" for real runs and "fixed" for tests - CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, seed /*time(NULL)*/)); - // CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, time(NULL))); + curandSetPseudoRandomGeneratorSeed(randGen, seed /*time(NULL)*/); + // Initialize initial Lanczos vector - CUDA_TRY( - curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); ValueType_ normQ1; CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); @@ -821,7 +822,7 @@ int computeSmallestEigenvectors( *effIter, &zero, eigVecs_dev, n, stream)); // Clean up and exit - CUDA_TRY(curandDestroyGenerator(randGen)); + curandDestroyGenerator(randGen); return 0; } @@ -874,11 +875,12 @@ int computeSmallestEigenvectors( IndexType_ n = A.nrows_; // Check that parameters are valid - RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); - RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); - RAFT_EXPECT(tol > 0, "Invalid tolerance."); - RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); - RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); // Allocate memory std::vector alpha_host_v(restartIter); @@ -987,11 +989,12 @@ int computeLargestEigenvectors( // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); - RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); - RAFT_EXPECT(tol > 0, "Invalid tolerance."); - RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); - RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); + 
RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -1021,11 +1024,10 @@ int computeLargestEigenvectors( // Random number generator curandGenerator_t randGen; // Initialize random number generator - CUDA_TRY(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); - CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, seed)); + curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); + curandSetPseudoRandomGeneratorSeed(randGen, seed); // Initialize initial Lanczos vector - CUDA_TRY( - curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); ValueType_ normQ1; CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); @@ -1141,7 +1143,7 @@ int computeLargestEigenvectors( *effIter, &zero, eigVecs_dev, n, stream)); // Clean up and exit - CUDA_TRY(curandDestroyGenerator(randGen)); + curandDestroyGenerator(randGen); return 0; } @@ -1194,11 +1196,12 @@ int computeLargestEigenvectors(handle_t const &handle, IndexType_ n = A.nrows_; // Check that parameters are valid - RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); - RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); - RAFT_EXPECT(tol > 0, "Invalid tolerance."); - RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); - RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); // Allocate memory std::vector alpha_host_v(restartIter); diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index 0dab3d57b2..4417640705 100644 --- a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -19,7 +19,7 @@ #include #include -#include +#include //for now; TODO: check if/where this `define` should be; // @@ -33,7 +33,7 @@ namespace raft { std::stringstream ss; \ ss << "Lapack error: argument number " << -status \ << " had an illegal value."; \ - RAFT_FAIL(ss.str()); \ + throw exception(ss.str()); \ } else if (status > 0) \ RAFT_FAIL("Lapack error: internal error."); \ } diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 22c6416c4e..bd03038373 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -15,11 +15,11 @@ */ #pragma once +#include #include #include #include #include -#include #include #include @@ -143,8 +143,8 @@ struct sparse_matrix_t { bool symmetric = false) const { using namespace sparse; - RAFT_EXPECT(x != nullptr, "Null x buffer."); - RAFT_EXPECT(y != nullptr, "Null y buffer."); + RAFT_EXPECTS(x != nullptr, "Null x buffer."); + RAFT_EXPECTS(y != nullptr, "Null y buffer."); auto cusparse_h = handle_.get_cusparse_handle(); auto stream = handle_.get_stream(); @@ -281,7 +281,7 @@ struct laplacian_matrix_t : sparse_matrix_t { blockDim.z = 1; utils::diagmv<<>>(n, alpha, 
diagonal_.raw(), x, y); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Apply adjacency matrix // diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 8e198e515f..679b5ae7df 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -90,9 +90,9 @@ std::tuple modularity_maximization( GraphCSRView const &graph, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { - RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); - RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -121,7 +121,7 @@ std::tuple modularity_maximization( // notice that at this point the matrix has already been transposed, so we are scaling // columns scale_obs(nEigVecs, n, eigVecs); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Find partition clustering auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, @@ -151,7 +151,7 @@ void analyzeModularity(handle_t const &handle, vertex_t nClusters, vertex_t const *__restrict__ clusters, weight_t &modularity) { - RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); edge_t i; edge_t n = graph.number_of_vertices; diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 746bb54b60..0c72694f07 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -70,9 +70,9 @@ std::tuple partition( GraphCSRView const &graph, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { - RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); - RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -137,7 +137,7 @@ void analyzePartition(handle_t const &handle, GraphCSRView const &graph, vertex_t nClusters, const vertex_t *__restrict__ clusters, weight_t &edgeCut, weight_t &cost) { - RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); edge_t i; edge_t n = graph.number_of_vertices; diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 2cc38cbbf1..8f8eb3ad8b 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -16,8 +16,8 @@ #pragma once +#include #include -#include #include #include @@ -110,7 +110,6 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_* obs) { // launch scaling kernel (scale each column of obs by its norm) scale_obs_kernel<<>>(m, n, obs); - CUDA_CHECK_LAST(); return cudaSuccess; } @@ -133,7 +132,7 @@ void transform_eigen_matrix(handle_t const& handle, mean = 
thrust::reduce( thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); mean /= n; thrust::transform(thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), @@ -141,7 +140,7 @@ void transform_eigen_matrix(handle_t const& handle, thrust::make_constant_iterator(mean), thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::minus()); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); CUBLAS_CHECK( cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); @@ -154,7 +153,7 @@ void transform_eigen_matrix(handle_t const& handle, thrust::make_constant_iterator(std), thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::divides()); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } // Transpose eigenvector matrix @@ -213,7 +212,7 @@ bool construct_indicator(handle_t const& handle, thrust::device_pointer_cast(clusters + n), thrust::device_pointer_cast(part_i.raw() + n))), equal_to_i_op(index)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Compute size of ith partition CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, diff --git a/cpp/include/raft/spectral/error_temp.hpp b/cpp/include/raft/spectral/warn_dbg.hpp similarity index 65% rename from cpp/include/raft/spectral/error_temp.hpp rename to cpp/include/raft/spectral/warn_dbg.hpp index 7d525ae5f1..406f1b7c7e 100644 --- a/cpp/include/raft/spectral/error_temp.hpp +++ b/cpp/include/raft/spectral/warn_dbg.hpp @@ -6,24 +6,12 @@ #define STRINGIFY_DETAIL(x) #x #define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) -///#define RAFT_EXPECT(cond, reason) -inline void RAFT_EXPECT(bool cond, std::string const& reason) { - if (!cond) throw std::runtime_error(reason.c_str()); -} - -#define RAFT_TRY(expression) (expression) - -//assume RAFT_FAIL() can take a std::string `reason` -// -#define RAFT_FAIL(reason) - -#define CUDA_TRY(call) (call) - -#define CUDA_CHECK_LAST() - #ifdef DEBUG #define COUT() (std::cout) #define CERR() (std::cerr) + +//nope: +// #define WARNING(message) \ do { \ std::stringstream ss; \ From 417581954daa1c1a6c906e225cb4be13a388ca9f Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 24 Jun 2020 18:54:55 -0500 Subject: [PATCH 145/189] Replaced buggy Thrust call with simplified logic. 
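k-means++ picks the next centroid with probability proportional to squared distance, i.e. an inverse-CDF draw on the cumulative distance sums. The exact search, thrust::lower_bound on distsCumSum, seg-faults with stream-bound execution policies (a since-fixed Thrust bug), so it is approximated by linear interpolation between the first cumulative value and the total:

    exact:   obsIndex = min { i : distsCumSum[i] >= r * distsSum },  r in [0, 1)
    interim: obsIndex = (n - 1) * (r * distsSum - minSum) / (distsSum - minSum)

where minSum = distsCumSum[0]. The interpolation preserves the endpoints and monotonicity but not the exact distribution; it is a stopgap until the Thrust fix is picked up.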
--- cpp/include/raft/spectral/kmeans.hpp | 31 ++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index db85e25dea..ec5d1d67da 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -354,10 +354,33 @@ static int chooseNewCentroid(handle_t const& handle, // Randomly choose observation vector // Probabilities are proportional to square of distance to closest // centroid (see k-means++ algorithm) - obsIndex = (thrust::lower_bound( - thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), - thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - - thrust::device_pointer_cast(distsCumSum)); + // + //seg-faults due to Thrust bug + //on binary-search-like algorithms + //when run with stream dependent + //execution policies; fixed on Thrust GitHub + //hence replace w/ linear interpolation, + //until the Thrust issue gets resolved: + // + // obsIndex = (thrust::lower_bound( + // thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), + // thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - + // thrust::device_pointer_cast(distsCumSum)); + // + //linear interpolation logic: + //{ + ValueType_ minSum{0}; + CUDA_TRY(cudaMemcpy(&minSum, distsCumSum, sizeof(ValueType_), + cudaMemcpyDeviceToHost)); + if (distsSum > minSum) { + ValueType_ vIndex = static_cast(n - 1); + obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / + (distsSum - minSum)); + } else { + obsIndex = 0; + } + //} + CHECK_CUDA(stream); obsIndex = max(obsIndex, 0); obsIndex = min(obsIndex, n - 1); From 9fdf4d618cffa9d32be6a329bce0d7934ebded2b Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 1 Jul 2020 16:05:55 -0500 Subject: [PATCH 146/189] Removed useless graph.hpp dependency in spectral clustering. 
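The sparse_matrix_t view constructor is now a template over any CSR-shaped type, so raft no longer needs graph.hpp: anything exposing offsets, indices, edge_data, number_of_vertices, and number_of_edges members qualifies, and the spectral entry points take a sparse_matrix_t directly. A sketch of the minimal shape the templated constructor expects (mirroring the csr_view_t helper added to the test below):

    struct csr_view_t {
      int* offsets;           // CSR row offsets, size V+1
      int* indices;           // CSR column indices, size E
      double* edge_data;      // edge weights, size E (may be nullptr)
      int number_of_vertices;
      int number_of_edges;
    };
    sparse_matrix_t<int, double> sm{h, csr_view_t{ro, ci, vs, nrows, nnz}};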
--- cpp/include/raft/spectral/matrix_wrappers.hpp | 28 +++++++++---------- .../raft/spectral/modularity_maximization.hpp | 26 ++++++++--------- cpp/include/raft/spectral/partition.hpp | 26 ++++++++--------- cpp/test/cluster_solvers.cu | 10 +++---- cpp/test/eigen_solvers.cu | 14 +++++----- cpp/test/spectral_matrix.cu | 25 +++++++++++------ 6 files changed, 67 insertions(+), 62 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index bd03038373..1c78fd16fd 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -120,9 +119,8 @@ struct sparse_matrix_t { nrows_(nrows), nnz_(nnz) {} - sparse_matrix_t( - handle_t const& raft_handle, - GraphCSRView const& csr_view) + template + sparse_matrix_t(handle_t const& raft_handle, CSRView const& csr_view) : handle_(raft_handle), row_offsets_(csr_view.offsets), col_indices_(csr_view.indices), @@ -238,12 +236,14 @@ struct laplacian_matrix_t : sparse_matrix_t { } template - laplacian_matrix_t( - handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, - GraphCSRView const& csr_view) - : sparse_matrix_t(raft_handle, csr_view), - diagonal_(raft_handle, csr_view.number_of_vertices) { - vector_t ones{raft_handle, csr_view.number_of_vertices}; + laplacian_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const& csr_m) + : sparse_matrix_t(raft_handle, csr_m.row_offsets_, + csr_m.col_indices_, csr_m.values_, + csr_m.nrows_, csr_m.nnz_), + diagonal_(raft_handle, csr_m.nrows_) { + vector_t ones{raft_handle, csr_m.nrows_}; ones.fill(thrust_exec_policy, 1.0); sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); @@ -307,11 +307,11 @@ struct modularity_matrix_t : laplacian_matrix_t { } template - modularity_matrix_t( - handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, - GraphCSRView const& csr_view) + modularity_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const& csr_m) : laplacian_matrix_t(raft_handle, - thrust_exec_policy, csr_view) { + thrust_exec_policy, csr_m) { edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( thrust_exec_policy); } diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 679b5ae7df..5ac33eda43 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -82,12 +82,11 @@ using namespace linalg; * performed. * @return error flag. 
*/ -template +template std::tuple modularity_maximization( handle_t const &handle, ThrustExePolicy thrust_exec_policy, - GraphCSRView const &graph, + sparse_matrix_t const &csr_m, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); @@ -100,13 +99,13 @@ std::tuple modularity_maximization( std::tuple stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver - edge_t n = graph.number_of_vertices; + vertex_t n = csr_m.nrows_; // Compute eigenvectors of Modularity Matrix // Initialize Modularity Matrix - sparse_matrix_t A{handle, graph}; - modularity_matrix_t B{handle, thrust_exec_policy, graph}; + //sparse_matrix_t A{handle, graph}; + modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -143,18 +142,17 @@ std::tuple modularity_maximization( * @param clusters (Input, device memory, n entries) Cluster assignments. * @param modularity On exit, modularity */ -template +template void analyzeModularity(handle_t const &handle, ThrustExePolicy thrust_exec_policy, - GraphCSRView const &graph, + sparse_matrix_t const &csr_m, vertex_t nClusters, vertex_t const *__restrict__ clusters, weight_t &modularity) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - edge_t i; - edge_t n = graph.number_of_vertices; + vertex_t i; + vertex_t n = csr_m.nrows_; weight_t partModularity, clustersize; auto cublas_h = handle.get_cublas_handle(); @@ -169,8 +167,8 @@ void analyzeModularity(handle_t const &handle, cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity - sparse_matrix_t A{handle, graph}; - modularity_matrix_t B{handle, thrust_exec_policy, graph}; + ///sparse_matrix_t A{handle, graph}; + modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; // Initialize output modularity = 0; diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 0c72694f07..841fca04d9 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -62,12 +62,11 @@ using namespace linalg; * performed. * @return statistics: number of eigensolver iterations, . 
*/ -template +template std::tuple partition( handle_t const &handle, ThrustExePolicy thrust_exec_policy, - GraphCSRView const &graph, + sparse_matrix_t const &csr_m, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); @@ -80,7 +79,7 @@ std::tuple partition( std::tuple stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver - edge_t n = graph.number_of_vertices; + vertex_t n = csr_m.nrows_; // ------------------------------------------------------- // Spectral partitioner @@ -89,8 +88,8 @@ std::tuple partition( // Compute eigenvectors of Laplacian // Initialize Laplacian - sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, thrust_exec_policy, graph}; + ///sparse_matrix_t A{handle, graph}; + laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -130,17 +129,16 @@ std::tuple partition( * @param cost On exit, partition cost function. * @return error flag. */ -template +template void analyzePartition(handle_t const &handle, ThrustExePolicy thrust_exec_policy, - GraphCSRView const &graph, + sparse_matrix_t const &csr_m, vertex_t nClusters, const vertex_t *__restrict__ clusters, weight_t &edgeCut, weight_t &cost) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - edge_t i; - edge_t n = graph.number_of_vertices; + vertex_t i; + vertex_t n = csr_m.nrows_; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -156,8 +154,8 @@ void analyzePartition(handle_t const &handle, cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Laplacian - sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, thrust_exec_policy, graph}; + ///sparse_matrix_t A{handle, graph}; + laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; // Initialize output cost = 0; diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index d3d6a04312..04a94fbf22 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -89,16 +89,16 @@ TEST(Raft, ModularitySolvers) { kmeans_solver_t cluster_solver{clust_cfg}; auto stream = h.get_stream(); - GraphCSRView empty_graph; + sparse_matrix_t sm{h, nullptr, nullptr, + nullptr, 0, 0}; auto t_exe_p = thrust::cuda::par.on(stream); EXPECT_ANY_THROW(spectral::modularity_maximization( - h, t_exe_p, empty_graph, eig_solver, cluster_solver, clusters, eigvals, - eigvecs)); + h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type modularity{0}; - EXPECT_ANY_THROW(spectral::analyzeModularity(h, t_exe_p, empty_graph, k, - clusters, modularity)); + EXPECT_ANY_THROW( + spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity)); } } // namespace raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index b96bda0b34..e6ee09262e 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -101,17 +101,17 @@ TEST(Raft, SpectralSolvers) { kmeans_solver_t cluster_solver{clust_cfg}; auto stream = h.get_stream(); - GraphCSRView empty_graph; - auto t_exe_p = thrust::cuda::par.on(stream); - EXPECT_ANY_THROW(spectral::partition(h, t_exe_p, empty_graph, eig_solver, - cluster_solver, clusters, eigvals, - eigvecs)); + auto t_exe_p = thrust::cuda::par.on(stream); + sparse_matrix_t sm{h, nullptr, nullptr, + 
nullptr, 0, 0}; + EXPECT_ANY_THROW(spectral::partition( + h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type edgeCut{0}; value_type cost{0}; - EXPECT_ANY_THROW(spectral::analyzePartition(h, t_exe_p, empty_graph, k, - clusters, edgeCut, cost)); + EXPECT_ANY_THROW( + spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost)); } } // namespace raft diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu index 1052911c4f..e5c2d52764 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/spectral_matrix.cu @@ -22,7 +22,16 @@ #include namespace raft { - +namespace { +template +struct csr_view_t { + index_type* offsets; + index_type* indices; + value_type* edge_data; + index_type number_of_vertices; + index_type number_of_edges; +}; +} // namespace TEST(Raft, SpectralMatrices) { using namespace matrix; using index_type = int; @@ -32,18 +41,18 @@ TEST(Raft, SpectralMatrices) { ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); + csr_view_t csr_v{nullptr, nullptr, nullptr, 0, 0}; + int const sz = 10; vector_t d_v{h, sz}; - GraphCSRView empty_graph; - index_type* ro{nullptr}; index_type* ci{nullptr}; value_type* vs{nullptr}; index_type nnz = 0; index_type nrows = 0; sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; - sparse_matrix_t sm2{h, empty_graph}; + sparse_matrix_t sm2{h, csr_v}; ASSERT_EQ(nullptr, sm1.row_offsets_); ASSERT_EQ(nullptr, sm2.row_offsets_); @@ -56,8 +65,8 @@ TEST(Raft, SpectralMatrices) { }; EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args - auto cnstr_lm2 = [&h, t_exe_pol, &empty_graph](void) { - laplacian_matrix_t lm2{h, t_exe_pol, empty_graph}; + auto cnstr_lm2 = [&h, t_exe_pol, &sm2](void) { + laplacian_matrix_t lm2{h, t_exe_pol, sm2}; }; EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args @@ -67,8 +76,8 @@ TEST(Raft, SpectralMatrices) { }; EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args - auto cnstr_mm2 = [&h, t_exe_pol, &empty_graph](void) { - modularity_matrix_t mm2{h, t_exe_pol, empty_graph}; + auto cnstr_mm2 = [&h, t_exe_pol, &sm2](void) { + modularity_matrix_t mm2{h, t_exe_pol, sm2}; }; EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr ptr args } From 28e9d4adfe604bac89c825d27cc5a545417d535f Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 1 Jul 2020 16:10:32 -0500 Subject: [PATCH 147/189] Removed unnecessary graph.hpp. --- cpp/include/raft/graph.hpp | 550 ------------------------------------- 1 file changed, 550 deletions(-) delete mode 100644 cpp/include/raft/graph.hpp diff --git a/cpp/include/raft/graph.hpp b/cpp/include/raft/graph.hpp deleted file mode 100644 index 089decc8ee..0000000000 --- a/cpp/include/raft/graph.hpp +++ /dev/null @@ -1,550 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once -/// #include // TODO: clarify what must be done about `comm` -#include -#include -#include - -#include -#include - -namespace raft { -namespace matrix { - -enum class PropType { PROP_UNDEF, PROP_FALSE, PROP_TRUE }; - -struct GraphProperties { - bool directed{false}; - bool weighted{false}; - bool multigraph{false}; - bool bipartite{false}; - bool tree{false}; - PropType has_negative_edges{PropType::PROP_UNDEF}; - GraphProperties() = default; -}; - -enum class DegreeDirection { - IN_PLUS_OUT = 0, ///> Compute sum of in and out degree - IN, ///> Compute in degree - OUT, ///> Compute out degree - DEGREE_DIRECTION_COUNT -}; - -/** - * @brief Base class graphs, all but vertices and edges - * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight - */ -template -class GraphViewBase { - public: - WT *edge_data; ///< edge weight - /// Comm comm; // TODO: clarify what must be done about `comm` - - GraphProperties prop; - - VT number_of_vertices; - ET number_of_edges; - - /** - * @brief Fill the identifiers array with the vertex identifiers. - * - * @param[out] identifier Pointer to device memory to store the vertex - * identifiers - */ - void get_vertex_identifiers(VT *identifiers) const; - /// void set_communicator(Comm &comm_) { comm = comm_; } // TODO: see above - - GraphViewBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) - : edge_data(edge_data_), - /// comm(), // TODO: see above - prop(), - number_of_vertices(number_of_vertices_), - number_of_edges(number_of_edges_) {} - bool has_data(void) const { return edge_data != nullptr; } -}; - -/** - * @brief A graph stored in COO (COOrdinate) format. - * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight - */ -template -class GraphCOOView : public GraphViewBase { - public: - VT *src_indices{nullptr}; ///< rowInd - VT *dst_indices{nullptr}; ///< colInd - - /** - * @brief Computes degree(in, out, in+out) of all the nodes of a Graph - * - * @throws cugraph::logic_error when an error occurs. - * - * @param[out] degree Device array of size V (V is number of vertices) initialized - * to zeros. Will contain the computed degree of every vertex. - * @param[in] direction IN_PLUS_OUT, IN or OUT - */ - void degree(ET *degree, DegreeDirection direction) const; - - /** - * @brief Default constructor - */ - GraphCOOView() : GraphViewBase(nullptr, 0, 0) {} - - /** - * @brief Wrap existing arrays representing an edge list in a Graph. - * - * GraphCOOView does not own the memory used to represent this graph. This - * function does not allocate memory. - * - * @param source_indices This array of size E (number of edges) contains the index of the - * source for each edge. Indices must be in the range [0, V-1]. - * @param destination_indices This array of size E (number of edges) contains the index of the - * destination for each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array size E (number of edges) contains the weight for each - * edge. This array can be null in which case the graph is considered unweighted. 
- * @param number_of_vertices The number of vertices in the graph - * @param number_of_edges The number of edges in the graph - */ - GraphCOOView(VT *src_indices_, VT *dst_indices_, WT *edge_data_, - VT number_of_vertices_, ET number_of_edges_) - : GraphViewBase(edge_data_, number_of_vertices_, - number_of_edges_), - src_indices(src_indices_), - dst_indices(dst_indices_) {} -}; - -/** - * @brief Base class for graph stored in CSR (Compressed Sparse Row) format or CSC (Compressed - * Sparse Column) format - * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight - */ -template -class GraphCompressedSparseBaseView : public GraphViewBase { - public: - ET *offsets{nullptr}; ///< CSR offsets - VT *indices{nullptr}; ///< CSR indices - - /** - * @brief Fill the identifiers in the array with the source vertex - * identifiers - * - * @param[out] src_indices Pointer to device memory to store the - * source vertex identifiers - */ - void get_source_indices(VT *src_indices) const; - - /** - * @brief Computes degree(in, out, in+out) of all the nodes of a Graph - * - * @throws cugraph::logic_error when an error occurs. - * - * @param[out] degree Device array of size V (V is number of vertices) initialized - * to zeros. Will contain the computed degree of every vertex. - * @param[in] x Integer value indicating type of degree calculation - * 0 : in+out degree - * 1 : in-degree - * 2 : out-degree - */ - void degree(ET *degree, DegreeDirection direction) const; - - /** - * @brief Wrap existing arrays representing adjacency lists in a Graph. - * GraphCSRView does not own the memory used to represent this graph. This - * function does not allocate memory. - * - * @param offsets This array of size V+1 (V is number of vertices) contains the - * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of - * edges). - * @param indices This array of size E contains the index of the destination for - * each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for - * each edge. This array can be null in which case the graph is considered unweighted. - * @param number_of_vertices The number of vertices in the graph - * @param number_of_edges The number of edges in the graph - */ - GraphCompressedSparseBaseView(ET *offsets_, VT *indices_, WT *edge_data_, - VT number_of_vertices_, ET number_of_edges_) - : GraphViewBase(edge_data_, number_of_vertices_, - number_of_edges_), - offsets{offsets_}, - indices{indices_} {} -}; - -/** - * @brief A graph stored in CSR (Compressed Sparse Row) format. - * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight - */ -template -class GraphCSRView : public GraphCompressedSparseBaseView { - public: - /** - * @brief Default constructor - */ - GraphCSRView() - : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, - 0) {} - - /** - * @brief Wrap existing arrays representing adjacency lists in a Graph. - * GraphCSRView does not own the memory used to represent this graph. This - * function does not allocate memory. - * - * @param offsets This array of size V+1 (V is number of vertices) contains the - * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of - * edges). - * @param indices This array of size E contains the index of the destination for - * each edge. Indices must be in the range [0, V-1]. 
-
-/**
- * @brief A graph stored in CSR (Compressed Sparse Row) format.
- *
- * @tparam VT Type of vertex id
- * @tparam ET Type of edge id
- * @tparam WT Type of weight
- */
-template <typename VT, typename ET, typename WT>
-class GraphCSRView : public GraphCompressedSparseBaseView<VT, ET, WT> {
- public:
-  /**
-   * @brief Default constructor
-   */
-  GraphCSRView()
-    : GraphCompressedSparseBaseView<VT, ET, WT>(nullptr, nullptr, nullptr, 0,
-                                                0) {}
-
-  /**
-   * @brief Wrap existing arrays representing adjacency lists in a Graph.
-   * GraphCSRView does not own the memory used to represent this graph. This
-   * function does not allocate memory.
-   *
-   * @param offsets This array of size V+1 (V is number of vertices) contains the
-   * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of
-   * edges).
-   * @param indices This array of size E contains the index of the destination for
-   * each edge. Indices must be in the range [0, V-1].
-   * @param edge_data This array of size E (number of edges) contains the weight for
-   * each edge. This array can be null in which case the graph is considered unweighted.
-   * @param number_of_vertices The number of vertices in the graph
-   * @param number_of_edges The number of edges in the graph
-   */
-  GraphCSRView(ET *offsets_, VT *indices_, WT *edge_data_,
-               VT number_of_vertices_, ET number_of_edges_)
-    : GraphCompressedSparseBaseView<VT, ET, WT>(
-        offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) {
-  }
-};
-
-/**
- * @brief A graph stored in CSC (Compressed Sparse Column) format.
- *
- * @tparam VT Type of vertex id
- * @tparam ET Type of edge id
- * @tparam WT Type of weight
- */
-template <typename VT, typename ET, typename WT>
-class GraphCSCView : public GraphCompressedSparseBaseView<VT, ET, WT> {
- public:
-  /**
-   * @brief Default constructor
-   */
-  GraphCSCView()
-    : GraphCompressedSparseBaseView<VT, ET, WT>(nullptr, nullptr, nullptr, 0,
-                                                0) {}
-
-  /**
-   * @brief Wrap existing arrays representing transposed adjacency lists in a Graph.
-   * GraphCSCView does not own the memory used to represent this graph. This
-   * function does not allocate memory.
-   *
-   * @param offsets This array of size V+1 (V is number of vertices) contains the
-   * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of
-   * edges).
-   * @param indices This array of size E contains the index of the destination for
-   * each edge. Indices must be in the range [0, V-1].
-   * @param edge_data This array of size E (number of edges) contains the weight for
-   * each edge. This array can be null in which case the graph is considered unweighted.
-   * @param number_of_vertices The number of vertices in the graph
-   * @param number_of_edges The number of edges in the graph
-   */
-  GraphCSCView(ET *offsets_, VT *indices_, WT *edge_data_,
-               VT number_of_vertices_, ET number_of_edges_)
-    : GraphCompressedSparseBaseView<VT, ET, WT>(
-        offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) {
-  }
-};
-
-/**
- * @brief TODO: Change this. Takes ownership of the provided graph arrays in COO format
- *
- * @param source_indices This array of size E (number of edges) contains the index of the
- * source for each edge. Indices must be in the range [0, V-1].
- * @param destination_indices This array of size E (number of edges) contains the index of the
- * destination for each edge. Indices must be in the range [0, V-1].
- * @param edge_data This array of size E (number of edges) contains the weight for each
- * edge. This array can be null in which case the graph is considered unweighted.
- * @param number_of_vertices The number of vertices in the graph
- * @param number_of_edges The number of edges in the graph
- */
-template <typename VT, typename ET, typename WT>
-struct GraphCOOContents {
-  VT number_of_vertices;
-  ET number_of_edges;
-  std::unique_ptr<rmm::device_buffer> src_indices;
-  std::unique_ptr<rmm::device_buffer> dst_indices;
-  std::unique_ptr<rmm::device_buffer> edge_data;
-};
-
-/**
- * @brief A constructed graph stored in COO (COOrdinate) format.
- *
- * This class owns src_indices and dst_indices (until moved)
- *
- * @tparam VT Type of vertex id
- * @tparam ET Type of edge id
- * @tparam WT Type of weight
- */
-template <typename VT, typename ET, typename WT>
-class GraphCOO {
-  VT number_of_vertices_;
-  ET number_of_edges_;
-  rmm::device_buffer src_indices_{};  ///< rowInd
-  rmm::device_buffer dst_indices_{};  ///< colInd
-  rmm::device_buffer edge_data_{};    ///< COO edge weights
-
- public:
-  /**
-   * @brief Take ownership of the provided graph arrays in COO format
-   *
-   * @param source_indices This array of size E (number of edges) contains the index of the
-   * source for each edge.
Indices must be in the range [0, V-1].
-   * @param destination_indices This array of size E (number of edges) contains the index of the
-   * destination for each edge. Indices must be in the range [0, V-1].
-   * @param edge_data This array of size E (number of edges) contains the weight for each
-   * edge. This array can be null in which case the graph is considered unweighted.
-   * @param number_of_vertices The number of vertices in the graph
-   * @param number_of_edges The number of edges in the graph
-   */
-  GraphCOO(
-    VT number_of_vertices, ET number_of_edges, bool has_data = false,
-    cudaStream_t stream = nullptr,
-    rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource())
-    : number_of_vertices_(number_of_vertices),
-      number_of_edges_(number_of_edges),
-      src_indices_(sizeof(VT) * number_of_edges, stream, mr),
-      dst_indices_(sizeof(VT) * number_of_edges, stream, mr),
-      edge_data_((has_data ? sizeof(WT) * number_of_edges : 0), stream, mr) {}
-
-  GraphCOO(
-    GraphCOOView<VT, ET, WT> const &graph, cudaStream_t stream = nullptr,
-    rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource())
-    : number_of_vertices_(graph.number_of_vertices),
-      number_of_edges_(graph.number_of_edges),
-      src_indices_(graph.src_indices, graph.number_of_edges * sizeof(VT),
-                   stream, mr),
-      dst_indices_(graph.dst_indices, graph.number_of_edges * sizeof(VT),
-                   stream, mr) {
-    if (graph.has_data()) {
-      edge_data_ = rmm::device_buffer{
-        graph.edge_data, graph.number_of_edges * sizeof(WT), stream, mr};
-    }
-  }
-
-  VT number_of_vertices(void) { return number_of_vertices_; }
-  ET number_of_edges(void) { return number_of_edges_; }
-  VT *src_indices(void) { return static_cast<VT *>(src_indices_.data()); }
-  VT *dst_indices(void) { return static_cast<VT *>(dst_indices_.data()); }
-  WT *edge_data(void) { return static_cast<WT *>(edge_data_.data()); }
-
-  GraphCOOContents<VT, ET, WT> release() noexcept {
-    VT number_of_vertices = number_of_vertices_;
-    ET number_of_edges = number_of_edges_;
-    number_of_vertices_ = 0;
-    number_of_edges_ = 0;
-    return GraphCOOContents<VT, ET, WT>{
-      number_of_vertices, number_of_edges,
-      std::make_unique<rmm::device_buffer>(std::move(src_indices_)),
-      std::make_unique<rmm::device_buffer>(std::move(dst_indices_)),
-      std::make_unique<rmm::device_buffer>(std::move(edge_data_))};
-  }
-
-  GraphCOOView<VT, ET, WT> view(void) noexcept {
-    return GraphCOOView<VT, ET, WT>(src_indices(), dst_indices(), edge_data(),
-                                    number_of_vertices_, number_of_edges_);
-  }
-
-  bool has_data(void) { return nullptr != edge_data_.data(); }
-};
-
-template <typename VT, typename ET, typename WT>
-struct GraphSparseContents {
-  VT number_of_vertices;
-  ET number_of_edges;
-  std::unique_ptr<rmm::device_buffer> offsets;
-  std::unique_ptr<rmm::device_buffer> indices;
-  std::unique_ptr<rmm::device_buffer> edge_data;
-};
-
-/**
- * @brief Base class for constructed graphs stored in CSR (Compressed Sparse Row) format or
- * CSC (Compressed Sparse Column) format
- *
- * @tparam VT Type of vertex id
- * @tparam ET Type of edge id
- * @tparam WT Type of weight
- */
-template <typename VT, typename ET, typename WT>
-class GraphCompressedSparseBase {
-  VT number_of_vertices_{0};
-  ET number_of_edges_{0};
-  rmm::device_buffer offsets_{};    ///< CSR offsets
-  rmm::device_buffer indices_{};    ///< CSR indices
-  rmm::device_buffer edge_data_{};  ///< CSR data
-
-  bool has_data_{false};
-
- public:
-  /**
-   * @brief Take ownership of the provided graph arrays in CSR/CSC format
-   *
-   * @param offsets This array of size V+1 (V is number of vertices) contains the
-   * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of
-   * edges).
-   * @param indices This array of size E contains the index of the destination for
-   * each edge.
Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for - * each edge. This array can be null in which case the graph is considered unweighted. - * @param number_of_vertices The number of vertices in the graph - * @param number_of_edges The number of edges in the graph - */ - GraphCompressedSparseBase(VT number_of_vertices, ET number_of_edges, - bool has_data, cudaStream_t stream, - rmm::mr::device_memory_resource *mr) - : number_of_vertices_(number_of_vertices), - number_of_edges_(number_of_edges), - offsets_(sizeof(ET) * (number_of_vertices + 1), stream, mr), - indices_(sizeof(VT) * number_of_edges, stream, mr), - edge_data_((has_data ? sizeof(WT) * number_of_edges : 0), stream, mr) {} - - GraphCompressedSparseBase(GraphSparseContents &&contents) - : number_of_vertices_(contents.number_of_vertices), - number_of_edges_(contents.number_of_edges), - offsets_(std::move(*contents.offsets.release())), - indices_(std::move(*contents.indices.release())), - edge_data_(std::move(*contents.edge_data.release())) {} - - VT number_of_vertices(void) { return number_of_vertices_; } - ET number_of_edges(void) { return number_of_edges_; } - ET *offsets(void) { return static_cast(offsets_.data()); } - VT *indices(void) { return static_cast(indices_.data()); } - WT *edge_data(void) { return static_cast(edge_data_.data()); } - - GraphSparseContents release() noexcept { - VT number_of_vertices = number_of_vertices_; - ET number_of_edges = number_of_edges_; - number_of_vertices_ = 0; - number_of_edges_ = 0; - return GraphSparseContents{ - number_of_vertices, number_of_edges, - std::make_unique(std::move(offsets_)), - std::make_unique(std::move(indices_)), - std::make_unique(std::move(edge_data_))}; - } - - bool has_data(void) { return nullptr != edge_data_.data(); } -}; - -/** - * @brief A constructed graph stored in CSR (Compressed Sparse Row) format. - * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight - */ -template -class GraphCSR : public GraphCompressedSparseBase { - public: - /** - * @brief Default constructor - */ - GraphCSR() : GraphCompressedSparseBase() {} - - /** - * @brief Take ownership of the provided graph arrays in CSR format - * - * @param offsets This array of size V+1 (V is number of vertices) contains the - * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of - * edges). - * @param indices This array of size E contains the index of the destination for - * each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for - * each edge. This array can be null in which case the graph is considered unweighted. 
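To make the ownership contract above concrete, a minimal sketch placed here for proximity (V, E and stream are assumed names): the owning GraphCSR allocates its buffers, view() lends them out without transferring ownership, and release() moves them out as GraphSparseContents.

// Sketch only: allocate an owning CSR graph with weights, then hand the
// buffers off. The view never owns; release() transfers ownership out.
raft::matrix::GraphCSR<int, int, float> csr(V, E, /*has_data=*/true, stream);
auto v = csr.view();            // non-owning GraphCSRView over csr's buffers
// ... fill v.offsets (V+1), v.indices (E), v.edge_data (E) on the device ...
auto contents = csr.release();  // moves the three rmm::device_buffers out
// csr now reports 0 vertices/edges; contents.offsets et al. own the memory.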
-   * @param number_of_vertices The number of vertices in the graph
-   * @param number_of_edges The number of edges in the graph
-   */
-  GraphCSR(
-    VT number_of_vertices_, ET number_of_edges_, bool has_data_ = false,
-    cudaStream_t stream = nullptr,
-    rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource())
-    : GraphCompressedSparseBase<VT, ET, WT>(
-        number_of_vertices_, number_of_edges_, has_data_, stream, mr) {}
-
-  GraphCSR(GraphSparseContents<VT, ET, WT> &&contents)
-    : GraphCompressedSparseBase<VT, ET, WT>(std::move(contents)) {}
-
-  GraphCSRView<VT, ET, WT> view(void) noexcept {
-    return GraphCSRView<VT, ET, WT>(
-      GraphCompressedSparseBase<VT, ET, WT>::offsets(),
-      GraphCompressedSparseBase<VT, ET, WT>::indices(),
-      GraphCompressedSparseBase<VT, ET, WT>::edge_data(),
-      GraphCompressedSparseBase<VT, ET, WT>::number_of_vertices(),
-      GraphCompressedSparseBase<VT, ET, WT>::number_of_edges());
-  }
-};
-
-/**
- * @brief A constructed graph stored in CSC (Compressed Sparse Column) format.
- *
- * @tparam VT Type of vertex id
- * @tparam ET Type of edge id
- * @tparam WT Type of weight
- */
-template <typename VT, typename ET, typename WT>
-class GraphCSC : public GraphCompressedSparseBase<VT, ET, WT> {
- public:
-  /**
-   * @brief Default constructor
-   */
-  GraphCSC() : GraphCompressedSparseBase<VT, ET, WT>() {}
-
-  /**
-   * @brief Take ownership of the provided graph arrays in CSC format
-   *
-   * @param offsets This array of size V+1 (V is number of vertices) contains the
-   * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of
-   * edges).
-   * @param indices This array of size E contains the index of the destination for
-   * each edge. Indices must be in the range [0, V-1].
-   * @param edge_data This array of size E (number of edges) contains the weight for
-   * each edge. This array can be null in which case the graph is considered unweighted.
-   * @param number_of_vertices The number of vertices in the graph
-   * @param number_of_edges The number of edges in the graph
-   */
-  GraphCSC(
-    VT number_of_vertices_, ET number_of_edges_, bool has_data_ = false,
-    cudaStream_t stream = nullptr,
-    rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource())
-    : GraphCompressedSparseBase<VT, ET, WT>(
-        number_of_vertices_, number_of_edges_, has_data_, stream, mr) {}
-
-  GraphCSC(GraphSparseContents<VT, ET, WT> &&contents)
-    : GraphCompressedSparseBase<VT, ET, WT>(std::move(contents)) {}
-
-  GraphCSCView<VT, ET, WT> view(void) noexcept {
-    return GraphCSCView<VT, ET, WT>(
-      GraphCompressedSparseBase<VT, ET, WT>::offsets(),
-      GraphCompressedSparseBase<VT, ET, WT>::indices(),
-      GraphCompressedSparseBase<VT, ET, WT>::edge_data(),
-      GraphCompressedSparseBase<VT, ET, WT>::number_of_vertices(),
-      GraphCompressedSparseBase<VT, ET, WT>::number_of_edges());
-  }
-};
-
-}  // namespace matrix
-}  // namespace raft

From 87f6315d1d22a7cd7f5db5da15303f4bc1438ef9 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer
Date: Mon, 6 Jul 2020 12:04:35 -0500
Subject: [PATCH 148/189] Addressed code reviews on kmeans dox.

---
 cpp/include/raft/spectral/kmeans.hpp | 145 +++++++++++++++++----------
 1 file changed, 91 insertions(+), 54 deletions(-)

diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp
index ec5d1d67da..53a1b1278a 100644
--- a/cpp/include/raft/spectral/kmeans.hpp
+++ b/cpp/include/raft/spectral/kmeans.hpp
@@ -52,12 +52,14 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE);
 // CUDA kernels
 // =========================================================
 
-/// Compute distances between observation vectors and centroids
-/** Block dimensions should be (warpSize, 1,
- * blockSize/warpSize).
Ideally, the grid is large enough so there - * are d threads in the x-direction, k threads in the y-direction, - * and n threads in the z-direction. - * +/** + * @brief Compute distances between observation vectors and centroids + * Block dimensions should be (warpSize, 1, + * blockSize/warpSize). Ideally, the grid is large enough so there + * are d threads in the x-direction, k threads in the y-direction, + * and n threads in the z-direction. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -130,10 +132,12 @@ static __global__ void computeDistances( } } -/// Find closest centroid to observation vectors -/** Block and grid dimensions should be 1-dimensional. Ideally the - * grid is large enough so there are n threads. - * +/** + * @brief Find closest centroid to observation vectors. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. * @param n Number of observation vectors. * @param k Number of clusters. * @param centroids (Input, d*k entries) Centroid matrix. Matrix is @@ -188,10 +192,12 @@ static __global__ void minDistances(IndexType_ n, IndexType_ k, } } -/// Check if newly computed distances are smaller than old distances -/** Block and grid dimensions should be 1-dimensional. Ideally the - * grid is large enough so there are n threads. - * +/** + * @brief Check if newly computed distances are smaller than old distances. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. * @param n Number of observation vectors. * @param dists_old (Input/output, n entries) Distances between * observation vectors and closest centroids. On exit, entries @@ -236,10 +242,11 @@ static __global__ void minDistances2(IndexType_ n, } } -/// Compute size of k-means clusters -/** Block and grid dimensions should be 1-dimensional. Ideally the - * grid is large enough so there are n threads. - * +/** + * @brief Compute size of k-means clusters. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam Index_Type_ the type of data used for indexing. * @param n Number of observation vectors. * @param k Number of clusters. * @param codes (Input, n entries) Cluster assignments. @@ -257,15 +264,17 @@ static __global__ void computeClusterSizes( } } -/// Divide rows of centroid matrix by cluster sizes -/** Divides the ith column of the sum matrix by the size of the ith - * cluster. If the sum matrix has been initialized so that the ith - * row is the sum of all observation vectors in the ith cluster, - * this kernel produces cluster centroids. The grid and block - * dimensions should be 2-dimensional. Ideally the grid is large - * enough so there are d threads in the x-direction and k threads - * in the y-direction. - * +/** + * @brief Divide rows of centroid matrix by cluster sizes. + * Divides the ith column of the sum matrix by the size of the ith + * cluster. 
If the sum matrix has been initialized so that the ith + * row is the sum of all observation vectors in the ith cluster, + * this kernel produces cluster centroids. The grid and block + * dimensions should be 2-dimensional. Ideally the grid is large + * enough so there are d threads in the x-direction and k threads + * in the y-direction. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. * @param d Dimension of observation vectors. * @param k Number of clusters. * @param clusterSizes (Input, k entries) Number of points in each @@ -309,9 +318,13 @@ static __global__ void divideCentroids( // Helper functions // ========================================================= -/// Randomly choose new centroids -/** Centroid is randomly chosen with k-means++ algorithm. - * +/** + * @brief Randomly choose new centroids. + * Centroid is randomly chosen with k-means++ algorithm. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. + * @tparam ThrustExePolicy the type of thrust execution policy. + * @param handle the raft handle. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -393,9 +406,14 @@ static int chooseNewCentroid(handle_t const& handle, return 0; } -/// Choose initial cluster centroids for k-means algorithm -/** Centroids are randomly chosen with k-means++ algorithm - * +/** + * @brief Choose initial cluster centroids for k-means algorithm. + * Centroids are randomly chosen with k-means++ algorithm + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. + * @tparam ThrustExePolicy the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -498,9 +516,14 @@ static int initializeCentroids( return 0; } -/// Find cluster centroids closest to observation vectors -/** Distance is measured with Euclidean norm. - * +/** + * @brief Find cluster centroids closest to observation vectors. + * Distance is measured with Euclidean norm. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. + * @tparam ThrustExePolicy the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -566,9 +589,14 @@ static int assignCentroids( return 0; } -/// Update cluster centroids for k-means algorithm -/** All clusters are assumed to be non-empty. - * +/** + * @brief Update cluster centroids for k-means algorithm. + * All clusters are assumed to be non-empty. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. + * @tparam ThrustExePolicy the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. 
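The launch convention these kernel docs keep describing can be written down once. A sketch of it (editorial, not part of the patch; d, k, n, stream, obs, centroids and dists are assumed to be in scope, with WARP_SIZE, BLOCK_SIZE and BSIZE_DIV_WSIZE as defined at the top of this file):

// Blocks of (warpSize, 1, blockSize/warpSize); the grid tiles d x k x n,
// with each extent clamped to 65535 as done throughout this file. Kernels
// stride past the clamp, so clamped grids still cover all elements.
dim3 blockDim{WARP_SIZE, 1, BSIZE_DIV_WSIZE};
dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, 65535),
             min(k, 65535),
             min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535)};
computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, k, obs, centroids,
                                                   dists);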
@@ -671,11 +699,16 @@ namespace raft { // k-means algorithm // ========================================================= -/// Find clusters with k-means algorithm -/** Initial centroids are chosen with k-means++ algorithm. Empty - * clusters are reinitialized by choosing new centroids with - * k-means++ algorithm. - * +/** + * @brief Find clusters with k-means algorithm. + * Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. + * @tparam ThrustExePolicy the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -701,6 +734,7 @@ namespace raft { * vectors and centroids). * @param iters_host (Output, host memory, 1 entry) Number of * k-means iterations. + * @param seed random seed to be used. * @return error flag. */ template @@ -778,8 +812,7 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, // Initialize cuBLAS CUBLAS_CHECK( - linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, - stream)); // ????? TODO: check / remove + linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // k-means++ algorithm @@ -844,13 +877,16 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, return 0; } -/// Find clusters with k-means algorithm -/** Initial centroids are chosen with k-means++ algorithm. Empty - * clusters are reinitialized by choosing new centroids with - * k-means++ algorithm. - * - * CNMEM must be initialized before calling this function. - * +/** + * @brief Find clusters with k-means algorithm. + * Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. + * @tparam ThrustExePolicy the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -864,7 +900,8 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, * assignments. * @param residual On exit, residual sum of squares (sum of squares * of distances between observation vectors and centroids). - * @param On exit, number of k-means iterations. + * @param iters on exit, number of k-means iterations. + * @param seed random seed to be used. * @return error flag */ template From 98d7af62d07ff7a665ff4ddf90e813b293dc14fb Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 6 Jul 2020 12:19:14 -0500 Subject: [PATCH 149/189] Addressed code reviews on kmeans lowercase_t types. 
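For orientation before the renaming hunks below, a hypothetical call to the public entry point whose documentation the previous patch cleaned up. This is a sketch only: d_obs (n column-major observation vectors of dimension d) and d_codes (n entries) are assumed device pointers, and float/int specializations are assumed.

raft::handle_t handle;
float residual{0};
int iters{0};
// Returns 0 on success; residual receives the final sum of squared
// distances and iters the number of k-means iterations performed.
int rc = raft::kmeans(handle, thrust::cuda::par.on(handle.get_stream()),
                      n, d, k, /*tol=*/1e-4f, /*maxiter=*/100,
                      d_obs, d_codes, residual, iters);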
--- cpp/include/raft/spectral/kmeans.hpp | 276 ++++++++++++++------------- 1 file changed, 143 insertions(+), 133 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 53a1b1278a..07c8748e1a 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -59,7 +59,7 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); * are d threads in the x-direction, k threads in the y-direction, * and n threads in the z-direction. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. + * @tparam value_type_t the type of data used for weights, distances. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -75,20 +75,22 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); * centroid. Matrix dimensions are n x k. Entries must be * initialized to zero. */ -template +template static __global__ void computeDistances( - IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, - const ValueType_* __restrict__ centroids, ValueType_* __restrict__ dists) { + index_type_t n, index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) { // Loop index - IndexType_ i; + index_type_t i; // Block indices - IndexType_ bidx; + index_type_t bidx; // Global indices - IndexType_ gidx, gidy, gidz; + index_type_t gidx, gidy, gidz; // Private memory - ValueType_ centroid_private, dist_private; + value_type_t centroid_private, dist_private; // Global x-index indicates index of vector entry bidx = blockIdx.x; @@ -137,7 +139,7 @@ static __global__ void computeDistances( * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. + * @tparam value_type_t the type of data used for weights, distances. * @param n Number of observation vectors. * @param k Number of clusters. * @param centroids (Input, d*k entries) Centroid matrix. Matrix is @@ -153,20 +155,20 @@ static __global__ void computeDistances( * @param clusterSizes (Output, k entries) Number of points in each * cluster. Entries must be initialized to zero. */ -template -static __global__ void minDistances(IndexType_ n, IndexType_ k, - ValueType_* __restrict__ dists, - IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes) { +template +static __global__ void minDistances(index_type_t n, index_type_t k, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) { // Loop index - IndexType_ i, j; + index_type_t i, j; // Current matrix entry - ValueType_ dist_curr; + value_type_t dist_curr; // Smallest entry in row - ValueType_ dist_min; - IndexType_ code_min; + value_type_t dist_min; + index_type_t code_min; // Each row in observation matrix is processed by a thread i = threadIdx.x + blockIdx.x * blockDim.x; @@ -197,7 +199,7 @@ static __global__ void minDistances(IndexType_ n, IndexType_ k, * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. 
+ * @tparam value_type_t the type of data used for weights, distances. * @param n Number of observation vectors. * @param dists_old (Input/output, n entries) Distances between * observation vectors and closest centroids. On exit, entries @@ -211,18 +213,18 @@ static __global__ void minDistances(IndexType_ n, IndexType_ k, * centroid. * @param code_new Index associated with new centroid. */ -template -static __global__ void minDistances2(IndexType_ n, - ValueType_* __restrict__ dists_old, - const ValueType_* __restrict__ dists_new, - IndexType_* __restrict__ codes_old, - IndexType_ code_new) { +template +static __global__ void minDistances2(index_type_t n, + value_type_t* __restrict__ dists_old, + const value_type_t* __restrict__ dists_new, + index_type_t* __restrict__ codes_old, + index_type_t code_new) { // Loop index - IndexType_ i; + index_type_t i; // Distances - ValueType_ dist_old_private; - ValueType_ dist_new_private; + value_type_t dist_old_private; + value_type_t dist_new_private; // Each row is processed by a thread i = threadIdx.x + blockIdx.x * blockDim.x; @@ -253,11 +255,11 @@ static __global__ void minDistances2(IndexType_ n, * @param clusterSizes (Output, k entries) Number of points in each * cluster. Entries must be initialized to zero. */ -template +template static __global__ void computeClusterSizes( - IndexType_ n, IndexType_ k, const IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes) { - IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; + index_type_t n, index_type_t k, const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) { + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { atomicAdd(clusterSizes + codes[i], 1); i += blockDim.x * gridDim.x; @@ -274,7 +276,7 @@ static __global__ void computeClusterSizes( * enough so there are d threads in the x-direction and k threads * in the y-direction. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. + * @tparam value_type_t the type of data used for weights, distances. * @param d Dimension of observation vectors. * @param k Number of clusters. * @param clusterSizes (Input, k entries) Number of points in each @@ -285,15 +287,15 @@ static __global__ void computeClusterSizes( * cluster. On exit, the matrix is the centroid matrix (each * column is the mean position of a cluster). */ -template +template static __global__ void divideCentroids( - IndexType_ d, IndexType_ k, const IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ centroids) { + index_type_t d, index_type_t k, const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids) { // Global indices - IndexType_ gidx, gidy; + index_type_t gidx, gidy; // Current cluster size - IndexType_ clusterSize_private; + index_type_t clusterSize_private; // Observation vector is determined by global y-index gidy = threadIdx.y + blockIdx.y * blockDim.y; @@ -322,8 +324,8 @@ static __global__ void divideCentroids( * @brief Randomly choose new centroids. * Centroid is randomly chosen with k-means++ algorithm. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. - * @tparam ThrustExePolicy the type of thrust execution policy. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. 
* @param n Number of observation vectors. * @param d Dimension of observation vectors. @@ -339,19 +341,21 @@ static __global__ void divideCentroids( * coordinates. * @return Zero if successful. Otherwise non-zero. */ -template +template static int chooseNewCentroid(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, IndexType_ n, - IndexType_ d, IndexType_ k, ValueType_ rand, - const ValueType_* __restrict__ obs, - ValueType_* __restrict__ dists, - ValueType_* __restrict__ centroid) { + thrust_exe_pol_t thrust_exec_policy, + index_type_t n, index_type_t d, index_type_t k, + value_type_t rand, + const value_type_t* __restrict__ obs, + value_type_t* __restrict__ dists, + value_type_t* __restrict__ centroid) { // Cumulative sum of distances - ValueType_* distsCumSum = dists + n; + value_type_t* distsCumSum = dists + n; // Residual sum of squares - ValueType_ distsSum{0}; + value_type_t distsSum{0}; // Observation vector that is chosen as new centroid - IndexType_ obsIndex; + index_type_t obsIndex; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -361,7 +365,7 @@ static int chooseNewCentroid(handle_t const& handle, thrust::device_pointer_cast(dists + n), thrust::device_pointer_cast(distsCumSum)); CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(ValueType_), + CUDA_TRY(cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost)); // Randomly choose observation vector @@ -382,13 +386,13 @@ static int chooseNewCentroid(handle_t const& handle, // //linear interpolation logic: //{ - ValueType_ minSum{0}; - CUDA_TRY(cudaMemcpy(&minSum, distsCumSum, sizeof(ValueType_), + value_type_t minSum{0}; + CUDA_TRY(cudaMemcpy(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost)); if (distsSum > minSum) { - ValueType_ vIndex = static_cast(n - 1); - obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / - (distsSum - minSum)); + value_type_t vIndex = static_cast(n - 1); + obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / + (distsSum - minSum)); } else { obsIndex = 0; } @@ -400,7 +404,7 @@ static int chooseNewCentroid(handle_t const& handle, // Record new centroid position CUDA_TRY(cudaMemcpyAsync(centroid, obs + IDX(0, obsIndex, d), - d * sizeof(ValueType_), cudaMemcpyDeviceToDevice, + d * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); return 0; @@ -410,8 +414,8 @@ static int chooseNewCentroid(handle_t const& handle, * @brief Choose initial cluster centroids for k-means algorithm. * Centroids are randomly chosen with k-means++ algorithm * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. - * @tparam ThrustExePolicy the type of thrust execution policy. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. @@ -432,26 +436,27 @@ static int chooseNewCentroid(handle_t const& handle, * distance between observation vectors and the closest centroid. * @return Zero if successful. Otherwise non-zero. 
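The selection rule implemented by chooseNewCentroid and used in the initialization above samples a point with probability proportional to its squared distance from the nearest centroid. A host-side sketch of that rule, independent of the CUDA code (names are illustrative only):

#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

// dists[i]: squared distance of point i to its closest centroid so far.
// Returns the index of the next centroid, with P(i) proportional to dists[i].
int choose_next_centroid(const std::vector<double>& dists, std::mt19937& rng) {
  std::vector<double> cumsum(dists.size());
  std::partial_sum(dists.begin(), dists.end(), cumsum.begin());
  std::uniform_real_distribution<double> unif(0.0, cumsum.back());
  const double r = unif(rng);
  // First point whose cumulative mass reaches r.
  return static_cast<int>(
    std::lower_bound(cumsum.begin(), cumsum.end(), r) - cumsum.begin());
}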
*/ -template +template static int initializeCentroids( - handle_t const& handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, - IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, - ValueType_* __restrict__ centroids, IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes, ValueType_* __restrict__ dists, + handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, + index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, + value_type_t* __restrict__ centroids, index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ dists, unsigned long long seed) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- // Loop index - IndexType_ i; + index_type_t i; // CUDA grid dimensions dim3 blockDim_warp, gridDim_warp, gridDim_block; // Random number generator thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution uniformDist(0, 1); + thrust::uniform_real_distribution uniformDist(0, 1); auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -472,7 +477,7 @@ static int initializeCentroids( gridDim_block.z = 1; // Assign observation vectors to code 0 - CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); + CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); // Choose first centroid thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), @@ -483,7 +488,7 @@ static int initializeCentroids( WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid - CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_), stream)); + CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); computeDistances<<>>( n, d, 1, obs, centroids, dists); CHECK_CUDA(stream); @@ -496,7 +501,7 @@ static int initializeCentroids( WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid - CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(ValueType_), stream)); + CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); computeDistances<<>>( n, d, 1, obs, centroids + IDX(0, i, d), dists + n); CHECK_CUDA(stream); @@ -508,7 +513,7 @@ static int initializeCentroids( } // Compute cluster sizes - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); computeClusterSizes<<>>(n, k, codes, clusterSizes); CHECK_CUDA(stream); @@ -520,8 +525,8 @@ static int initializeCentroids( * @brief Find cluster centroids closest to observation vectors. * Distance is measured with Euclidean norm. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. - * @tparam ThrustExePolicy the type of thrust execution policy. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. @@ -544,13 +549,14 @@ static int initializeCentroids( * of squares of assignment. * @return Zero if successful. Otherwise non-zero. 
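The residual this function reports is obtained by summing the n per-point minimum distances written by minDistances. In isolation that final step is a single reduction; a sketch (not verbatim from this file; a float specialization is assumed, with dists a device pointer and thrust_exec_policy a stream-bound policy as elsewhere in this code):

// Sum of squared distances between each observation and its assigned
// centroid, accumulated on the device.
float residual = thrust::reduce(thrust_exec_policy,
                                thrust::device_pointer_cast(dists),
                                thrust::device_pointer_cast(dists + n));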
*/ -template +template static int assignCentroids( - handle_t const& handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, - IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, - const ValueType_* __restrict__ centroids, ValueType_* __restrict__ dists, - IndexType_* __restrict__ codes, IndexType_* __restrict__ clusterSizes, - ValueType_* residual_host) { + handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, + index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) { // CUDA grid dimensions dim3 blockDim, gridDim; @@ -558,7 +564,7 @@ static int assignCentroids( auto stream = handle.get_stream(); // Compute distance between centroids and observation vectors - CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(ValueType_), stream)); + CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); blockDim.x = WARP_SIZE; blockDim.y = 1; blockDim.z = BLOCK_SIZE / WARP_SIZE; @@ -570,7 +576,7 @@ static int assignCentroids( CHECK_CUDA(stream); // Find centroid closest to each observation vector - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); blockDim.x = BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; @@ -593,8 +599,8 @@ static int assignCentroids( * @brief Update cluster centroids for k-means algorithm. * All clusters are assumed to be non-empty. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. - * @tparam ThrustExePolicy the type of thrust execution policy. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. @@ -615,23 +621,24 @@ static int assignCentroids( * Workspace. * @return Zero if successful. Otherwise non-zero. 
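The update strategy documented above (transpose the observation matrix, tag every entry with its cluster code, sort, then add entries belonging to the same cluster) reduces to thrust's sort_by_key plus reduce_by_key. A toy, self-contained illustration of that pattern (values invented for the example):

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <vector>

void cluster_sum_demo() {
  std::vector<int> h_codes{1, 0, 1, 0};  // cluster code of each matrix entry
  std::vector<float> h_vals{2.f, 1.f, 4.f, 3.f};
  thrust::device_vector<int> codes(h_codes.begin(), h_codes.end());
  thrust::device_vector<float> vals(h_vals.begin(), h_vals.end());
  // Group entries by cluster, then add entries within each cluster.
  thrust::sort_by_key(codes.begin(), codes.end(), vals.begin());
  thrust::device_vector<int> keys(2);
  thrust::device_vector<float> sums(2);
  thrust::reduce_by_key(codes.begin(), codes.end(), vals.begin(),
                        keys.begin(), sums.begin());  // sums = {4, 6}
}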
*/ -template +template static int updateCentroids(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, IndexType_ n, - IndexType_ d, IndexType_ k, - const ValueType_* __restrict__ obs, - const IndexType_* __restrict__ codes, - const IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ centroids, - ValueType_* __restrict__ work, - IndexType_* __restrict__ work_int) { + thrust_exe_pol_t thrust_exec_policy, index_type_t n, + index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, + const index_type_t* __restrict__ codes, + const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids, + value_type_t* __restrict__ work, + index_type_t* __restrict__ work_int) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- // Useful constants - const ValueType_ one = 1; - const ValueType_ zero = 0; + const value_type_t one = 1; + const value_type_t zero = 0; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -640,21 +647,21 @@ static int updateCentroids(handle_t const& handle, dim3 blockDim, gridDim; // Device memory - thrust::device_ptr obs_copy(work); - thrust::device_ptr codes_copy(work_int); - thrust::device_ptr rows(work_int + d * n); + thrust::device_ptr obs_copy(work); + thrust::device_ptr codes_copy(work_int); + thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, n, d, &one, obs, - d, &zero, (ValueType_*)NULL, n, + d, &zero, (value_type_t*)NULL, n, thrust::raw_pointer_cast(obs_copy), n, stream)); // Cluster assigned to each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); CHECK_CUDA(stream); thrust::transform(thrust_exec_policy, rows, rows + d * n, - thrust::make_constant_iterator(n), rows, - thrust::modulus()); + thrust::make_constant_iterator(n), rows, + thrust::modulus()); CHECK_CUDA(stream); thrust::gather(thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); @@ -664,8 +671,8 @@ static int updateCentroids(handle_t const& handle, thrust::sequence(thrust_exec_policy, rows, rows + d * n); CHECK_CUDA(stream); thrust::transform(thrust_exec_policy, rows, rows + d * n, - thrust::make_constant_iterator(n), rows, - thrust::divides()); + thrust::make_constant_iterator(n), rows, + thrust::divides()); CHECK_CUDA(stream); // Sort and reduce to add observation vectors in same cluster @@ -705,8 +712,8 @@ namespace raft { * clusters are reinitialized by choosing new centroids with * k-means++ algorithm. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. - * @tparam ThrustExePolicy the type of thrust execution policy. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. @@ -737,28 +744,30 @@ namespace raft { * @param seed random seed to be used. * @return error flag. 
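In the notation of these parameters, the residual reported through residual_host is the usual k-means objective, evaluated at the final assignment:

\[ r \;=\; \sum_{i=1}^{n} \min_{1 \le j \le k} \lVert x_i - c_j \rVert_2^2, \]

where the \(x_i\) are the observation vectors (columns of obs) and the \(c_j\) are the centroid columns.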
*/ -template -int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, - IndexType_ n, IndexType_ d, IndexType_ k, ValueType_ tol, - IndexType_ maxiter, const ValueType_* __restrict__ obs, - IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ centroids, ValueType_* __restrict__ work, - IndexType_* __restrict__ work_int, ValueType_* residual_host, - IndexType_* iters_host, unsigned long long seed) { +template +int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, + index_type_t n, index_type_t d, index_type_t k, value_type_t tol, + index_type_t maxiter, const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids, + value_type_t* __restrict__ work, index_type_t* __restrict__ work_int, + value_type_t* residual_host, index_type_t* iters_host, + unsigned long long seed) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- // Current iteration - IndexType_ iter; + index_type_t iter; // Residual sum of squares at previous iteration - ValueType_ residualPrev = 0; + value_type_t residualPrev = 0; // Random number generator thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution uniformDist(0, 1); + thrust::uniform_real_distribution uniformDist(0, 1); // ------------------------------------------------------- // Initialization @@ -769,8 +778,8 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, // Trivial cases if (k == 1) { - CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); - CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(IndexType_), + CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); + CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) @@ -783,7 +792,7 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, gridDim.y = 1; gridDim.z = min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535); - CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(ValueType_), stream)); + CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); computeDistances<<>>(n, d, 1, obs, centroids, work); CHECK_CUDA(stream); @@ -803,8 +812,8 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, if (n < k) CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, - (k - n) * sizeof(IndexType_), stream)); - CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(ValueType_), + (k - n) * sizeof(index_type_t), stream)); + CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); *residual_host = 0; return 0; @@ -837,7 +846,7 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, WARNING("could not assign observation vectors to k-means clusters"); // Reinitialize empty clusters with new centroids - IndexType_ emptyCentroid = + index_type_t emptyCentroid = (thrust::find(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), thrust::device_pointer_cast(clusterSizes + k), 0) - @@ -883,8 +892,8 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, * clusters are reinitialized by choosing new centroids with * k-means++ algorithm. 
* @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. - * @tparam ThrustExePolicy the type of thrust execution policy. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. @@ -904,12 +913,13 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, * @param seed random seed to be used. * @return error flag */ -template -int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, - IndexType_ n, IndexType_ d, IndexType_ k, ValueType_ tol, - IndexType_ maxiter, const ValueType_* __restrict__ obs, - IndexType_* __restrict__ codes, ValueType_& residual, - IndexType_& iters, unsigned long long seed = 123456) { +template +int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, + index_type_t n, index_type_t d, index_type_t k, value_type_t tol, + index_type_t maxiter, const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, value_type_t& residual, + index_type_t& iters, unsigned long long seed = 123456) { using namespace matrix; // Check that parameters are valid @@ -920,13 +930,13 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, RAFT_EXPECTS(maxiter >= 0, "invalid parameter (maxiter<0)"); // Allocate memory - vector_t clusterSizes(handle, k); - vector_t centroids(handle, d * k); - vector_t work(handle, n * max(k, d)); - vector_t work_int(handle, 2 * d * n); + vector_t clusterSizes(handle, k); + vector_t centroids(handle, d * k); + vector_t work(handle, n * max(k, d)); + vector_t work_int(handle, 2 * d * n); // Perform k-means - return kmeans( + return kmeans( handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes, clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual, &iters, seed); From 7c79256a614b1e4a7e72f9788404858dc99c6e6f Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 6 Jul 2020 12:31:50 -0500 Subject: [PATCH 150/189] Addressed code reviews on kmeans cudaMemcpyAsync(). 
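The change below replaces blocking cudaMemcpy calls with stream-ordered cudaMemcpyAsync. The general pattern, as a sketch (d_value and use_on_host are hypothetical names; the error-checking macro is illustrative): a device-to-host copy queued on a stream is only complete, and the host variable only defined, after the stream has been synchronized.

float h_value{0};
CUDA_TRY(cudaMemcpyAsync(&h_value, d_value, sizeof(float),
                         cudaMemcpyDeviceToHost, stream));
CUDA_TRY(cudaStreamSynchronize(stream));  // h_value is valid only after this
use_on_host(h_value);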
--- cpp/include/raft/spectral/kmeans.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 07c8748e1a..e5c0876211 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -365,8 +365,8 @@ static int chooseNewCentroid(handle_t const& handle, thrust::device_pointer_cast(dists + n), thrust::device_pointer_cast(distsCumSum)); CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(value_type_t), - cudaMemcpyDeviceToHost)); + CUDA_TRY(cudaMemcpyAsync(&distsSum, distsCumSum + n - 1, sizeof(value_type_t), + cudaMemcpyDeviceToHost, stream)); // Randomly choose observation vector // Probabilities are proportional to square of distance to closest @@ -387,8 +387,10 @@ static int chooseNewCentroid(handle_t const& handle, //linear interpolation logic: //{ value_type_t minSum{0}; - CUDA_TRY(cudaMemcpy(&minSum, distsCumSum, sizeof(value_type_t), - cudaMemcpyDeviceToHost)); + CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), + cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + if (distsSum > minSum) { value_type_t vIndex = static_cast(n - 1); obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / From 283fa0b3c6c83604d2b31f953f2eda1e4820c5c9 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 6 Jul 2020 17:20:58 -0500 Subject: [PATCH 151/189] Addressed code reviews on kmeans use of dim3{} cnstr. --- cpp/include/raft/spectral/kmeans.hpp | 72 +++++++++++++--------------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index e5c0876211..c5c5e88b88 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -453,9 +453,6 @@ static int initializeCentroids( // Loop index index_type_t i; - // CUDA grid dimensions - dim3 blockDim_warp, gridDim_warp, gridDim_block; - // Random number generator thrust::default_random_engine rng(seed); thrust::uniform_real_distribution uniformDist(0, 1); @@ -468,15 +465,14 @@ static int initializeCentroids( // ------------------------------------------------------- // Initialize grid dimensions - blockDim_warp.x = WARP_SIZE; - blockDim_warp.y = 1; - blockDim_warp.z = BSIZE_DIV_WSIZE; - gridDim_warp.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim_warp.y = 1; - gridDim_warp.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); - gridDim_block.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - gridDim_block.y = 1; - gridDim_block.z = 1; + dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; + + // CUDA grid dimensions + dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, 65535), 1, + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535)}; + + // CUDA grid dimensions + dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; // Assign observation vectors to code 0 CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); @@ -559,20 +555,22 @@ static int assignCentroids( const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, value_type_t* residual_host) { - // CUDA grid dimensions - dim3 blockDim, gridDim; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); // Compute distance between centroids and observation vectors CUDA_TRY(cudaMemsetAsync(dists, 0, n * k 
* sizeof(value_type_t), stream)); - blockDim.x = WARP_SIZE; - blockDim.y = 1; - blockDim.z = BLOCK_SIZE / WARP_SIZE; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim.y = min(k, 65535); - gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); + + // CUDA grid dimensions + dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; + + dim3 gridDim; + constexpr index_type_t grid_lower_bound{65535}; + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); + gridDim.y = min(k, grid_lower_bound); + gridDim.z = + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); + computeDistances<<>>(n, d, k, obs, centroids, dists); CHECK_CUDA(stream); @@ -645,9 +643,6 @@ static int updateCentroids(handle_t const& handle, auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); - // CUDA grid dimensions - dim3 blockDim, gridDim; - // Device memory thrust::device_ptr obs_copy(work); thrust::device_ptr codes_copy(work_int); @@ -687,12 +682,14 @@ static int updateCentroids(handle_t const& handle, CHECK_CUDA(stream); // Divide sums by cluster size to get centroid matrix - blockDim.x = WARP_SIZE; - blockDim.y = BLOCK_SIZE / WARP_SIZE; - blockDim.z = 1; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim.y = min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); - gridDim.z = 1; + // + // CUDA grid dimensions + dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; + + // CUDA grid dimensions + dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, 65535), + min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535), 1}; + divideCentroids<<>>(d, k, clusterSizes, centroids); CHECK_CUDA(stream); @@ -786,14 +783,13 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not compute k-means centroids"); - dim3 blockDim, gridDim; - blockDim.x = WARP_SIZE; - blockDim.y = 1; - blockDim.z = BLOCK_SIZE / WARP_SIZE; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim.y = 1; - gridDim.z = - min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535); + + dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; + + dim3 gridDim{ + min((d + WARP_SIZE - 1) / WARP_SIZE, 65535), 1, + min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535)}; + CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); computeDistances<<>>(n, d, 1, obs, centroids, work); From 4ff50686fa50d4a4ba4dc0b225aca11a3eb20433 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 6 Jul 2020 17:32:50 -0500 Subject: [PATCH 152/189] Addressed code reviews on lowercase_t type names. --- cpp/include/raft/spectral/lanczos.hpp | 376 +++++++++++++------------- 1 file changed, 189 insertions(+), 187 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 8aa615c25d..796369abc0 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -81,29 +81,30 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, * Workspace. Not needed if full reorthogonalization is disabled. * @return Zero if successful. Otherwise non-zero. 
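One reason the grid extents in the dim3 patch above can be clamped (to 65535) at all is that every kernel in this file iterates with a grid-stride loop rather than assuming one element per thread. A self-contained CUDA sketch of that pattern (the kernel itself is invented for illustration):

__global__ void scale_kernel(int n, float a, float* __restrict__ x) {
  // Each thread handles indices i, i + stride, i + 2*stride, ..., so any
  // grid size, clamped or not, covers all n elements.
  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n;
       i += blockDim.x * gridDim.x) {
    x[i] *= a;
  }
}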
*/ -template +template int performLanczosIteration( - handle_t const &handle, sparse_matrix_t const *A, - IndexType_ *iter, IndexType_ maxIter, ValueType_ shift, ValueType_ tol, - bool reorthogonalize, ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev) { + handle_t const &handle, sparse_matrix_t const *A, + index_type_t *iter, index_type_t maxIter, value_type_t shift, + value_type_t tol, bool reorthogonalize, value_type_t *__restrict__ alpha_host, + value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful variables - const ValueType_ one = 1; - const ValueType_ negOne = -1; - const ValueType_ zero = 0; - ValueType_ alpha; + const value_type_t one = 1; + const value_type_t negOne = -1; + const value_type_t zero = 0; + value_type_t alpha; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); - IndexType_ n = A->nrows_; + index_type_t n = A->nrows_; // ------------------------------------------------------- // Compute second Lanczos vector @@ -114,8 +115,8 @@ int performLanczosIteration( // Apply matrix if (shift != 0) CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, - n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, - stream)); + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, stream)); A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); // Orthogonalize Lanczos vector @@ -149,7 +150,7 @@ int performLanczosIteration( if (shift != 0) CUDA_TRY(cudaMemcpyAsync( lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, - n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); @@ -165,7 +166,7 @@ int performLanczosIteration( lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1), - sizeof(ValueType_), cudaMemcpyDeviceToHost, + sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); CUBLAS_CHECK(cublasgemv( @@ -228,8 +229,9 @@ int performLanczosIteration( * @param P (Output, host memory, 9 entries) Householder transform * matrix. Matrix dimensions are 3 x 3. 
*/ -template -static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) { +template +static void findHouseholder3(value_type_t *v, value_type_t *Pv, + value_type_t *P) { // Compute norm of vector *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); @@ -239,7 +241,7 @@ static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) { v[0] -= *Pv; // Normalize Householder vector - ValueType_ normHouseholder = + value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); if (normHouseholder != 0) { v[0] /= normHouseholder; @@ -252,7 +254,7 @@ static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) { } // Construct Householder matrix - IndexType_ i, j; + index_type_t i, j; for (j = 0; j < 3; ++j) for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j]; for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1; @@ -267,12 +269,12 @@ static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) { * @param v (Input, host memory, 3 entries) Householder vector. * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. */ -template -static void applyHouseholder3(const ValueType_ *v, ValueType_ *A) { +template +static void applyHouseholder3(const value_type_t *v, value_type_t *A) { // Loop indices - IndexType_ i, j; + index_type_t i, j; // Dot product between Householder vector and matrix row/column - ValueType_ vDotA; + value_type_t vDotA; // Pre-apply Householder transform for (j = 0; j < 4; ++j) { @@ -307,31 +309,31 @@ static void applyHouseholder3(const ValueType_ *v, ValueType_ *A) { * @param work (Output, host memory, 3*n entries) Workspace. * @return Zero if successful. Otherwise non-zero. */ -template -static int francisQRIteration(IndexType_ n, ValueType_ shift1, - ValueType_ shift2, ValueType_ *alpha, - ValueType_ *beta, ValueType_ *V, - ValueType_ *work) { +template +static int francisQRIteration(index_type_t n, value_type_t shift1, + value_type_t shift2, value_type_t *alpha, + value_type_t *beta, value_type_t *V, + value_type_t *work) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Temporary storage of 4x4 bulge and Householder vector - ValueType_ bulge[16]; + value_type_t bulge[16]; // Householder vector - ValueType_ householder[3]; + value_type_t householder[3]; // Householder matrix - ValueType_ householderMatrix[3 * 3]; + value_type_t householderMatrix[3 * 3]; // Shifts are roots of the polynomial p(x)=x^2+b*x+c - ValueType_ b = -shift1 - shift2; - ValueType_ c = shift1 * shift2; + value_type_t b = -shift1 - shift2; + value_type_t c = shift1 * shift2; // Loop indices - IndexType_ i, j, pos; + index_type_t i, j, pos; // Temporary variable - ValueType_ temp; + value_type_t temp; // ------------------------------------------------------- // Implementation @@ -341,20 +343,20 @@ static int francisQRIteration(IndexType_ n, ValueType_ shift1, householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; householder[1] = beta[0] * (alpha[0] + alpha[1] + b); householder[2] = beta[0] * beta[1]; - findHouseholder3(householder, &temp, - householderMatrix); + findHouseholder3(householder, &temp, + householderMatrix); // Apply initial Householder transform to create bulge - memset(bulge, 0, 16 * sizeof(ValueType_)); + memset(bulge, 0, 16 * sizeof(value_type_t)); for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i]; for (i = 0; i < 3; ++i) { bulge[IDX(i + 1, i, 4)] = beta[i]; bulge[IDX(i, i + 1, 4)] = 
beta[i]; } - applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, - 0, work, n); - memcpy(V, work, 3 * n * sizeof(ValueType_)); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, + 3, 0, work, n); + memcpy(V, work, 3 * n * sizeof(value_type_t)); // Chase bulge to bottom-right of matrix with Householder transforms for (pos = 0; pos < n - 4; ++pos) { @@ -374,12 +376,12 @@ static int francisQRIteration(IndexType_ n, ValueType_ shift1, bulge[IDX(3, 3, 4)] = alpha[pos + 4]; // Apply Householder transform - findHouseholder3(householder, beta + pos, - householderMatrix); - applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), - n, householderMatrix, 3, 0, work, n); - memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(ValueType_)); + findHouseholder3(householder, beta + pos, + householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), + n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t)); } // Apply penultimate Householder transform @@ -397,12 +399,12 @@ static int francisQRIteration(IndexType_ n, ValueType_ shift1, bulge[IDX(1, 3, 4)] = 0; bulge[IDX(2, 3, 4)] = 0; bulge[IDX(3, 3, 4)] = 0; - findHouseholder3(householder, beta + n - 4, - householderMatrix); - applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, - householderMatrix, 3, 0, work, n); - memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(ValueType_)); + findHouseholder3(householder, beta + n - 4, + householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, + householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t)); // Apply final Householder transform // Values in the last two rows and columns are zero @@ -412,12 +414,12 @@ static int francisQRIteration(IndexType_ n, ValueType_ shift1, householder[2] = 0; for (j = 0; j < 3; ++j) for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - findHouseholder3(householder, beta + n - 3, - householderMatrix); - applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, - householderMatrix, 3, 0, work, n); - memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(ValueType_)); + findHouseholder3(householder, beta + n - 3, + householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, + householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t)); // Bulge has been eliminated alpha[n - 2] = bulge[IDX(0, 0, 4)]; @@ -456,49 +458,49 @@ static int francisQRIteration(IndexType_ n, ValueType_ shift1, * @param work_dev (Output, device memory, (n+iter)*iter entries) * Workspace. 
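
The bulge-seeding vector computed above (householder[0], householder[1], householder[2]) is the first column of p(T) for the double shift, where T is the symmetric tridiagonal matrix with diagonal alpha and off-diagonal beta. Written out (a standard derivation, included for reference):

    p(x) = (x - \sigma_1)(x - \sigma_2) = x^2 + b\,x + c, \qquad
    b = -(\sigma_1 + \sigma_2), \quad c = \sigma_1 \sigma_2,

    p(T)\, e_1 = \bigl(\,\alpha_0^2 + \beta_0^2 + b\,\alpha_0 + c,\;
    \beta_0(\alpha_0 + \alpha_1 + b),\; \beta_0\,\beta_1,\; 0, \dots\bigr)^{\mathsf T}.
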
*/ -template +template static int lanczosRestart( - handle_t const &handle, IndexType_ n, IndexType_ iter, IndexType_ iter_new, - ValueType_ *shiftUpper, ValueType_ *shiftLower, - ValueType_ *__restrict__ alpha_host, ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ V_host, ValueType_ *__restrict__ work_host, - ValueType_ *__restrict__ lanczosVecs_dev, ValueType_ *__restrict__ work_dev, - bool smallest_eig) { + handle_t const &handle, index_type_t n, index_type_t iter, + index_type_t iter_new, value_type_t *shiftUpper, value_type_t *shiftLower, + value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ V_host, value_type_t *__restrict__ work_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev, bool smallest_eig) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful constants - const ValueType_ zero = 0; - const ValueType_ one = 1; + const value_type_t zero = 0; + const value_type_t one = 1; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); // Loop index - IndexType_ i; + index_type_t i; // Number of implicit restart steps // Assumed to be even since each call to Francis algorithm is // equivalent to two calls of QR algorithm - IndexType_ restartSteps = iter - iter_new; + index_type_t restartSteps = iter - iter_new; // Ritz values from Lanczos method - ValueType_ *ritzVals_host = work_host + 3 * iter; + value_type_t *ritzVals_host = work_host + 3 * iter; // Shifts for implicit restart - ValueType_ *shifts_host; + value_type_t *shifts_host; // Orthonormal matrix for similarity transform - ValueType_ *V_dev = work_dev + n * iter; + value_type_t *V_dev = work_dev + n * iter; // ------------------------------------------------------- // Implementation // ------------------------------------------------------- // Compute Ritz values - memcpy(ritzVals_host, alpha_host, iter * sizeof(ValueType_)); - memcpy(work_host, beta_host, (iter - 1) * sizeof(ValueType_)); - Lapack::sterf(iter, ritzVals_host, work_host); + memcpy(ritzVals_host, alpha_host, iter * sizeof(value_type_t)); + memcpy(work_host, beta_host, (iter - 1) * sizeof(value_type_t)); + Lapack::sterf(iter, ritzVals_host, work_host); // Debug: Print largest eigenvalues // for (int i = iter-iter_new; i < iter; ++i) @@ -506,7 +508,7 @@ static int lanczosRestart( // std::cout <(M_PI) / restartSteps); + cos((i + 0.5) * static_cast(M_PI) / restartSteps); shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); } @@ -544,7 +546,7 @@ static int lanczosRestart( WARNING("error in implicitly shifted QR algorithm"); // Obtain new residual - CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(ValueType_), + CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); beta_host[iter - 1] = @@ -560,13 +562,13 @@ static int lanczosRestart( work_dev, n, stream)); CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, - n * iter_new * sizeof(ValueType_), + n * iter_new * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); // Normalize residual to obtain new Lanczos vector CUDA_TRY(cudaMemcpyAsync( lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), - n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); 
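
The shift loop above maps Chebyshev nodes onto the unwanted spectral interval [*shiftLower, *shiftUpper]. A host-side sketch of just that computation; chebyshev_shifts is an illustrative name, not a function in this file:

    #include <cmath>
    #include <vector>

    // restartSteps Chebyshev nodes of the unwanted interval
    // [shiftLower, shiftUpper], mirroring the loop in lanczosRestart.
    std::vector<double> chebyshev_shifts(int restartSteps, double shiftLower,
                                         double shiftUpper) {
      std::vector<double> shifts(restartSteps);
      for (int i = 0; i < restartSteps; ++i) {
        double node = std::cos((i + 0.5) * M_PI / restartSteps);  // on [-1, 1]
        // Affine map onto the unwanted region, as in the diff above.
        shifts[i] = 0.5 * ((shiftUpper - shiftLower) * node +
                           (shiftUpper + shiftLower));
      }
      return shifts;
    }
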
CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream)); @@ -630,45 +632,46 @@ static int lanczosRestart( * with dimensions n x nEigVecs. * @return error flag. */ -template +template int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, - ValueType_ tol, bool reorthogonalize, IndexType_ *effIter, - IndexType_ *totalIter, ValueType_ *shift, ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev, ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed) { + handle_t const &handle, sparse_matrix_t const *A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t *effIter, + index_type_t *totalIter, value_type_t *shift, + value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { using namespace spectral; // Useful constants - const ValueType_ one = 1; - const ValueType_ zero = 0; + const value_type_t one = 1; + const value_type_t zero = 0; // Matrix dimension - IndexType_ n = A->nrows_; + index_type_t n = A->nrows_; // Shift for implicit restart - ValueType_ shiftUpper; - ValueType_ shiftLower; + value_type_t shiftUpper; + value_type_t shiftLower; // Lanczos iteration counters - IndexType_ maxIter_curr = restartIter; // Maximum size of Lanczos system + index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system // Status flags int status; // Loop index - IndexType_ i; + index_type_t i; // Host memory - ValueType_ *Z_host; // Eigenvectors in Lanczos basis - ValueType_ *work_host; // Workspace + value_type_t *Z_host; // Eigenvectors in Lanczos basis + value_type_t *work_host; // Workspace // ------------------------------------------------------- // Check that LAPACK is enabled // ------------------------------------------------------- - // Lapack::check_lapack_enabled(); + // Lapack::check_lapack_enabled(); // ------------------------------------------------------- // Check that parameters are valid @@ -691,8 +694,8 @@ int computeSmallestEigenvectors( *totalIter = 0; // Allocate host memory - std::vector Z_host_v(restartIter * restartIter); - std::vector work_host_v(4 * restartIter); + std::vector Z_host_v(restartIter * restartIter); + std::vector work_host_v(4 * restartIter); Z_host = Z_host_v.data(); work_host = work_host_v.data(); @@ -717,7 +720,7 @@ int computeSmallestEigenvectors( // Initialize initial Lanczos vector curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); - ValueType_ normQ1; + value_type_t normQ1; CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); auto h_val = 1 / normQ1; @@ -725,22 +728,22 @@ int computeSmallestEigenvectors( // Estimate number of Lanczos iterations // See bounds in Kuczynski and Wozniakowski (1992). 
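
The curandGenerateNormalX / cublasnrm2 / cublasscal sequence above draws a Gaussian starting vector on the device and rescales it to unit 2-norm. The same normalization host-side, for clarity (a sketch; a Gaussian draw is nonzero with probability one, so no zero-norm guard is shown):

    #include <cmath>
    #include <vector>

    // Scale q to unit Euclidean norm, as the device code does via
    // cublasnrm2 followed by cublasscal with 1/norm.
    void normalize_unit_2norm(std::vector<double>& q) {
      double norm = 0.0;
      for (double x : q) norm += x * x;
      norm = std::sqrt(norm);
      for (double& x : q) x /= norm;
    }
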
- // const ValueType_ relError = 0.25; // Relative error - // const ValueType_ failProb = 1e-4; // Probability of failure + // const value_type_t relError = 0.25; // Relative error + // const value_type_t failProb = 1e-4; // Probability of failure // maxIter_curr = log(n/pow(failProb,2))/(4*std::sqrt(relError)) + 1; // maxIter_curr = min(maxIter_curr, restartIter); // Obtain tridiagonal matrix with Lanczos *effIter = 0; *shift = 0; - status = performLanczosIteration( + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); // Determine largest eigenvalue - Lapack::sterf(*effIter, alpha_host, beta_host); + Lapack::sterf(*effIter, alpha_host, beta_host); *shift = -alpha_host[*effIter - 1]; // std::cout << *shift <( + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); @@ -762,14 +765,14 @@ int computeSmallestEigenvectors( while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { // Determine number of restart steps // Number of steps must be even due to Francis algorithm - IndexType_ iter_new = nEigVecs + 1; + index_type_t iter_new = nEigVecs + 1; if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) iter_new = restartIter - (maxIter - *totalIter); if ((restartIter - iter_new) % 2) iter_new -= 1; if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart( + status = lanczosRestart( handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true); if (status) WARNING("error in Lanczos implicit restart"); @@ -780,7 +783,7 @@ int computeSmallestEigenvectors( // Proceed with Lanczos method // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); - status = performLanczosIteration( + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); @@ -794,12 +797,12 @@ int computeSmallestEigenvectors( // Solve tridiagonal system memcpy(work_host + 2 * (*effIter), alpha_host, - (*effIter) * sizeof(ValueType_)); + (*effIter) * sizeof(value_type_t)); memcpy(work_host + 3 * (*effIter), beta_host, - (*effIter - 1) * sizeof(ValueType_)); - Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), - work_host + 3 * (*effIter), Z_host, *effIter, - work_host); + (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), + work_host + 3 * (*effIter), Z_host, *effIter, + work_host); // Obtain desired eigenvalues by applying shift for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; @@ -807,13 +810,13 @@ int computeSmallestEigenvectors( // Copy results to device memory CUDA_TRY(cudaMemcpy(eigVals_dev, work_host + 2 * (*effIter), - nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); + nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); // for (int i = 0; i < nEigVecs; ++i) //{ // std::cout <<*(work_host+(2*(*effIter)+i))<< std::endl; //} CUDA_TRY(cudaMemcpy(work_dev, Z_host, - (*effIter) * nEigVecs * sizeof(ValueType_), + (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); // Convert eigenvectors from Lanczos basis to standard basis @@ -862,17 +865,17 
@@ int computeSmallestEigenvectors( * with dimensions n x nEigVecs. * @return error flag. */ -template +template int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const &A, - IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, - ValueType_ tol, bool reorthogonalize, IndexType_ &iter, - ValueType_ *__restrict__ eigVals_dev, ValueType_ *__restrict__ eigVecs_dev, - unsigned long long seed = 1234567) { + handle_t const &handle, sparse_matrix_t const &A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t &iter, + value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { using namespace spectral; // Matrix dimension - IndexType_ n = A.nrows_; + index_type_t n = A.nrows_; // Check that parameters are valid RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, @@ -883,19 +886,19 @@ int computeSmallestEigenvectors( RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); // Allocate memory - std::vector alpha_host_v(restartIter); - std::vector beta_host_v(restartIter); + std::vector alpha_host_v(restartIter); + std::vector beta_host_v(restartIter); - ValueType_ *alpha_host = alpha_host_v.data(); - ValueType_ *beta_host = beta_host_v.data(); + value_type_t *alpha_host = alpha_host_v.data(); + value_type_t *beta_host = beta_host_v.data(); //TODO: replace and fix allocation via RAFT handle - vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); - vector_t work_dev(handle, (n + restartIter) * restartIter); + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); // Perform Lanczos method - IndexType_ effIter; - ValueType_ shift; + index_type_t effIter; + value_type_t shift; int status = computeSmallestEigenvectors( handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), @@ -950,41 +953,42 @@ int computeSmallestEigenvectors( * with dimensions n x nEigVecs. * @return error flag. 
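
For orientation, a hypothetical call to the convenience overload above. Only the parameter list mirrors the diff; the wrapper name, the namespace qualification, and the construction of the handle and CSR matrix are assumptions, since they are not shown here:

    // eigVals_dev / eigVecs_dev are caller-owned device buffers holding
    // nEigVecs values and an n x nEigVecs column-major matrix, respectively.
    template <typename index_type_t, typename value_type_t>
    int smallest_eigenpairs(handle_t const& handle,
                            sparse_matrix_t<index_type_t, value_type_t> const& A,
                            value_type_t* eigVals_dev,
                            value_type_t* eigVecs_dev) {
      index_type_t nEigVecs = 4;      // eigenpairs requested
      index_type_t maxIter = 100;     // total Lanczos steps allowed
      index_type_t restartIter = 20;  // must be >= nEigVecs (see RAFT_EXPECTS)
      value_type_t tol = 1e-6;        // convergence tolerance on the residual
      index_type_t iter = 0;          // out-param: iterations performed
      return computeSmallestEigenvectors(handle, A, nEigVecs, maxIter,
                                         restartIter, tol,
                                         /*reorthogonalize=*/false, iter,
                                         eigVals_dev, eigVecs_dev);
    }
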
*/ -template +template int computeLargestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, - ValueType_ tol, bool reorthogonalize, IndexType_ *effIter, - IndexType_ *totalIter, ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev, ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed) { + handle_t const &handle, sparse_matrix_t const *A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t *effIter, + index_type_t *totalIter, value_type_t *__restrict__ alpha_host, + value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { using namespace spectral; // Useful constants - const ValueType_ one = 1; - const ValueType_ zero = 0; + const value_type_t one = 1; + const value_type_t zero = 0; // Matrix dimension - IndexType_ n = A->nrows_; + index_type_t n = A->nrows_; // Lanczos iteration counters - IndexType_ maxIter_curr = restartIter; // Maximum size of Lanczos system + index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system // Status flags int status; // Loop index - IndexType_ i; + index_type_t i; // Host memory - ValueType_ *Z_host; // Eigenvectors in Lanczos basis - ValueType_ *work_host; // Workspace + value_type_t *Z_host; // Eigenvectors in Lanczos basis + value_type_t *work_host; // Workspace // ------------------------------------------------------- // Check that LAPACK is enabled // ------------------------------------------------------- - // Lapack::check_lapack_enabled(); + // Lapack::check_lapack_enabled(); // ------------------------------------------------------- // Check that parameters are valid @@ -1007,8 +1011,8 @@ int computeLargestEigenvectors( *totalIter = 0; // Allocate host memory - std::vector Z_host_v(restartIter * restartIter); - std::vector work_host_v(4 * restartIter); + std::vector Z_host_v(restartIter * restartIter); + std::vector work_host_v(4 * restartIter); Z_host = Z_host_v.data(); work_host = work_host_v.data(); @@ -1028,7 +1032,7 @@ int computeLargestEigenvectors( curandSetPseudoRandomGeneratorSeed(randGen, seed); // Initialize initial Lanczos vector curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); - ValueType_ normQ1; + value_type_t normQ1; CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); auto h_val = 1 / normQ1; @@ -1036,36 +1040,36 @@ int computeLargestEigenvectors( // Estimate number of Lanczos iterations // See bounds in Kuczynski and Wozniakowski (1992). 
- // const ValueType_ relError = 0.25; // Relative error - // const ValueType_ failProb = 1e-4; // Probability of failure + // const value_type_t relError = 0.25; // Relative error + // const value_type_t failProb = 1e-4; // Probability of failure // maxIter_curr = log(n/pow(failProb,2))/(4*std::sqrt(relError)) + 1; // maxIter_curr = min(maxIter_curr, restartIter); // Obtain tridiagonal matrix with Lanczos *effIter = 0; - ValueType_ shift_val = 0.0; - ValueType_ *shift = &shift_val; + value_type_t shift_val = 0.0; + value_type_t *shift = &shift_val; // maxIter_curr = min(maxIter, restartIter); - status = performLanczosIteration( + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; // Apply Lanczos method until convergence - ValueType_ shiftLower = 1; - ValueType_ shiftUpper = -1; + value_type_t shiftLower = 1; + value_type_t shiftUpper = -1; while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { // Determine number of restart steps // Number of steps must be even due to Francis algorithm - IndexType_ iter_new = nEigVecs + 1; + index_type_t iter_new = nEigVecs + 1; if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) iter_new = restartIter - (maxIter - *totalIter); if ((restartIter - iter_new) % 2) iter_new -= 1; if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart( + status = lanczosRestart( handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false); if (status) WARNING("error in Lanczos implicit restart"); @@ -1076,7 +1080,7 @@ int computeLargestEigenvectors( // Proceed with Lanczos method // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); - status = performLanczosIteration( + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); @@ -1092,12 +1096,12 @@ int computeLargestEigenvectors( } // Solve tridiagonal system memcpy(work_host + 2 * (*effIter), alpha_host, - (*effIter) * sizeof(ValueType_)); + (*effIter) * sizeof(value_type_t)); memcpy(work_host + 3 * (*effIter), beta_host, - (*effIter - 1) * sizeof(ValueType_)); - Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), - work_host + 3 * (*effIter), Z_host, *effIter, - work_host); + (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), + work_host + 3 * (*effIter), Z_host, *effIter, + work_host); // note: We need to pick the top nEigVecs eigenvalues // but effItter can be larger than nEigVecs @@ -1105,7 +1109,7 @@ int computeLargestEigenvectors( // matrix of size effIter. 
remember the array is sorted, so it is not needed for smallest // eigenvalues case because the first ones are the smallest ones - IndexType_ top_eigenparis_idx_offset = *effIter - nEigVecs; + index_type_t top_eigenparis_idx_offset = *effIter - nEigVecs; // Debug : print nEigVecs largest eigenvalues // for (int i = top_eigenparis_idx_offset; i < *effIter; ++i) @@ -1130,12 +1134,12 @@ int computeLargestEigenvectors( // skip smallest eigenvalue if needed CUDA_TRY(cudaMemcpy(eigVals_dev, work_host + 2 * (*effIter) + top_eigenparis_idx_offset, - nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); + nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); // skip smallest eigenvector if needed CUDA_TRY(cudaMemcpy( work_dev, Z_host + (top_eigenparis_idx_offset * (*effIter)), - (*effIter) * nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); + (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); // Convert eigenvectors from Lanczos basis to standard basis CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, @@ -1183,17 +1187,15 @@ int computeLargestEigenvectors( * with dimensions n x nEigVecs. * @return error flag. */ -template -int computeLargestEigenvectors(handle_t const &handle, - sparse_matrix_t const &A, - IndexType_ nEigVecs, IndexType_ maxIter, - IndexType_ restartIter, ValueType_ tol, - bool reorthogonalize, IndexType_ &iter, - ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev, - unsigned long long seed = 123456) { +template +int computeLargestEigenvectors( + handle_t const &handle, sparse_matrix_t const &A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t &iter, + value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) { // Matrix dimension - IndexType_ n = A.nrows_; + index_type_t n = A.nrows_; // Check that parameters are valid RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, @@ -1204,18 +1206,18 @@ int computeLargestEigenvectors(handle_t const &handle, RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); // Allocate memory - std::vector alpha_host_v(restartIter); - std::vector beta_host_v(restartIter); + std::vector alpha_host_v(restartIter); + std::vector beta_host_v(restartIter); - ValueType_ *alpha_host = alpha_host_v.data(); - ValueType_ *beta_host = beta_host_v.data(); + value_type_t *alpha_host = alpha_host_v.data(); + value_type_t *beta_host = beta_host_v.data(); //TODO: replace and fix allocation via RAFT handle - vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); - vector_t work_dev(handle, (n + restartIter) * restartIter); + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); // Perform Lanczos method - IndexType_ effIter; + index_type_t effIter; int status = computeLargestEigenvectors( handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), From ff5ad230d561033788ddeb79db93bfb339550699 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 6 Jul 2020 18:23:08 -0500 Subject: [PATCH 153/189] Addressed code reviews on replacing cudaDeviceSynchronize(). 
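
The one-line diff below narrows the wait: cudaDeviceSynchronize() stalls the host until every stream on the device drains, while cudaStreamSynchronize(stream) waits only for work already queued on the handle's stream, so unrelated streams keep running. As a sketch:

    #include <cuda_runtime.h>

    // Stream-scoped wait, as adopted below; contrast with the device-wide
    // barrier cudaDeviceSynchronize() that it replaces.
    inline void wait_for_stream(cudaStream_t stream) {
      cudaStreamSynchronize(stream);
    }
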
--- cpp/include/raft/spectral/lanczos.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 796369abc0..6a53879723 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -209,7 +209,7 @@ int performLanczosIteration( lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } - CUDA_TRY(cudaDeviceSynchronize()); + CUDA_TRY(cudaStreamSynchronize(stream)); return 0; } From 8b30cda3c512caae95ee85b70f5ea4a7484e9387 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 11:39:57 -0500 Subject: [PATCH 154/189] Addressed comments on @brief in lanczos. --- cpp/include/raft/spectral/kmeans.hpp | 22 ++-- cpp/include/raft/spectral/lanczos.hpp | 155 +++++++++++++++----------- 2 files changed, 100 insertions(+), 77 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index c5c5e88b88..9017d2b8d4 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -58,7 +58,7 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); * blockSize/warpSize). Ideally, the grid is large enough so there * are d threads in the x-direction, k threads in the y-direction, * and n threads in the z-direction. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @param n Number of observation vectors. * @param d Dimension of observation vectors. @@ -138,7 +138,7 @@ static __global__ void computeDistances( * @brief Find closest centroid to observation vectors. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @param n Number of observation vectors. * @param k Number of clusters. @@ -198,7 +198,7 @@ static __global__ void minDistances(index_type_t n, index_type_t k, * @brief Check if newly computed distances are smaller than old distances. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @param n Number of observation vectors. * @param dists_old (Input/output, n entries) Distances between @@ -248,7 +248,7 @@ static __global__ void minDistances2(index_type_t n, * @brief Compute size of k-means clusters. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @param n Number of observation vectors. * @param k Number of clusters. * @param codes (Input, n entries) Cluster assignments. @@ -275,7 +275,7 @@ static __global__ void computeClusterSizes( * dimensions should be 2-dimensional. Ideally the grid is large * enough so there are d threads in the x-direction and k threads * in the y-direction. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. 
* @tparam value_type_t the type of data used for weights, distances. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -323,7 +323,7 @@ static __global__ void divideCentroids( /** * @brief Randomly choose new centroids. * Centroid is randomly chosen with k-means++ algorithm. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. @@ -415,7 +415,7 @@ static int chooseNewCentroid(handle_t const& handle, /** * @brief Choose initial cluster centroids for k-means algorithm. * Centroids are randomly chosen with k-means++ algorithm - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. @@ -522,7 +522,7 @@ static int initializeCentroids( /** * @brief Find cluster centroids closest to observation vectors. * Distance is measured with Euclidean norm. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. @@ -598,7 +598,7 @@ static int assignCentroids( /** * @brief Update cluster centroids for k-means algorithm. * All clusters are assumed to be non-empty. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. @@ -710,7 +710,7 @@ namespace raft { * Initial centroids are chosen with k-means++ algorithm. Empty * clusters are reinitialized by choosing new centroids with * k-means++ algorithm. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. @@ -889,7 +889,7 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * Initial centroids are chosen with k-means++ algorithm. Empty * clusters are reinitialized by choosing new centroids with * k-means++ algorithm. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 6a53879723..8a80706f48 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -55,9 +55,12 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, // Helper functions // ========================================================= -/// Perform Lanczos iteration -/** Lanczos iteration is performed on a shifted matrix A+shift*I. 
- * +/** + * @brief Perform Lanczos iteration + * Lanczos iteration is performed on a shifted matrix A+shift*I. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. * @param A Matrix. * @param iter Pointer to current Lanczos iteration. On exit, the * variable is set equal to the final Lanczos iteration. @@ -214,12 +217,14 @@ int performLanczosIteration( return 0; } -/// Find Householder transform for 3-dimensional system -/** Given an input vector v=[x,y,z]', this function finds a - * Householder transform P such that P*v is a multiple of - * e_1=[1,0,0]'. The input vector v is overwritten with the - * Householder vector such that P=I-2*v*v'. - * +/** + * @brief Find Householder transform for 3-dimensional system + * Given an input vector v=[x,y,z]', this function finds a + * Householder transform P such that P*v is a multiple of + * e_1=[1,0,0]'. The input vector v is overwritten with the + * Householder vector such that P=I-2*v*v'. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. * @param v (Input/output, host memory, 3 entries) Input * 3-dimensional vector. On exit, the vector is set to the * Householder vector. @@ -260,12 +265,14 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1; } -/// Apply 3-dimensional Householder transform to 4 x 4 matrix -/** The Householder transform is pre-applied to the top three rows +/** + * @brief Apply 3-dimensional Householder transform to 4 x 4 matrix + * The Householder transform is pre-applied to the top three rows * of the matrix and post-applied to the left three columns. The * 4 x 4 matrix is intended to contain the bulge that is produced * in the Francis QR algorithm. - * + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. * @param v (Input, host memory, 3 entries) Householder vector. * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. */ @@ -291,10 +298,12 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) { } } -/// Perform one step of Francis QR algorithm -/** Equivalent to two steps of the classical QR algorithm on a - * tridiagonal matrix. - * +/** + * @brief Perform one step of Francis QR algorithm + * Equivalent to two steps of the classical QR algorithm on a + * tridiagonal matrix. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. * @param n Matrix dimension. * @param shift1 QR algorithm shift. * @param shift2 QR algorithm shift. @@ -429,9 +438,12 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, return 0; } -/// Perform implicit restart of Lanczos algorithm -/** Shifts are Chebyshev nodes of unwanted region of matrix spectrum. - * +/** + * @brief Perform implicit restart of Lanczos algorithm + * Shifts are Chebyshev nodes of unwanted region of matrix spectrum. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. * @param n Matrix dimension. * @param iter Current Lanczos iteration. * @param iter_new Lanczos iteration after restart. 
@@ -457,6 +469,9 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, * column-major matrix with dimensions n x (iter+1). * @param work_dev (Output, device memory, (n+iter)*iter entries) * Workspace. + * @param smallest_eig specifies whether smallest (true) or largest + * (false) eigenvalues are to be calculated. + * @return error flag. */ template static int lanczosRestart( @@ -586,17 +601,19 @@ static int lanczosRestart( // Eigensolver // ========================================================= -/// Compute smallest eigenvectors of symmetric matrix -/** Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are smallest in - * magnitude. - * - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied to A+s*I, where s is negative the largest - * eigenvalue. - * +/** + * @brief Compute smallest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. * @param A Matrix. * @param nEigVecs Number of eigenvectors to compute. * @param maxIter Maximum number of Lanczos steps. Does not include @@ -630,6 +647,7 @@ static int lanczosRestart( * Eigenvectors corresponding to smallest eigenvalues of * matrix. Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. + * @param seed random seed. * @return error flag. */ template @@ -829,19 +847,19 @@ int computeSmallestEigenvectors( return 0; } -/// Compute smallest eigenvectors of symmetric matrix -/** Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are smallest in - * magnitude. - * - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied to A+s*I, where s is negative the largest - * eigenvalue. - * - * CNMEM must be initialized before calling this function. - * +/** + * @brief Compute smallest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. * @param A Matrix. * @param nEigVecs Number of eigenvectors to compute. * @param maxIter Maximum number of Lanczos steps. Does not include @@ -863,6 +881,7 @@ int computeSmallestEigenvectors( * Eigenvectors corresponding to smallest eigenvalues of * matrix. 
Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. + * @param seed random seed. * @return error flag. */ template @@ -912,16 +931,18 @@ int computeSmallestEigenvectors( // Eigensolver // ========================================================= -/// Compute largest eigenvectors of symmetric matrix -/** Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are largest in - * magnitude. - * - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied. - * +/** + * @brief Compute largest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are largest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. * @param A Matrix. * @param nEigVecs Number of eigenvectors to compute. * @param maxIter Maximum number of Lanczos steps. @@ -951,6 +972,7 @@ int computeSmallestEigenvectors( * Eigenvectors corresponding to largest eigenvalues of * matrix. Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. + * @param seed random seed. * @return error flag. */ template @@ -1151,19 +1173,19 @@ int computeLargestEigenvectors( return 0; } -/// Compute largest eigenvectors of symmetric matrix -/** Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are largest in - * magnitude. - * - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied to A+s*I, where s is negative the largest - * eigenvalue. - * - * CNMEM must be initialized before calling this function. - * +/** + * @brief Compute largest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are largest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. * @param A Matrix. * @param nEigVecs Number of eigenvectors to compute. * @param maxIter Maximum number of Lanczos steps. Does not include @@ -1185,6 +1207,7 @@ int computeLargestEigenvectors( * Eigenvectors corresponding to largest eigenvalues of * matrix. Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. + * @param seed random seed. * @return error flag. */ template From 8bf8589504394afd73f92d1344c970454641e96c Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 12:06:52 -0500 Subject: [PATCH 155/189] Addressed comments on host memory pointers and constexpr one/zero. 
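
The diff below makes the one/negOne/zero scalars constexpr. They remain addressable objects, which matters because the surrounding cuBLAS calls run in CUBLAS_POINTER_MODE_HOST and receive alpha/beta by host pointer. A sketch of the resulting pattern; gemm_scalars is illustrative, not a function in the file:

    template <typename value_type_t>
    void gemm_scalars() {
      constexpr value_type_t one = 1;  // compile-time, yet still addressable
      constexpr value_type_t zero = 0;
      // With CUBLAS_POINTER_MODE_HOST the scalars are passed by pointer, e.g.
      //   cublasgemm(cublas_h, ..., &one, A, lda, B, ldb, &zero, C, ldc, ...);
      (void)&one;
      (void)&zero;
    }
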
--- cpp/include/raft/spectral/lanczos.hpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 8a80706f48..94340fd5e6 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -97,9 +97,9 @@ int performLanczosIteration( // ------------------------------------------------------- // Useful variables - const value_type_t one = 1; - const value_type_t negOne = -1; - const value_type_t zero = 0; + constexpr value_type_t one = 1; + constexpr value_type_t negOne = -1; + constexpr value_type_t zero = 0; value_type_t alpha; auto cublas_h = handle.get_cublas_handle(); @@ -447,11 +447,11 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, * @param n Matrix dimension. * @param iter Current Lanczos iteration. * @param iter_new Lanczos iteration after restart. - * @param shiftUpper Pointer to upper bound for unwanted + * @param shiftUpper Pointer (host memory) to upper bound for unwanted * region. Value is ignored if less than *shiftLower. If a * stronger upper bound has been found, the value is updated on * exit. - * @param shiftLower Pointer to lower bound for unwanted + * @param shiftLower Pointer (host memory) to lower bound for unwanted * region. Value is ignored if greater than *shiftUpper. If a * stronger lower bound has been found, the value is updated on * exit. @@ -486,8 +486,8 @@ static int lanczosRestart( // ------------------------------------------------------- // Useful constants - const value_type_t zero = 0; - const value_type_t one = 1; + constexpr value_type_t zero = 0; + constexpr value_type_t one = 1; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -663,8 +663,8 @@ int computeSmallestEigenvectors( using namespace spectral; // Useful constants - const value_type_t one = 1; - const value_type_t zero = 0; + constexpr value_type_t one = 1; + constexpr value_type_t zero = 0; // Matrix dimension index_type_t n = A->nrows_; @@ -988,8 +988,8 @@ int computeLargestEigenvectors( using namespace spectral; // Useful constants - const value_type_t one = 1; - const value_type_t zero = 0; + constexpr value_type_t one = 1; + constexpr value_type_t zero = 0; // Matrix dimension index_type_t n = A->nrows_; From f30e636f67e9eb222ec0539605e8896424bea649 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 12:15:22 -0500 Subject: [PATCH 156/189] Addressed comments on removing stale commented code. --- cpp/include/raft/spectral/lanczos.hpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 94340fd5e6..2513e55855 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -744,13 +744,6 @@ int computeSmallestEigenvectors( auto h_val = 1 / normQ1; CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); - // Estimate number of Lanczos iterations - // See bounds in Kuczynski and Wozniakowski (1992). 
- // const value_type_t relError = 0.25; // Relative error - // const value_type_t failProb = 1e-4; // Probability of failure - // maxIter_curr = log(n/pow(failProb,2))/(4*std::sqrt(relError)) + 1; - // maxIter_curr = min(maxIter_curr, restartIter); - // Obtain tridiagonal matrix with Lanczos *effIter = 0; *shift = 0; @@ -1060,13 +1053,6 @@ int computeLargestEigenvectors( auto h_val = 1 / normQ1; CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); - // Estimate number of Lanczos iterations - // See bounds in Kuczynski and Wozniakowski (1992). - // const value_type_t relError = 0.25; // Relative error - // const value_type_t failProb = 1e-4; // Probability of failure - // maxIter_curr = log(n/pow(failProb,2))/(4*std::sqrt(relError)) + 1; - // maxIter_curr = min(maxIter_curr, restartIter); - // Obtain tridiagonal matrix with Lanczos *effIter = 0; value_type_t shift_val = 0.0; From 44ff8bf09a39b40809c88cf14b1be5a52f02d660 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 12:35:02 -0500 Subject: [PATCH 157/189] Addressed comments on removing stale commented code. --- cpp/include/raft/spectral/lanczos.hpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 2513e55855..e995b60778 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -756,14 +756,14 @@ int computeSmallestEigenvectors( Lapack::sterf(*effIter, alpha_host, beta_host); *shift = -alpha_host[*effIter - 1]; - // std::cout << *shift <( handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); @@ -793,7 +793,7 @@ int computeSmallestEigenvectors( if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; // Proceed with Lanczos method - // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); @@ -822,10 +822,7 @@ int computeSmallestEigenvectors( // Copy results to device memory CUDA_TRY(cudaMemcpy(eigVals_dev, work_host + 2 * (*effIter), nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); - // for (int i = 0; i < nEigVecs; ++i) - //{ - // std::cout <<*(work_host+(2*(*effIter)+i))<< std::endl; - //} + CUDA_TRY(cudaMemcpy(work_dev, Z_host, (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); @@ -1057,7 +1054,7 @@ int computeLargestEigenvectors( *effIter = 0; value_type_t shift_val = 0.0; value_type_t *shift = &shift_val; - // maxIter_curr = min(maxIter, restartIter); + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); @@ -1087,7 +1084,7 @@ int computeLargestEigenvectors( if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; // Proceed with Lanczos method - // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); From 04e8790a14f11cf532196343c1ba2c760f1e7023 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 12:39:02 -0500 Subject: [PATCH 158/189] Addressed comments on removing stale (fixed) FIXME comment. 
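
The FIXME removed below asked for the seed to be a caller-supplied parameter rather than hard-coded; it already is, so only the stale comment goes. The seeding sequence that remains, as a standalone sketch (init_generator is an illustrative name):

    #include <curand.h>

    // Callers pass a time-based seed for production runs and a fixed seed
    // (e.g. 1234567) for reproducible tests.
    void init_generator(curandGenerator_t* gen, unsigned long long seed) {
      curandCreateGenerator(gen, CURAND_RNG_PSEUDO_PHILOX4_32_10);
      curandSetPseudoRandomGeneratorSeed(*gen, seed);
    }
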
--- cpp/include/raft/spectral/lanczos.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index e995b60778..808f8a1e35 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -731,10 +731,7 @@ int computeSmallestEigenvectors( // Initialize random number generator curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); - // FIXME: This is hard coded, which is good for unit testing... - // but should really be a parameter so it could be - // "random" for real runs and "fixed" for tests - curandSetPseudoRandomGeneratorSeed(randGen, seed /*time(NULL)*/); + curandSetPseudoRandomGeneratorSeed(randGen, seed); // Initialize initial Lanczos vector curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); From 5de2234428ff499e6a5c4c57179ef82e8d787502 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 13:03:38 -0500 Subject: [PATCH 159/189] Addressed comment on async copies. --- cpp/include/raft/spectral/lanczos.hpp | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 808f8a1e35..f6c876a56e 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -817,12 +817,14 @@ int computeSmallestEigenvectors( for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory - CUDA_TRY(cudaMemcpy(eigVals_dev, work_host + 2 * (*effIter), - nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpyAsync(eigVals_dev, work_host + 2 * (*effIter), + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, stream)); - CUDA_TRY(cudaMemcpy(work_dev, Z_host, - (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host, + (*effIter) * nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); // Convert eigenvectors from Lanczos basis to standard basis CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, @@ -1134,14 +1136,17 @@ int computeLargestEigenvectors( // Copy results to device memory // skip smallest eigenvalue if needed - CUDA_TRY(cudaMemcpy(eigVals_dev, - work_host + 2 * (*effIter) + top_eigenparis_idx_offset, - nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpyAsync( + eigVals_dev, work_host + 2 * (*effIter) + top_eigenparis_idx_offset, + nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); // skip smallest eigenvector if needed - CUDA_TRY(cudaMemcpy( - work_dev, Z_host + (top_eigenparis_idx_offset * (*effIter)), - (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpyAsync(work_dev, + Z_host + (top_eigenparis_idx_offset * (*effIter)), + (*effIter) * nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, stream)); + + CHECK_CUDA(cudaStreamSynchronize(stream)); // Convert eigenvectors from Lanczos basis to standard basis CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, From b21d5ebf5b23bfe9777a938aa6187f13c43835f3 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 13:29:08 -0500 Subject: [PATCH 160/189] Addressed comments on removing outdated TODOs. 
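
The async-copy commit above queues the host-to-device result copies on the handle's stream and synchronizes once before the dependent GEMM, rather than paying an implicit device-wide sync per blocking cudaMemcpy. A self-contained sketch of the pattern; note that the copies only truly overlap with other work when the host buffers are pinned (cudaMallocHost), otherwise they fall back to synchronous transfers:

    #include <cuda_runtime.h>

    // Batch two H2D copies on one stream, then wait once, mirroring the
    // eigVals_dev / work_dev copies in the commit above.
    void copy_results_async(double* eigVals_dev, const double* vals_host,
                            double* work_dev, const double* Z_host,
                            size_t n_vals, size_t n_work, cudaStream_t stream) {
      cudaMemcpyAsync(eigVals_dev, vals_host, n_vals * sizeof(double),
                      cudaMemcpyHostToDevice, stream);
      cudaMemcpyAsync(work_dev, Z_host, n_work * sizeof(double),
                      cudaMemcpyHostToDevice, stream);
      cudaStreamSynchronize(stream);  // single wait for both queued copies
    }
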
--- cpp/include/raft/spectral/lanczos.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index f6c876a56e..483d900b45 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -900,7 +900,6 @@ int computeSmallestEigenvectors( value_type_t *alpha_host = alpha_host_v.data(); value_type_t *beta_host = beta_host_v.data(); - //TODO: replace and fix allocation via RAFT handle vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); vector_t work_dev(handle, (n + restartIter) * restartIter); @@ -1029,8 +1028,8 @@ int computeLargestEigenvectors( work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, - stream)); // ????? TODO: check / remove + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue @@ -1220,7 +1219,6 @@ int computeLargestEigenvectors( value_type_t *alpha_host = alpha_host_v.data(); value_type_t *beta_host = beta_host_v.data(); - //TODO: replace and fix allocation via RAFT handle vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); vector_t work_dev(handle, (n + restartIter) * restartIter); From 9df3f9aa1fccbca4d82c1262322fe8185a9ed188 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 13:57:45 -0500 Subject: [PATCH 161/189] Addressed comments on removing dead code. --- cpp/include/raft/spectral/lapack.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index 4417640705..d14bf05f37 100644 --- a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -103,14 +103,11 @@ class Lapack { // computes the QR factorization of a general matrix static void geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork); // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. - //static void orgqr( int m, int n, int k, T* a, int lda, const T* tau, T* work, int* lwork ); + // multiply C by implicit Q static void ormqr(bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); - //static void unmqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); - //static void qrf (int n, T *H, T *Q, T *R); - //static void hseqr (T* Q, T* R, T* eigenvalues,T* eigenvectors, int dim, int ldh, int ldq); static void geev(T *A, T *eigenvalues, int dim, int lda); static void geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, int ldvr); From fb46b7711245fbc66389860821b5475ed0c2002e Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 14:13:46 -0500 Subject: [PATCH 162/189] Addressed comments on using dim3{} cnstr. 
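
The diff below replaces per-field gridDim/blockDim assignment with dim3 brace construction, keeping the same ceiling division and 65535 clamp used by the kmeans and lanczos launches earlier in the series. A compilable sketch of the one-dimensional variant; BLOCK_SIZE here is a placeholder for the constant the RAFT headers define, and diagmv_launch_dims is an illustrative name:

    #include <algorithm>
    #include <cuda_runtime.h>

    constexpr unsigned int BLOCK_SIZE = 256;  // placeholder value

    // 1-D geometry for a diagmv-style launch over n elements: grid size is
    // the ceiling division of n by BLOCK_SIZE, clamped to the 65535 limit.
    inline void diagmv_launch_dims(unsigned int n, dim3& block, dim3& grid) {
      block = dim3{BLOCK_SIZE, 1, 1};
      grid = dim3{std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535u), 1, 1};
    }
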
--- cpp/include/raft/spectral/matrix_wrappers.hpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 1c78fd16fd..5f72da45b7 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -272,13 +272,10 @@ struct laplacian_matrix_t : sparse_matrix_t { // Apply diagonal matrix // - dim3 gridDim, blockDim; - gridDim.x = std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - gridDim.y = 1; - gridDim.z = 1; - blockDim.x = BLOCK_SIZE; - blockDim.y = 1; - blockDim.z = 1; + dim3 gridDim{ + std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; + + dim3 blockDim{BLOCK_SIZE, 1, 1}; utils::diagmv<<>>(n, alpha, diagonal_.raw(), x, y); CHECK_CUDA(stream); From d359d1acb4c94577d1f1529b1c317d0057098dd4 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 15:12:15 -0500 Subject: [PATCH 163/189] Addressed comments on using cleaning-up modularity header. --- cpp/include/raft/spectral/modularity_maximization.hpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 5ac33eda43..a920eb39c9 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -31,9 +31,6 @@ #include #include -//#define COLLECT_TIME_STATISTICS 1 -//#undef COLLECT_TIME_STATISTICS - #ifdef COLLECT_TIME_STATISTICS #include #include @@ -97,14 +94,13 @@ std::tuple modularity_maximization( auto stream = handle.get_stream(); std::tuple - stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver + stats; // # iters eigen solver, cluster solver residual, # iters cluster solver vertex_t n = csr_m.nrows_; // Compute eigenvectors of Modularity Matrix // Initialize Modularity Matrix - //sparse_matrix_t A{handle, graph}; modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; auto eigen_config = eigen_solver.get_config(); @@ -167,7 +163,6 @@ void analyzeModularity(handle_t const &handle, cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity - ///sparse_matrix_t A{handle, graph}; modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; // Initialize output @@ -183,10 +178,8 @@ void analyzeModularity(handle_t const &handle, // Record results modularity += partModularity; - // std::cout<< "partModularity " < Date: Tue, 7 Jul 2020 15:46:46 -0500 Subject: [PATCH 164/189] Addressed comments on cleaning-up spectral_util header. 
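The stats tuple in the modularity patch above now carries its field meanings in a single trailing comment: eigen-solver iterations, cluster-solver residual, cluster-solver iterations. At a call site those fields unpack cleanly with C++17 structured bindings; a purely hypothetical sketch, since the full signature is not reproduced in that hunk and the argument list is abbreviated here:

// 'handle', 'policy', 'csr_m', the two solvers and the output buffers
// are assumed to be set up elsewhere; only the unpacking is the point.
auto [eig_iters, cluster_residual, cluster_iters] =
    modularity_maximization(handle, policy, csr_m, eigen_solver,
                            cluster_solver, clusters, eig_vecs);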
--- cpp/include/raft/spectral/spectral_util.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 8f8eb3ad8b..acf59f9d63 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -40,8 +40,7 @@ static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x alpha = 0.0; - // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, - // li, mn); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < mm; i += blockDim.x) { From e5ef1a71dd01efffe824188d58f1fb62014fcc48 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 16:04:52 -0500 Subject: [PATCH 165/189] Addressed comments on cleaning-up cluster_solvers.cu. --- cpp/test/cluster_solvers.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index 04a94fbf22..4ff6cdf5fa 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -29,8 +29,6 @@ TEST(Raft, ClusterSolvers) { using value_type = double; handle_t h; - ASSERT_EQ(0, h.get_num_internal_streams()); - ASSERT_EQ(0, h.get_device()); index_type maxiter{100}; value_type tol{1.0e-10}; From beaa4991a1bf897f7fedfdb1dcdd7bf0f6e98760 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 16:16:19 -0500 Subject: [PATCH 166/189] Addressed comments on more clean-up in lanczos. --- cpp/include/raft/spectral/lanczos.hpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 483d900b45..37719579cb 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -686,11 +686,6 @@ int computeSmallestEigenvectors( value_type_t *Z_host; // Eigenvectors in Lanczos basis value_type_t *work_host; // Workspace - // ------------------------------------------------------- - // Check that LAPACK is enabled - // ------------------------------------------------------- - // Lapack::check_lapack_enabled(); - // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- From 52764ef97e3492d23c68a79532913645e27e0b38 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 16:35:32 -0500 Subject: [PATCH 167/189] Addressed comments on dim3{} and type lowercase_t in spectral_util. 
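The scale_obs launch configuration reworked below leans on next_pow2, which uses the classic bit-smearing trick from the cited Stanford bit-hacks page: subtract one, OR the value with progressively larger right-shifts of itself until every bit below the leading one is set, then add one back. Tracing it for n = 17, assuming a 32-bit index type:

// v = n - 1              -> 16 = 0b10000
// v |= v >> 1            -> 24 = 0b11000
// v |= v >> 2            -> 30 = 0b11110
// v |= v >> 4            -> 31 = 0b11111
// v |= v >> 8, v >> 16   -> 31 (no further change)
// return v + 1           -> 32

So next_pow2(17) == 32, and a value that is already a power of two maps to itself, e.g. next_pow2(32) == 32.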
--- cpp/include/raft/spectral/spectral_util.hpp | 48 ++++++++++----------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index acf59f9d63..1b90ab959f 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -28,11 +28,11 @@ namespace raft { namespace spectral { -template -static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, - ValueType_* obs) { - IndexType_ i, j, k, index, mm; - ValueType_ alpha, v, last; +template +static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, + value_type_t* obs) { + index_type_t i, j, k, index, mm; + value_type_t alpha, v, last; bool valid; // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension @@ -76,9 +76,9 @@ static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, } } -template -IndexType_ next_pow2(IndexType_ n) { - IndexType_ v; +template +index_type_t next_pow2(index_type_t n) { + index_type_t v; // Reference: // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float v = n - 1; @@ -90,25 +90,21 @@ IndexType_ next_pow2(IndexType_ n) { return v + 1; } -template -cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_* obs) { - IndexType_ p2m; - dim3 nthreads, nblocks; +template +cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { + index_type_t p2m; // find next power of 2 - p2m = next_pow2(m); + p2m = next_pow2(m); // setup launch configuration - nthreads.x = max(2, min(p2m, 32)); - nthreads.y = 256 / nthreads.x; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = (n + nthreads.y - 1) / nthreads.y; - nblocks.z = 1; - // printf("m=%d(%d),n=%d,obs=%p, - // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); + unsigned int xsize = max(2, min(p2m, 32)); + dim3 nthreads{xsize, 256 / xsize, 1}; + + dim3 nblocks{1, (n + nthreads.y - 1) / nthreads.y, 1}; // launch scaling kernel (scale each column of obs by its norm) - scale_obs_kernel<<>>(m, n, obs); + scale_obs_kernel + <<>>(m, n, obs); return cudaSuccess; } @@ -176,16 +172,16 @@ namespace { /// Functor to generate indicator vectors /** For use in Thrust transform */ -template +template struct equal_to_i_op { - const IndexType_ i; + const index_type_t i; public: - equal_to_i_op(IndexType_ _i) : i(_i) {} + equal_to_i_op(index_type_t _i) : i(_i) {} template __host__ __device__ void operator()(Tuple_ t) { thrust::get<1>(t) = - (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0; + (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0; } }; } // namespace From e8fc0fb081e5acb849037c4161d7ddb1da774139 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 16:49:24 -0500 Subject: [PATCH 168/189] Fixed duplicate in CHANGELOG.md. 
--- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4695742c89..0e376f5f76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,6 @@ ## New Features - Initial RAFT version - PR #3: defining raft::handle_t, device_buffer, host_buffer, allocator classes -- PR #12: Spectral Clustering ## Bug Fixes - PR #5: Small build.sh fixes From 4b32a8be84a3799d3ba503548767fe4516c6e919 Mon Sep 17 00:00:00 2001 From: Kumar Aatish Date: Wed, 8 Jul 2020 12:31:21 -0400 Subject: [PATCH 169/189] Add ceildiv --- cpp/include/raft/cudart_utils.h | 11 +++++++++++ cpp/test/cudart_utils.cpp | 6 ++++++ 2 files changed, 17 insertions(+) diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 2ca23ba539..46e4792d43 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -28,6 +28,17 @@ namespace raft { +/** + * @brief Provide a ceiling division operation ie. ceil(a / b) + * @tparam IntType supposed to be only integers for now! + */ +template +constexpr __host__ __device__ +std::enable_if_t::value, T> +ceildiv(T a, T b) { + return (a + b - 1) / b; +} + /** * @brief Exception thrown when a CUDA error is encountered. */ diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/cudart_utils.cpp index c14d880efd..0f2e9f4e5c 100644 --- a/cpp/test/cudart_utils.cpp +++ b/cpp/test/cudart_utils.cpp @@ -27,4 +27,10 @@ TEST(Raft, Utils) { ASSERT_NO_THROW(CUDA_CHECK(cudaFree(nullptr))); } +TEST(Raft, ceildiv) { + ASSERT_EQ(raft::ceildiv(5, 3), 2); + ASSERT_EQ(raft::ceildiv(0, 3), 0); + ASSERT_EQ(raft::ceildiv(7, 8), 1); +} + } // namespace raft From 6bb78f8b5ebbb799c25f76e7db1052b99830fe23 Mon Sep 17 00:00:00 2001 From: Kumar Aatish Date: Wed, 8 Jul 2020 12:34:34 -0400 Subject: [PATCH 170/189] CHANGELOG fix --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 58477e868f..744161e6d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## New Features - PR #7: Migrating cuml comms -> raft comms_t - PR #15: add exception based error handling macros +- PR #29: Add ceildiv functionality ## Improvements - PR #13: Add RMM_INCLUDE and RMM_LIBRARY options to allow linking to non-conda RMM From 9433a4e93c5882ed84fddd86cfab379af2b1483a Mon Sep 17 00:00:00 2001 From: Kumar Aatish Date: Wed, 8 Jul 2020 13:13:00 -0400 Subject: [PATCH 171/189] Clang format fix --- cpp/include/raft/cudart_utils.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 46e4792d43..371ca7b62d 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -33,8 +33,7 @@ namespace raft { * @tparam IntType supposed to be only integers for now! 
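 * e.g. ceildiv(5, 3) == 2, ceildiv(7, 8) == 1 and ceildiv(0, 3) == 0.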
*/ template -constexpr __host__ __device__ -std::enable_if_t::value, T> +constexpr __host__ __device__ std::enable_if_t::value, T> ceildiv(T a, T b) { return (a + b - 1) / b; } From d378ac280e5950283b23b45135c0abacf345f9ea Mon Sep 17 00:00:00 2001 From: Kumar Aatish Date: Wed, 8 Jul 2020 13:54:18 -0400 Subject: [PATCH 172/189] Created integer utils --- cpp/CMakeLists.txt | 1 + cpp/include/raft/cudart_utils.h | 10 -- cpp/include/raft/integer_utils.h | 161 +++++++++++++++++++++++++++++++ cpp/test/cudart_utils.cpp | 6 -- cpp/test/integer_utils.cpp | 37 +++++++ 5 files changed, 199 insertions(+), 16 deletions(-) create mode 100644 cpp/include/raft/integer_utils.h create mode 100644 cpp/test/integer_utils.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7bff25da45..30c51f620c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -222,6 +222,7 @@ if(BUILD_RAFT_TESTS) add_executable(test_raft test/cudart_utils.cpp test/handle.cpp + test/integer_utils.cpp test/mr/device/buffer.cpp test/mr/host/buffer.cpp test/test.cpp) diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 371ca7b62d..2ca23ba539 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -28,16 +28,6 @@ namespace raft { -/** - * @brief Provide a ceiling division operation ie. ceil(a / b) - * @tparam IntType supposed to be only integers for now! - */ -template -constexpr __host__ __device__ std::enable_if_t::value, T> -ceildiv(T a, T b) { - return (a + b - 1) / b; -} - /** * @brief Exception thrown when a CUDA error is encountered. */ diff --git a/cpp/include/raft/integer_utils.h b/cpp/include/raft/integer_utils.h new file mode 100644 index 0000000000..a7cfb9287b --- /dev/null +++ b/cpp/include/raft/integer_utils.h @@ -0,0 +1,161 @@ +/* + * Copyright 2019 BlazingDB, Inc. + * Copyright 2019 Eyal Rozenberg + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +/** + * @file Utility code involving integer arithmetic + * + */ + +#include +#include + +namespace raft { +//! Utility functions +/** + * Finds the smallest integer not less than `number_to_round` and modulo `S` is + * zero. This function assumes that `number_to_round` is non-negative and + * `modulus` is positive. + */ +template +inline S round_up_safe(S number_to_round, S modulus) { + auto remainder = number_to_round % modulus; + if (remainder == 0) { + return number_to_round; + } + auto rounded_up = number_to_round - remainder + modulus; + if (rounded_up < number_to_round) { + throw std::invalid_argument( + "Attempt to round up beyond the type's maximum value"); + } + return rounded_up; +} + +/** + * Finds the largest integer not greater than `number_to_round` and modulo `S` is + * zero. This function assumes that `number_to_round` is non-negative and + * `modulus` is positive. 
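+ * For example, round_down_safe(9, 5) == 5, while round_up_safe(9, 5) == 10.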
+ */ +template +inline S round_down_safe(S number_to_round, S modulus) { + auto remainder = number_to_round % modulus; + auto rounded_down = number_to_round - remainder; + return rounded_down; +} + +/** + * Divides the left-hand-side by the right-hand-side, rounding up + * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3. + * + * @param dividend the number to divide + * @param divisor the number by which to divide + * @return The least integer multiple of {@link divisor} which is greater than or equal to + * the non-integral division dividend/divisor. + * + * @note sensitive to overflow, i.e. if dividend > std::numeric_limits::max() - divisor, + * the result will be incorrect + */ +template +constexpr inline S div_rounding_up_unsafe(const S& dividend, + const T& divisor) noexcept { + return (dividend + divisor - 1) / divisor; +} + +namespace detail { +template +constexpr inline I div_rounding_up_safe(std::integral_constant, + I dividend, I divisor) noexcept { + // TODO: This could probably be implemented faster + return (dividend > divisor) + ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) + : (dividend > 0); +} + +template +constexpr inline I div_rounding_up_safe(std::integral_constant, + I dividend, I divisor) noexcept { + auto quotient = dividend / divisor; + auto remainder = dividend % divisor; + return quotient + (remainder != 0); +} + +} // namespace detail + +/** + * Divides the left-hand-side by the right-hand-side, rounding up + * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3. + * + * @param dividend the number to divide + * @param divisor the number of by which to divide + * @return The least integer multiple of {@link divisor} which is greater than or equal to + * the non-integral division dividend/divisor. + * + * @note will not overflow, and may _or may not_ be slower than the intuitive + * approach of using (dividend + divisor - 1) / divisor + */ +template +constexpr inline std::enable_if_t::value, I> +div_rounding_up_safe(I dividend, I divisor) noexcept { + using i_is_a_signed_type = + std::integral_constant::value>; + return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor); +} + +template +constexpr inline std::enable_if_t::value, bool> +is_a_power_of_two(I val) noexcept { + return ((val - 1) & val) == 0; +} + +/** + * @brief Return the absolute value of a number. + * + * This calls `std::abs()` which performs equivalent: `(value < 0) ? -value : value`. + * + * This was created to prevent compile errors calling `std::abs()` with unsigned integers. + * An example compile error appears as follows: + * @code{.pseudo} + * error: more than one instance of overloaded function "std::abs" matches the argument list: + * function "abs(int)" + * function "std::abs(long)" + * function "std::abs(long long)" + * function "std::abs(double)" + * function "std::abs(float)" + * function "std::abs(long double)" + * argument types are: (uint64_t) + * @endcode + * + * Not all cases could be if-ed out using `std::is_signed::value` and satisfy the compiler. + * + * @param value Numeric value can be either integer or float type. + * @return Absolute value if value type is signed. + */ +template +std::enable_if_t::value, T> constexpr inline absolute_value( + T value) { + return std::abs(value); +} +// Unsigned type just returns itself. 
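+// (std::abs has no unsigned overload, so returning the value directly
+// avoids the ambiguous-overload error described above.)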
+template +std::enable_if_t::value, T> constexpr inline absolute_value( + T value) { + return value; +} + +} // namespace raft diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/cudart_utils.cpp index 0f2e9f4e5c..c14d880efd 100644 --- a/cpp/test/cudart_utils.cpp +++ b/cpp/test/cudart_utils.cpp @@ -27,10 +27,4 @@ TEST(Raft, Utils) { ASSERT_NO_THROW(CUDA_CHECK(cudaFree(nullptr))); } -TEST(Raft, ceildiv) { - ASSERT_EQ(raft::ceildiv(5, 3), 2); - ASSERT_EQ(raft::ceildiv(0, 3), 0); - ASSERT_EQ(raft::ceildiv(7, 8), 1); -} - } // namespace raft diff --git a/cpp/test/integer_utils.cpp b/cpp/test/integer_utils.cpp new file mode 100644 index 0000000000..830d085a40 --- /dev/null +++ b/cpp/test/integer_utils.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace raft { + +TEST(Raft, rounding_up) { + ASSERT_EQ(raft::div_rounding_up_safe(5, 3), 2); + ASSERT_EQ(raft::div_rounding_up_safe(0, 3), 0); + ASSERT_EQ(raft::div_rounding_up_safe(7, 8), 1); + ASSERT_EQ(raft::div_rounding_up_unsafe(5, 3), 2); + ASSERT_EQ(raft::div_rounding_up_unsafe(0, 3), 0); + ASSERT_EQ(raft::div_rounding_up_unsafe(7, 8), 1); +} + +TEST(Raft, is_a_power_of_two) { + ASSERT_EQ(raft::is_a_power_of_two(1 << 5), true); + ASSERT_EQ(raft::is_a_power_of_two((1 << 5) + 1), false); +} + +} // namespace raft From aeb2e5617b301d8ded8dbde14f921b69557f5d5d Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 9 Jul 2020 14:55:34 -0500 Subject: [PATCH 173/189] Moved lanczos.hpp to linalg. --- cpp/include/raft/{spectral => linalg}/lanczos.hpp | 0 cpp/include/raft/spectral/eigen_solvers.hpp | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename cpp/include/raft/{spectral => linalg}/lanczos.hpp (100%) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp similarity index 100% rename from cpp/include/raft/spectral/lanczos.hpp rename to cpp/include/raft/linalg/lanczos.hpp diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index 056189dcba..e36dca2e0c 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include +#include namespace raft { From 1502821cbc747436bcd07bd5ddec54160a320259 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 9 Jul 2020 15:20:22 -0500 Subject: [PATCH 174/189] Moved sm_utils.hpp to utils. 
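Before the file moves in the next patches, it is worth seeing why integer_utils.h keeps both division helpers added above. The unsafe form computes dividend + divisor - 1, which can wrap for dividends near the type's maximum; the safe form dispatches on signedness and never forms that sum. A self-contained illustration, with values chosen purely for demonstration:

#include <cstdint>
#include <cstdio>
#include <raft/integer_utils.h>

int main() {
  uint32_t dividend = 4294967295u;  // largest uint32_t
  uint32_t divisor = 16u;
  // Unsafe form: dividend + divisor - 1 wraps around to 14,
  // and 14 / 16 == 0, which is clearly wrong.
  std::printf("unsafe: %u\n",
              raft::div_rounding_up_unsafe(dividend, divisor));
  // Safe form: quotient 268435455 with remainder 15 rounds up
  // to the correct 268435456.
  std::printf("safe:   %u\n",
              raft::div_rounding_up_safe(dividend, divisor));
  return 0;
}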
--- cpp/include/raft/spectral/kmeans.hpp | 2 +- cpp/include/raft/spectral/matrix_wrappers.hpp | 2 +- cpp/include/raft/spectral/spectral_util.hpp | 2 +- cpp/include/raft/{spectral => utils}/sm_utils.hpp | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename cpp/include/raft/{spectral => utils}/sm_utils.hpp (100%) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 9017d2b8d4..13f4d2c82a 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include namespace { diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 5f72da45b7..08f213cd3a 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 1b90ab959f..b77375b33b 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/raft/spectral/sm_utils.hpp b/cpp/include/raft/utils/sm_utils.hpp similarity index 100% rename from cpp/include/raft/spectral/sm_utils.hpp rename to cpp/include/raft/utils/sm_utils.hpp From 2b5a6cd0e32e9fd36a8ff053ce14c0f3abd32184 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 9 Jul 2020 17:16:14 -0500 Subject: [PATCH 175/189] Fixed CHECK_CUDA() calls. --- cpp/include/raft/linalg/lanczos.hpp | 2 +- cpp/include/raft/spectral/kmeans.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 37719579cb..f5967b556c 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -819,7 +819,7 @@ int computeSmallestEigenvectors( CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host, (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 13f4d2c82a..10357671d6 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -389,7 +389,7 @@ static int chooseNewCentroid(handle_t const& handle, value_type_t minSum{0}; CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + CHECK_CUDA(stream);//cudaStreamSynchronize(stream)); if (distsSum > minSum) { value_type_t vIndex = static_cast(n - 1); From e2873503a5baf3e780bc18702217759506d1306b Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 9 Jul 2020 17:25:56 -0500 Subject: [PATCH 176/189] Fixed CHECK_CUDA() redux. 
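PATCH 175 above and this one settle on a single convention: CUDA_TRY wraps an expression that returns cudaError_t, while CHECK_CUDA takes the stream itself. The macro's definition is not part of this series, so exactly what it synchronizes and queries is not shown here; the call-site difference, taken from the two diffs, is:

// Intermediate state from PATCH 175: the macro received the *result*
// of an explicit synchronize, mixing the two conventions.
CHECK_CUDA(cudaStreamSynchronize(stream));

// Final form: hand the macro the stream and let it do the rest.
CHECK_CUDA(stream);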
--- cpp/include/raft/linalg/lanczos.hpp | 2 +- cpp/include/raft/spectral/kmeans.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index f5967b556c..b775a1f696 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -1140,7 +1140,7 @@ int computeLargestEigenvectors( (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 10357671d6..9e31c1ef5b 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -33,8 +33,8 @@ #include #include #include -#include #include +#include namespace { @@ -389,7 +389,7 @@ static int chooseNewCentroid(handle_t const& handle, value_type_t minSum{0}; CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(stream);//cudaStreamSynchronize(stream)); + CHECK_CUDA(stream); if (distsSum > minSum) { value_type_t vIndex = static_cast(n - 1); From 13aa96e6ead7def67d058612e859fce854d8ae03 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 13 Jul 2020 10:51:41 -0500 Subject: [PATCH 177/189] Addressed comments on in-place initializers and thrust exe policy dox. --- cpp/include/raft/spectral/kmeans.hpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 9e31c1ef5b..10670c2721 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -220,14 +220,13 @@ static __global__ void minDistances2(index_type_t n, index_type_t* __restrict__ codes_old, index_type_t code_new) { // Loop index - index_type_t i; + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; // Distances value_type_t dist_old_private; value_type_t dist_new_private; // Each row is processed by a thread - i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { // Get old and new distances dist_old_private = dists_old[i]; @@ -419,7 +418,8 @@ static int chooseNewCentroid(handle_t const& handle, * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy. + * @param thrust_exec_policy thrust execution policy + * (assumed to be same as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -526,7 +526,8 @@ static int initializeCentroids( * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy. + * @param thrust_exec_policy thrust execution policy + * (assumed to be same as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -602,7 +603,8 @@ static int assignCentroids( * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. 
- * @param thrust_exec_policy thrust execution policy. + * @param thrust_exec_policy thrust execution policy + * (assumed to be same as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -714,7 +716,8 @@ namespace raft { * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy. + * @param thrust_exec_policy thrust execution policy + * (assumed to be same as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -893,7 +896,8 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy. + * @param thrust_exec_policy thrust execution policy + * (assumed to be same as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. From d513d21474f851db64c68095db1ea47eeeb1c740 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 13 Jul 2020 11:15:38 -0500 Subject: [PATCH 178/189] Addressed comment on replacing 65535 by named constant. --- cpp/include/raft/spectral/kmeans.hpp | 36 ++++++++++++++++++---------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 10670c2721..08913d41e7 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -419,7 +419,7 @@ static int chooseNewCentroid(handle_t const& handle, * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy - * (assumed to be same as handle.stream). + * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -460,6 +460,8 @@ static int initializeCentroids( auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + constexpr index_type_t grid_lower_bound{65535}; + // ------------------------------------------------------- // Implementation // ------------------------------------------------------- @@ -468,11 +470,13 @@ static int initializeCentroids( dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; // CUDA grid dimensions - dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, 65535), 1, - min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535)}; + dim3 gridDim_warp{ + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; // CUDA grid dimensions - dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; + dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), + 1, 1}; // Assign observation vectors to code 0 CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); @@ -527,7 +531,7 @@ static int initializeCentroids( * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy - * (assumed to be same as handle.stream). 
+ * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -581,7 +585,7 @@ static int assignCentroids( blockDim.x = BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; - gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); gridDim.y = 1; gridDim.z = 1; minDistances<<>>(n, k, dists, codes, @@ -604,7 +608,7 @@ static int assignCentroids( * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy - * (assumed to be same as handle.stream). + * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -642,6 +646,8 @@ static int updateCentroids(handle_t const& handle, const value_type_t one = 1; const value_type_t zero = 0; + constexpr index_type_t grid_lower_bound{65535}; + auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -689,8 +695,9 @@ static int updateCentroids(handle_t const& handle, dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; // CUDA grid dimensions - dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, 65535), - min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535), 1}; + dim3 gridDim{ + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), 1}; divideCentroids<<>>(d, k, clusterSizes, centroids); @@ -717,7 +724,7 @@ namespace raft { * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy - * (assumed to be same as handle.stream). + * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -764,6 +771,8 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // Current iteration index_type_t iter; + constexpr index_type_t grid_lower_bound{65535}; + // Residual sum of squares at previous iteration value_type_t residualPrev = 0; @@ -790,8 +799,9 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; dim3 gridDim{ - min((d + WARP_SIZE - 1) / WARP_SIZE, 65535), 1, - min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535)}; + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, + min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), + grid_lower_bound)}; CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); computeDistances<<>>(n, d, 1, obs, centroids, @@ -897,7 +907,7 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy - * (assumed to be same as handle.stream). + * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. From e16b9c413511913a112179c63ce637db5ed83459 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 13 Jul 2020 19:21:17 -0500 Subject: [PATCH 179/189] Fixed a file inclusion style (use brackets) that was rejected in CI (quotes). 
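The named constant introduced in PATCH 178 above makes the intent of the min() clamps visible: 65535 is the CUDA limit on the y and z grid dimensions (and on x before compute capability 3.0), so the code conservatively caps every dimension and lets the kernels cover any remainder with a stride loop. (The name grid_lower_bound arguably describes the constant's role as the lower of the two min() arguments rather than the hardware limit itself.) A minimal sketch of the pattern, with an illustrative kernel name and block size rather than the k-means code's own:

#include <algorithm>
#include <cuda_runtime.h>

constexpr unsigned int block_size = 256;    // illustrative value
constexpr unsigned int grid_limit = 65535;  // the limit behind grid_lower_bound

__global__ void saxpy_like(unsigned int n, const float* x, float* y) {
  // Grid-stride loop: still covers all n elements even when the
  // grid was clamped to grid_limit blocks.
  for (unsigned int i = threadIdx.x + blockIdx.x * blockDim.x; i < n;
       i += blockDim.x * gridDim.x) {
    y[i] += x[i];
  }
}

void launch(unsigned int n, const float* x, float* y, cudaStream_t stream) {
  dim3 block{block_size, 1, 1};
  dim3 grid{std::min((n + block_size - 1) / block_size, grid_limit), 1, 1};
  saxpy_like<<<grid, block, 0, stream>>>(n, x, y);
}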
--- cpp/include/raft/spectral/modularity_maximization.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index a920eb39c9..f8dfe5daa3 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -32,11 +32,11 @@ #include #ifdef COLLECT_TIME_STATISTICS +#include #include #include #include #include -#include "cuda_profiler_api.h" #endif #ifdef COLLECT_TIME_STATISTICS From 5b84170a2a81b6dacbabc704652eaed8ed62f6be Mon Sep 17 00:00:00 2001 From: sean-frye Date: Tue, 14 Jul 2020 15:21:56 -0700 Subject: [PATCH 180/189] update docker image --- ci/local/build.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/local/build.sh b/ci/local/build.sh index 5bd81fa042..0a48711ef7 100644 --- a/ci/local/build.sh +++ b/ci/local/build.sh @@ -1,6 +1,9 @@ #!/bin/bash -DOCKER_IMAGE="gpuci/rapidsai-base:cuda10.0-ubuntu16.04-gcc5-py3.6" +GIT_DESCRIBE_TAG=`git describe --tags` +MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` + +DOCKER_IMAGE="gpuci/rapidsai:${MINOR_VERSION}-cuda10.1-devel-ubuntu16.04-py3.7" REPO_PATH=${PWD} RAPIDS_DIR_IN_CONTAINER="/rapids" CPP_BUILD_DIR="cuML/build" From 56367bfc8c256ca2596948531a1b0264b029a56b Mon Sep 17 00:00:00 2001 From: sean-frye Date: Tue, 14 Jul 2020 15:25:53 -0700 Subject: [PATCH 181/189] update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 744161e6d6..70a9a1d575 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ - PR #17: Make destructor inline to avoid redeclaration error - PR #25: Fix bug in handle_t::get_internal_streams - PR #26: Fix bug in RAFT_EXPECTS (add parentheses surrounding cond) +- PR #34 Fix issue with incorrect docker image being used in local build script # RAFT 0.14.0 (Date TBD) From 9411e44c95baf0ba93d4c9392dd81f52e2d7656c Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 14 Jul 2020 22:23:20 -0500 Subject: [PATCH 182/189] removing nccl.h from error.hpp --- cpp/include/raft/error.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 4955556eed..5b20469da2 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include From 08a9fec537f40e382fddffb3cfcf50c5ad6b0e6a Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 14 Jul 2020 22:25:27 -0500 Subject: [PATCH 183/189] updating changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d622dadb03..f31b865730 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ - PR #25: Fix bug in handle_t::get_internal_streams - PR #26: Fix bug in RAFT_EXPECTS (add parentheses surrounding cond) - PR #34 Fix issue with incorrect docker image being used in local build script +- PR #35: Remove #include from `raft/error.hpp` # RAFT 0.14.0 (Date TBD) From 12f6e6bf5cff70dac5afa589cd83ce30caad75e8 Mon Sep 17 00:00:00 2001 From: divyegala Date: Mon, 20 Jul 2020 18:24:08 -0500 Subject: [PATCH 184/189] removing cudart_utils include --- cpp/include/raft/mr/buffer_base.hpp | 1 - cpp/include/raft/mr/host/allocator.hpp | 1 - cpp/include/raft/mr/host/buffer.hpp | 1 - 3 files changed, 3 deletions(-) diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp index f1d74d4b24..01b8a4605d 100644 
--- a/cpp/include/raft/mr/buffer_base.hpp +++ b/cpp/include/raft/mr/buffer_base.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp index 9ad6ea7532..56c16123df 100644 --- a/cpp/include/raft/mr/host/allocator.hpp +++ b/cpp/include/raft/mr/host/allocator.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include namespace raft { diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp index c26617e072..3c505bf2ed 100644 --- a/cpp/include/raft/mr/host/buffer.hpp +++ b/cpp/include/raft/mr/host/buffer.hpp @@ -16,7 +16,6 @@ #pragma once -#include #include #include #include From 6e4b632407173ec28ecd3a35318dadb0d36a5d10 Mon Sep 17 00:00:00 2001 From: divyegala Date: Mon, 20 Jul 2020 18:27:13 -0500 Subject: [PATCH 185/189] updating Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f31b865730..776068e9e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ ## Improvements - PR #13: Add RMM_INCLUDE and RMM_LIBRARY options to allow linking to non-conda RMM - PR #22: Preserve order in comms workers for rank initialization +- PR #38: Remove #include from `raft/mr/` ## Bug Fixes - PR #17: Make destructor inline to avoid redeclaration error From 05d43d1a53cc68a289608eedda98ed3dd31ef98c Mon Sep 17 00:00:00 2001 From: divyegala Date: Mon, 20 Jul 2020 18:42:52 -0500 Subject: [PATCH 186/189] re-adding one --- cpp/include/raft/mr/buffer_base.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp index 01b8a4605d..f1d74d4b24 100644 --- a/cpp/include/raft/mr/buffer_base.hpp +++ b/cpp/include/raft/mr/buffer_base.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include From 82e2fc34e5e0da213cb11a1bce7529f24217fb27 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 21 Jul 2020 00:35:05 -0500 Subject: [PATCH 187/189] making handle and comms polymorphic --- cpp/include/raft/comms/comms.hpp | 6 +++++- cpp/include/raft/handle.hpp | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 2770341097..df48946dfe 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -16,7 +16,6 @@ #pragma once -#include #include namespace raft { @@ -134,6 +133,11 @@ class comms_t { ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); } + /** + * Virtual Destructor to enable polymorphism + */ + virtual ~comms_t() {} + /** * Returns the size of the communicator clique */ diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 9af2e916bd..b5cb9f0a15 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -68,7 +68,7 @@ class handle_t { } /** Destroys all held-up resources */ - ~handle_t() { destroy_resources(); } + virtual ~handle_t() { destroy_resources(); } int get_device() const { return dev_id_; } From 4a812329f301d75551e74f15329195df5b2ffacf Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 21 Jul 2020 00:38:21 -0500 Subject: [PATCH 188/189] updating changelog --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 776068e9e8..8a1c7ab508 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,13 +9,14 @@ ## Improvements - PR #13: Add RMM_INCLUDE and RMM_LIBRARY options to allow linking to non-conda RMM 
- PR #22: Preserve order in comms workers for rank initialization -- PR #38: Remove #include from `raft/mr/` +- PR #38: Remove #include from `raft/mr/` +- PR #39: Adding a virtual destructor to `raft::handle_t` and `raft::comms::comms_t` ## Bug Fixes - PR #17: Make destructor inline to avoid redeclaration error - PR #25: Fix bug in handle_t::get_internal_streams - PR #26: Fix bug in RAFT_EXPECTS (add parentheses surrounding cond) -- PR #34 Fix issue with incorrect docker image being used in local build script +- PR #34: Fix issue with incorrect docker image being used in local build script - PR #35: Remove #include from `raft/error.hpp` # RAFT 0.14.0 (Date TBD) From 6501c9430f81aa560cbf3f97012d63a8b74b97a1 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 21 Jul 2020 01:00:11 -0500 Subject: [PATCH 189/189] re-add include --- cpp/include/raft/comms/comms.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index df48946dfe..50c2d7260d 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include namespace raft {