diff --git a/cpp/include/raft/common/detail/nvtx.cuh b/cpp/include/raft/common/detail/nvtx.cuh new file mode 100644 index 0000000000..c4df6d5554 --- /dev/null +++ b/cpp/include/raft/common/detail/nvtx.cuh @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace common { +namespace detail { + +#ifdef NVTX_ENABLED + +#include +#include +#include +#include +#include +#include + +/** + * @brief An internal struct to store associated state with the color + * generator + */ +struct ColorGenState { + /** collection of all tagged colors generated so far */ + static inline std::unordered_map allColors; + /** mutex for accessing the above map */ + static inline std::mutex mapMutex; + /** saturation */ + static inline constexpr float S = 0.9f; + /** value */ + static inline constexpr float V = 0.85f; + /** golden ratio */ + static inline constexpr float Phi = 1.61803f; + /** inverse golden ratio */ + static inline constexpr float InvPhi = 1.f / Phi; +}; + +// all h, s, v are in range [0, 1] +// Ref: http://en.wikipedia.org/wiki/HSL_and_HSV#Converting_to_RGB +inline uint32_t hsv2rgb(float h, float s, float v) +{ + uint32_t out = 0xff000000u; + if (s <= 0.0f) { return out; } + // convert hue from [0, 1] range to [0, 360] + float h_deg = h * 360.f; + if (0.f > h_deg || h_deg >= 360.f) h_deg = 0.f; + h_deg /= 60.f; + int h_range = (int)h_deg; + float h_mod = h_deg - h_range; + float x = v * (1.f - s); + float y = v * (1.f - (s * h_mod)); + float z = v * (1.f - (s * (1.f - h_mod))); + float r, g, b; + switch (h_range) { + case 0: + r = v; + g = z; + b = x; + break; + case 1: + r = y; + g = v; + b = x; + break; + case 2: + r = x; + g = v; + b = z; + break; + case 3: + r = x; + g = y; + b = v; + break; + case 4: + r = z; + g = x; + b = v; + break; + case 5: + default: + r = v; + g = x; + b = y; + break; + } + out |= (uint32_t(r * 256.f) << 16); + out |= (uint32_t(g * 256.f) << 8); + out |= uint32_t(b * 256.f); + return out; +} + +/** + * @brief Helper method to generate 'visually distinct' colors. + * Inspired from https://martin.ankerl.com/2009/12/09/how-to-create-random-colors-programmatically/ + * However, if an associated tag is passed, it will look up in its history for + * any generated color against this tag and if found, just returns it, else + * generates a new color, assigns a tag to it and stores it for future usage. + * Such a thing is very useful for nvtx markers where the ranges associated + * with a specific tag should ideally get the same color for the purpose of + * visualizing it on nsight-systems timeline. + * @param tag look for any previously generated colors with this tag or + * associate the currently generated color with it + * @return returns 32b RGB integer with alpha channel set of 0xff + */ +inline uint32_t generateNextColor(const std::string& tag) +{ + // std::unordered_map ColorGenState::allColors; + // std::mutex ColorGenState::mapMutex; + + std::lock_guard guard(ColorGenState::mapMutex); + if (!tag.empty()) { + auto itr = ColorGenState::allColors.find(tag); + if (itr != ColorGenState::allColors.end()) { return itr->second; } + } + float h = rand() * 1.f / RAND_MAX; + h += ColorGenState::InvPhi; + if (h >= 1.f) h -= 1.f; + auto rgb = hsv2rgb(h, ColorGenState::S, ColorGenState::V); + if (!tag.empty()) { ColorGenState::allColors[tag] = rgb; } + return rgb; +} + +static inline nvtxDomainHandle_t domain = nvtxDomainCreateA("raft"); + +inline void pushRange_name(const char* name) +{ + nvtxEventAttributes_t eventAttrib = {0}; + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = generateNextColor(name); + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.message.ascii = name; + nvtxDomainRangePushEx(domain, &eventAttrib); +} + +template +inline void pushRange(const char* format, Args... args) +{ + if constexpr (sizeof...(args) > 0) { + int length = std::snprintf(nullptr, 0, format, args...); + assert(length >= 0); + auto buf = std::make_unique(length + 1); + std::snprintf(buf.get(), length + 1, format, args...); + pushRange_name(buf.get()); + } else { + pushRange_name(format); + } +} + +template +inline void pushRange(rmm::cuda_stream_view stream, const char* format, Args... args) +{ + stream.synchronize(); + pushRange(format, args...); +} + +inline void popRange() { nvtxDomainRangePop(domain); } + +inline void popRange(rmm::cuda_stream_view stream) +{ + stream.synchronize(); + popRange(); +} + +#else // NVTX_ENABLED + +template +inline void pushRange(const char* format, Args... args) +{ +} + +template +inline void pushRange(rmm::cuda_stream_view stream, const char* format, Args... args) +{ +} + +inline void popRange() {} + +inline void popRange(rmm::cuda_stream_view stream) {} + +#endif // NVTX_ENABLED + +} // namespace detail +} // namespace common +} // namespace raft diff --git a/cpp/include/raft/common/nvtx.hpp b/cpp/include/raft/common/nvtx.hpp new file mode 100644 index 0000000000..8489fd749b --- /dev/null +++ b/cpp/include/raft/common/nvtx.hpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/nvtx.cuh" + +namespace raft { +namespace common { + +/** + * @brief Push a named nvtx range + * @param format range name format (accepts printf-style arguments) + * @param args the arguments for the printf-style formatting + */ +template +inline void PUSH_RANGE(const char* format, Args... args) +{ + detail::pushRange(format, args...); +} + +/** + * @brief Synchronize CUDA stream and push a named nvtx range + * @param format range name format (accepts printf-style arguments) + * @param args the arguments for the printf-style formatting + * @param stream stream to synchronize + */ +template +inline void PUSH_RANGE(rmm::cuda_stream_view stream, const char* format, Args... args) +{ + detail::pushRange(stream, format, args...); +} + +/** Pop the latest range */ +inline void POP_RANGE() { detail::popRange(); } + +/** + * @brief Synchronize CUDA stream and pop the latest nvtx range + * @param stream stream to synchronize + */ +inline void POP_RANGE(rmm::cuda_stream_view stream) { detail::popRange(stream); } + +/** Push a named nvtx range that would be popped at the end of the object lifetime. */ +class AUTO_RANGE { + private: + std::optional streamMaybe; + + /* This object is not meant to be touched. */ + AUTO_RANGE(const AUTO_RANGE&) = delete; + AUTO_RANGE(AUTO_RANGE&&) = delete; + AUTO_RANGE& operator=(const AUTO_RANGE&) = delete; + AUTO_RANGE& operator=(AUTO_RANGE&&) = delete; + + public: + /** + * Synchronize CUDA stream and push a named nvtx range + * At the end of the object lifetime, synchronize again and pop the range. + * + * @param stream stream to synchronize + * @param format range name format (accepts printf-style arguments) + * @param args the arguments for the printf-style formatting + */ + template + AUTO_RANGE(rmm::cuda_stream_view stream, const char* format, Args... args) + : streamMaybe(std::make_optional(stream)) + { + PUSH_RANGE(stream, format, args...); + } + + /** + * Push a named nvtx range. + * At the end of the object lifetime, pop the range back. + * + * @param format range name format (accepts printf-style arguments) + * @param args the arguments for the printf-style formatting + */ + template + AUTO_RANGE(const char* format, Args... args) : streamMaybe(std::nullopt) + { + PUSH_RANGE(format, args...); + } + + ~AUTO_RANGE() + { + if (streamMaybe.has_value()) + POP_RANGE(streamMaybe.value()); + else + POP_RANGE(); + } +}; + +/*! + \def RAFT_USING_RANGE(...) + When NVTX is enabled, push a named nvtx range and pop it at the end of the enclosing code block. + + This macro initializes a dummy AUTO_RANGE variable on the stack, + which pushes the range in its constructor and pops it in the destructor. +*/ +#ifdef NVTX_ENABLED +#define RAFT_USING_RANGE(...) raft::common::AUTO_RANGE _AUTO_RANGE_##__LINE__(__VA_ARGS__) +#else +#define RAFT_USING_RANGE(...) (void)0 +#endif + +} // namespace common +} // namespace raft diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh index b228ac5499..2d25b85a50 100644 --- a/cpp/include/raft/common/scatter.cuh +++ b/cpp/include/raft/common/scatter.cuh @@ -46,7 +46,7 @@ void scatterImpl( { const IdxT nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxT)TPB); scatterKernel<<>>(out, in, idx, len, op); - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } /** diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp index 3091cd53a9..413763c07f 100644 --- a/cpp/include/raft/comms/mpi_comms.hpp +++ b/cpp/include/raft/comms/mpi_comms.hpp @@ -32,7 +32,7 @@ #include #include -#define MPI_TRY(call) \ +#define RAFT_MPI_TRY(call) \ do { \ int status = call; \ if (MPI_SUCCESS != status) { \ @@ -44,7 +44,12 @@ } \ } while (0) -#define MPI_TRY_NO_THROW(call) \ +// FIXME: Remove after consumer rename +#ifndef MPI_TRY +#define MPI_TRY(call) RAFT_MPI_TRY(call) +#endif + +#define RAFT_MPI_TRY_NO_THROW(call) \ do { \ int status = call; \ if (MPI_SUCCESS != status) { \ @@ -59,6 +64,11 @@ } \ } while (0) +// FIXME: Remove after consumer rename +#ifndef MPI_TRY_NO_THROW +#define MPI_TRY_NO_THROW(call) RAFT_MPI_TRY_NO_THROW(call) +#endif + namespace raft { namespace comms { @@ -98,24 +108,24 @@ class mpi_comms : public comms_iface { : owns_mpi_comm_(owns_mpi_comm), mpi_comm_(comm), size_(0), rank_(1), next_request_id_(0) { int mpi_is_initialized = 0; - MPI_TRY(MPI_Initialized(&mpi_is_initialized)); + RAFT_MPI_TRY(MPI_Initialized(&mpi_is_initialized)); RAFT_EXPECTS(mpi_is_initialized, "ERROR: MPI is not initialized!"); - MPI_TRY(MPI_Comm_size(mpi_comm_, &size_)); - MPI_TRY(MPI_Comm_rank(mpi_comm_, &rank_)); + RAFT_MPI_TRY(MPI_Comm_size(mpi_comm_, &size_)); + RAFT_MPI_TRY(MPI_Comm_rank(mpi_comm_, &rank_)); // get NCCL unique ID at rank 0 and broadcast it to all others ncclUniqueId id; - if (0 == rank_) NCCL_TRY(ncclGetUniqueId(&id)); - MPI_TRY(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, mpi_comm_)); + if (0 == rank_) RAFT_NCCL_TRY(ncclGetUniqueId(&id)); + RAFT_MPI_TRY(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, mpi_comm_)); // initializing NCCL - NCCL_TRY(ncclCommInitRank(&nccl_comm_, size_, id, rank_)); + RAFT_NCCL_TRY(ncclCommInitRank(&nccl_comm_, size_, id, rank_)); } virtual ~mpi_comms() { // finalizing NCCL - NCCL_TRY_NO_THROW(ncclCommDestroy(nccl_comm_)); - if (owns_mpi_comm_) { MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_)); } + RAFT_NCCL_TRY_NO_THROW(ncclCommDestroy(nccl_comm_)); + if (owns_mpi_comm_) { RAFT_MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_)); } } int get_size() const { return size_; } @@ -125,11 +135,11 @@ class mpi_comms : public comms_iface { std::unique_ptr comm_split(int color, int key) const { MPI_Comm new_comm; - MPI_TRY(MPI_Comm_split(mpi_comm_, color, key, &new_comm)); + RAFT_MPI_TRY(MPI_Comm_split(mpi_comm_, color, key, &new_comm)); return std::unique_ptr(new mpi_comms(new_comm, true)); } - void barrier() const { MPI_TRY(MPI_Barrier(mpi_comm_)); } + void barrier() const { RAFT_MPI_TRY(MPI_Barrier(mpi_comm_)); } void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const { @@ -142,7 +152,7 @@ class mpi_comms : public comms_iface { req_id = *it; free_requests_.erase(it); } - MPI_TRY(MPI_Isend(buf, size, MPI_BYTE, dest, tag, mpi_comm_, &mpi_req)); + RAFT_MPI_TRY(MPI_Isend(buf, size, MPI_BYTE, dest, tag, mpi_comm_, &mpi_req)); requests_in_flight_.insert(std::make_pair(req_id, mpi_req)); *request = req_id; } @@ -159,7 +169,7 @@ class mpi_comms : public comms_iface { free_requests_.erase(it); } - MPI_TRY(MPI_Irecv(buf, size, MPI_BYTE, source, tag, mpi_comm_, &mpi_req)); + RAFT_MPI_TRY(MPI_Irecv(buf, size, MPI_BYTE, source, tag, mpi_comm_, &mpi_req)); requests_in_flight_.insert(std::make_pair(req_id, mpi_req)); *request = req_id; } @@ -177,7 +187,7 @@ class mpi_comms : public comms_iface { free_requests_.insert(req_it->first); requests_in_flight_.erase(req_it); } - MPI_TRY(MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE)); + RAFT_MPI_TRY(MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE)); } void allreduce(const void* sendbuff, @@ -187,13 +197,13 @@ class mpi_comms : public comms_iface { op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclAllReduce( + RAFT_NCCL_TRY(ncclAllReduce( sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); } void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const { - NCCL_TRY( + RAFT_NCCL_TRY( ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } @@ -204,7 +214,7 @@ class mpi_comms : public comms_iface { int root, cudaStream_t stream) const { - NCCL_TRY(ncclBroadcast( + RAFT_NCCL_TRY(ncclBroadcast( sendbuff, recvbuff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } @@ -216,14 +226,14 @@ class mpi_comms : public comms_iface { int root, cudaStream_t stream) const { - NCCL_TRY(ncclReduce(sendbuff, - recvbuff, - count, - get_nccl_datatype(datatype), - get_nccl_op(op), - root, - nccl_comm_, - stream)); + RAFT_NCCL_TRY(ncclReduce(sendbuff, + recvbuff, + count, + get_nccl_datatype(datatype), + get_nccl_op(op), + root, + nccl_comm_, + stream)); } void allgather(const void* sendbuff, @@ -232,7 +242,7 @@ class mpi_comms : public comms_iface { datatype_t datatype, cudaStream_t stream) const { - NCCL_TRY(ncclAllGather( + RAFT_NCCL_TRY(ncclAllGather( sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream)); } @@ -246,7 +256,7 @@ class mpi_comms : public comms_iface { // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4. for (int root = 0; root < size_; ++root) { - NCCL_TRY( + RAFT_NCCL_TRY( ncclBroadcast(sendbuf, static_cast(recvbuf) + displs[root] * get_datatype_size(datatype), recvcounts[root], @@ -265,19 +275,20 @@ class mpi_comms : public comms_iface { cudaStream_t stream) const { size_t dtype_size = get_datatype_size(datatype); - NCCL_TRY(ncclGroupStart()); + RAFT_NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv(static_cast(recvbuff) + sendcount * r * dtype_size, - sendcount, - get_nccl_datatype(datatype), - r, - nccl_comm_, - stream)); + RAFT_NCCL_TRY(ncclRecv(static_cast(recvbuff) + sendcount * r * dtype_size, + sendcount, + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); - NCCL_TRY(ncclGroupEnd()); + RAFT_NCCL_TRY( + ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); + RAFT_NCCL_TRY(ncclGroupEnd()); } void gatherv(const void* sendbuff, @@ -290,19 +301,20 @@ class mpi_comms : public comms_iface { cudaStream_t stream) const { size_t dtype_size = get_datatype_size(datatype); - NCCL_TRY(ncclGroupStart()); + RAFT_NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, - recvcounts[r], - get_nccl_datatype(datatype), - r, - nccl_comm_, - stream)); + RAFT_NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, + recvcounts[r], + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); - NCCL_TRY(ncclGroupEnd()); + RAFT_NCCL_TRY( + ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); + RAFT_NCCL_TRY(ncclGroupEnd()); } void reducescatter(const void* sendbuff, @@ -312,13 +324,13 @@ class mpi_comms : public comms_iface { op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclReduceScatter(sendbuff, - recvbuff, - recvcount, - get_nccl_datatype(datatype), - get_nccl_op(op), - nccl_comm_, - stream)); + RAFT_NCCL_TRY(ncclReduceScatter(sendbuff, + recvbuff, + recvcount, + get_nccl_datatype(datatype), + get_nccl_op(op), + nccl_comm_, + stream)); } status_t sync_stream(cudaStream_t stream) const @@ -357,13 +369,13 @@ class mpi_comms : public comms_iface { // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const { - NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream)); + RAFT_NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream)); } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const { - NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream)); + RAFT_NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream)); } void device_sendrecv(const void* sendbuf, @@ -375,10 +387,10 @@ class mpi_comms : public comms_iface { cudaStream_t stream) const { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock - NCCL_TRY(ncclGroupStart()); - NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream)); - NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); - NCCL_TRY(ncclGroupEnd()); + RAFT_NCCL_TRY(ncclGroupStart()); + RAFT_NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream)); + RAFT_NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); + RAFT_NCCL_TRY(ncclGroupEnd()); } void device_multicast_sendrecv(const void* sendbuf, @@ -392,24 +404,24 @@ class mpi_comms : public comms_iface { cudaStream_t stream) const { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock - NCCL_TRY(ncclGroupStart()); + RAFT_NCCL_TRY(ncclGroupStart()); for (size_t i = 0; i < sendsizes.size(); ++i) { - NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], - sendsizes[i], - ncclUint8, - dests[i], - nccl_comm_, - stream)); + RAFT_NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], + sendsizes[i], + ncclUint8, + dests[i], + nccl_comm_, + stream)); } for (size_t i = 0; i < recvsizes.size(); ++i) { - NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], - recvsizes[i], - ncclUint8, - sources[i], - nccl_comm_, - stream)); + RAFT_NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], + recvsizes[i], + ncclUint8, + sources[i], + nccl_comm_, + stream)); } - NCCL_TRY(ncclGroupEnd()); + RAFT_NCCL_TRY(ncclGroupEnd()); } private: diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 1647c29667..6afb0f56c6 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -130,7 +130,7 @@ class std_comms : public comms_iface { update_host(h_colors.data(), d_colors.data(), get_size(), stream_); update_host(h_keys.data(), d_keys.data(), get_size(), stream_); - CUDA_CHECK(cudaStreamSynchronize(stream_)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_)); std::vector subcomm_ranks{}; std::vector new_ucx_ptrs{}; @@ -144,7 +144,7 @@ class std_comms : public comms_iface { ncclUniqueId id{}; if (get_rank() == subcomm_ranks[0]) { // root of the new subcommunicator - NCCL_TRY(ncclGetUniqueId(&id)); + RAFT_NCCL_TRY(ncclGetUniqueId(&id)); std::vector requests(subcomm_ranks.size() - 1); for (size_t i = 1; i < subcomm_ranks.size(); ++i) { isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color, requests.data() + (i - 1)); @@ -159,7 +159,7 @@ class std_comms : public comms_iface { barrier(); ncclComm_t nccl_comm; - NCCL_TRY(ncclCommInitRank(&nccl_comm, subcomm_ranks.size(), id, key)); + RAFT_NCCL_TRY(ncclCommInitRank(&nccl_comm, subcomm_ranks.size(), id, key)); if (ucp_worker_ != nullptr && subcomms_ucp_) { auto eps_sp = std::make_shared(new_ucx_ptrs.data()); @@ -178,8 +178,8 @@ class std_comms : public comms_iface { void barrier() const { - CUDA_CHECK(cudaMemsetAsync(sendbuff_, 1, sizeof(int), stream_)); - CUDA_CHECK(cudaMemsetAsync(recvbuff_, 1, sizeof(int), stream_)); + RAFT_CUDA_TRY(cudaMemsetAsync(sendbuff_, 1, sizeof(int), stream_)); + RAFT_CUDA_TRY(cudaMemsetAsync(recvbuff_, 1, sizeof(int), stream_)); allreduce(sendbuff_, recvbuff_, 1, datatype_t::INT32, op_t::SUM, stream_); @@ -304,13 +304,13 @@ class std_comms : public comms_iface { op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclAllReduce( + RAFT_NCCL_TRY(ncclAllReduce( sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); } void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const { - NCCL_TRY( + RAFT_NCCL_TRY( ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } @@ -321,7 +321,7 @@ class std_comms : public comms_iface { int root, cudaStream_t stream) const { - NCCL_TRY(ncclBroadcast( + RAFT_NCCL_TRY(ncclBroadcast( sendbuff, recvbuff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } @@ -333,14 +333,14 @@ class std_comms : public comms_iface { int root, cudaStream_t stream) const { - NCCL_TRY(ncclReduce(sendbuff, - recvbuff, - count, - get_nccl_datatype(datatype), - get_nccl_op(op), - root, - nccl_comm_, - stream)); + RAFT_NCCL_TRY(ncclReduce(sendbuff, + recvbuff, + count, + get_nccl_datatype(datatype), + get_nccl_op(op), + root, + nccl_comm_, + stream)); } void allgather(const void* sendbuff, @@ -349,7 +349,7 @@ class std_comms : public comms_iface { datatype_t datatype, cudaStream_t stream) const { - NCCL_TRY(ncclAllGather( + RAFT_NCCL_TRY(ncclAllGather( sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream)); } @@ -364,13 +364,13 @@ class std_comms : public comms_iface { // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4. for (int root = 0; root < num_ranks_; ++root) { size_t dtype_size = get_datatype_size(datatype); - NCCL_TRY(ncclBroadcast(sendbuf, - static_cast(recvbuf) + displs[root] * dtype_size, - recvcounts[root], - get_nccl_datatype(datatype), - root, - nccl_comm_, - stream)); + RAFT_NCCL_TRY(ncclBroadcast(sendbuf, + static_cast(recvbuf) + displs[root] * dtype_size, + recvcounts[root], + get_nccl_datatype(datatype), + root, + nccl_comm_, + stream)); } } @@ -382,19 +382,20 @@ class std_comms : public comms_iface { cudaStream_t stream) const { size_t dtype_size = get_datatype_size(datatype); - NCCL_TRY(ncclGroupStart()); + RAFT_NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv(static_cast(recvbuff) + sendcount * r * dtype_size, - sendcount, - get_nccl_datatype(datatype), - r, - nccl_comm_, - stream)); + RAFT_NCCL_TRY(ncclRecv(static_cast(recvbuff) + sendcount * r * dtype_size, + sendcount, + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); - NCCL_TRY(ncclGroupEnd()); + RAFT_NCCL_TRY( + ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); + RAFT_NCCL_TRY(ncclGroupEnd()); } void gatherv(const void* sendbuff, @@ -407,19 +408,20 @@ class std_comms : public comms_iface { cudaStream_t stream) const { size_t dtype_size = get_datatype_size(datatype); - NCCL_TRY(ncclGroupStart()); + RAFT_NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, - recvcounts[r], - get_nccl_datatype(datatype), - r, - nccl_comm_, - stream)); + RAFT_NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, + recvcounts[r], + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); - NCCL_TRY(ncclGroupEnd()); + RAFT_NCCL_TRY( + ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); + RAFT_NCCL_TRY(ncclGroupEnd()); } void reducescatter(const void* sendbuff, @@ -429,13 +431,13 @@ class std_comms : public comms_iface { op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclReduceScatter(sendbuff, - recvbuff, - recvcount, - get_nccl_datatype(datatype), - get_nccl_op(op), - nccl_comm_, - stream)); + RAFT_NCCL_TRY(ncclReduceScatter(sendbuff, + recvbuff, + recvcount, + get_nccl_datatype(datatype), + get_nccl_op(op), + nccl_comm_, + stream)); } status_t sync_stream(cudaStream_t stream) const @@ -474,13 +476,13 @@ class std_comms : public comms_iface { // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const { - NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream)); + RAFT_NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream)); } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const { - NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream)); + RAFT_NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream)); } void device_sendrecv(const void* sendbuf, @@ -492,10 +494,10 @@ class std_comms : public comms_iface { cudaStream_t stream) const { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock - NCCL_TRY(ncclGroupStart()); - NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream)); - NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); - NCCL_TRY(ncclGroupEnd()); + RAFT_NCCL_TRY(ncclGroupStart()); + RAFT_NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream)); + RAFT_NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); + RAFT_NCCL_TRY(ncclGroupEnd()); } void device_multicast_sendrecv(const void* sendbuf, @@ -509,24 +511,24 @@ class std_comms : public comms_iface { cudaStream_t stream) const { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock - NCCL_TRY(ncclGroupStart()); + RAFT_NCCL_TRY(ncclGroupStart()); for (size_t i = 0; i < sendsizes.size(); ++i) { - NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], - sendsizes[i], - ncclUint8, - dests[i], - nccl_comm_, - stream)); + RAFT_NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], + sendsizes[i], + ncclUint8, + dests[i], + nccl_comm_, + stream)); } for (size_t i = 0; i < recvsizes.size(); ++i) { - NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], - recvsizes[i], - ncclUint8, - sources[i], - nccl_comm_, - stream)); + RAFT_NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], + recvsizes[i], + ncclUint8, + sources[i], + nccl_comm_, + stream)); } - NCCL_TRY(ncclGroupEnd()); + RAFT_NCCL_TRY(ncclGroupEnd()); } private: diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp index 5f87bf41fa..93b57b13a0 100644 --- a/cpp/include/raft/comms/test.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -46,13 +46,13 @@ bool test_collective_allreduce(const handle_t& handle, int root) cudaStream_t stream = handle.get_stream(); rmm::device_scalar temp_d(stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); communicator.allreduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, stream); int temp_h = 0; - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); communicator.barrier(); std::cout << "Clique size: " << communicator.get_size() << std::endl; @@ -79,13 +79,15 @@ bool test_collective_broadcast(const handle_t& handle, int root) rmm::device_scalar temp_d(stream); if (communicator.get_rank() == root) - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); + RAFT_CUDA_TRY( + cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.bcast(temp_d.data(), 1, root, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY( + cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); communicator.barrier(); std::cout << "Clique size: " << communicator.get_size() << std::endl; @@ -111,13 +113,14 @@ bool test_collective_reduce(const handle_t& handle, int root) rmm::device_scalar temp_d(stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.reduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, root, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY( + cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); communicator.barrier(); std::cout << "Clique size: " << communicator.get_size() << std::endl; @@ -147,14 +150,14 @@ bool test_collective_allgather(const handle_t& handle, int root) rmm::device_scalar temp_d(stream); rmm::device_uvector recv_d(communicator.get_size(), stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.allgather(temp_d.data(), recv_d.data(), 1, stream); communicator.sync_stream(stream); int temp_h[communicator.get_size()]; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync( + RAFT_CUDA_TRY(cudaMemcpyAsync( &temp_h, recv_d.data(), sizeof(int) * communicator.get_size(), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); communicator.barrier(); std::cout << "Clique size: " << communicator.get_size() << std::endl; @@ -185,16 +188,16 @@ bool test_collective_gather(const handle_t& handle, int root) rmm::device_uvector recv_d(communicator.get_rank() == root ? communicator.get_size() : 0, stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.gather(temp_d.data(), recv_d.data(), 1, root, stream); communicator.sync_stream(stream); if (communicator.get_rank() == root) { std::vector temp_h(communicator.get_size(), 0); - CUDA_CHECK(cudaMemcpyAsync( + RAFT_CUDA_TRY(cudaMemcpyAsync( temp_h.data(), recv_d.data(), sizeof(int) * temp_h.size(), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); i++) { if (temp_h[i] != i) return false; @@ -229,7 +232,7 @@ bool test_collective_gatherv(const handle_t& handle, int root) rmm::device_uvector recv_d(communicator.get_rank() == root ? displacements.back() : 0, stream); - CUDA_CHECK(cudaMemcpyAsync( + RAFT_CUDA_TRY(cudaMemcpyAsync( temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.gatherv( @@ -244,12 +247,12 @@ bool test_collective_gatherv(const handle_t& handle, int root) if (communicator.get_rank() == root) { std::vector temp_h(displacements.back(), 0); - CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), - recv_d.data(), - sizeof(int) * displacements.back(), - cudaMemcpyDeviceToHost, - stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(temp_h.data(), + recv_d.data(), + sizeof(int) * displacements.back(), + cudaMemcpyDeviceToHost, + stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); i++) { if (std::count_if(temp_h.begin() + displacements[i], @@ -280,14 +283,15 @@ bool test_collective_reducescatter(const handle_t& handle, int root) rmm::device_uvector temp_d(sends.size(), stream); rmm::device_scalar recv_d(stream); - CUDA_CHECK(cudaMemcpyAsync( + RAFT_CUDA_TRY(cudaMemcpyAsync( temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY( + cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); communicator.barrier(); std::cout << "Clique size: " << communicator.get_size() << std::endl; diff --git a/cpp/include/raft/comms/util.hpp b/cpp/include/raft/comms/util.hpp index 1b0548fc00..ef16773c75 100644 --- a/cpp/include/raft/comms/util.hpp +++ b/cpp/include/raft/comms/util.hpp @@ -26,7 +26,7 @@ * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an * exception detailing the NCCL error that occurred */ -#define NCCL_TRY(call) \ +#define RAFT_NCCL_TRY(call) \ do { \ ncclResult_t const status = (call); \ if (ncclSuccess != status) { \ @@ -41,7 +41,12 @@ } \ } while (0); -#define NCCL_TRY_NO_THROW(call) \ +// FIXME: Remove after consumer rename +#ifndef NCCL_TRY +#define NCCL_TRY(call) RAFT_NCCL_TRY(call) +#endif + +#define RAFT_NCCL_TRY_NO_THROW(call) \ do { \ ncclResult_t status = call; \ if (ncclSuccess != status) { \ @@ -49,6 +54,11 @@ } \ } while (0) +// FIXME: Remove after consumer rename +#ifndef NCCL_TRY_NO_THROW +#define NCCL_TRY_NO_THROW(call) RAFT_NCCL_TRY_NO_THROW(call) +#endif + namespace raft { namespace comms { diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index cf06416a96..1464cd070e 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -53,8 +53,7 @@ struct cuda_error : public raft::exception { * exception detailing the CUDA error that occurred * */ -#ifndef CUDA_TRY -#define CUDA_TRY(call) \ +#define RAFT_CUDA_TRY(call) \ do { \ cudaError_t const status = call; \ if (status != cudaSuccess) { \ @@ -69,7 +68,12 @@ struct cuda_error : public raft::exception { throw raft::cuda_error(msg); \ } \ } while (0) + +// FIXME: Remove after consumers rename +#ifndef CUDA_TRY +#define CUDA_TRY(call) RAFT_CUDA_TRY(call) #endif + /** * @brief Debug macro to check for CUDA errors * @@ -84,23 +88,26 @@ struct cuda_error : public raft::exception { * asynchronous kernel launch. */ #ifndef NDEBUG -#define CHECK_CUDA(stream) CUDA_TRY(cudaStreamSynchronize(stream)); +#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); #else -#define CHECK_CUDA(stream) CUDA_TRY(cudaPeekAtLastError()); +#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaPeekAtLastError()); #endif -/** FIXME: temporary alias for cuML compatibility */ +// FIXME: Remove after consumers rename +#ifndef CHECK_CUDA +#define CHECK_CUDA(call) RAFT_CHECK_CUDA(call) +#endif + +/** FIXME: remove after cuml rename */ #ifndef CUDA_CHECK -#define CUDA_CHECK(call) CUDA_TRY(call) +#define CUDA_CHECK(call) RAFT_CUDA_TRY(call) #endif -///@todo: enable this only after we have added logging support in raft // /** // * @brief check for cuda runtime API errors but log error instead of raising // * exception. // */ -#ifndef CUDA_CHECK_NO_THROW -#define CUDA_CHECK_NO_THROW(call) \ +#define RAFT_CUDA_TRY_NO_THROW(call) \ do { \ cudaError_t const status = call; \ if (cudaSuccess != status) { \ @@ -111,6 +118,10 @@ struct cuda_error : public raft::exception { cudaGetErrorString(status)); \ } \ } while (0) + +// FIXME: Remove after cuml rename +#ifndef CUDA_CHECK_NO_THROW +#define CUDA_CHECK_NO_THROW(call) RAFT_CHECK_CUDA_NO_THROW(call) #endif /** @@ -118,8 +129,6 @@ struct cuda_error : public raft::exception { * TODO: Rename original implementations in 22.04 to fix * https://github.com/rapidsai/raft/issues/128 */ -#define RAFT_CUDA_CHECK(call) CUDA_CHECK(call) -#define RAFT_CUDA_CHECK_NO_THROW(call) CUDA_CHECK_NO_THROW(call) namespace raft { @@ -328,9 +337,9 @@ inline void deallocate_all(rmm::cuda_stream_view stream) inline int getSharedMemPerBlock() { int devId; - CUDA_CHECK(cudaGetDevice(&devId)); + RAFT_CUDA_TRY(cudaGetDevice(&devId)); int smemPerBlk; - CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); return smemPerBlk; } @@ -338,9 +347,9 @@ inline int getSharedMemPerBlock() inline int getMultiProcessorCount() { int devId; - CUDA_CHECK(cudaGetDevice(&devId)); + RAFT_CUDA_TRY(cudaGetDevice(&devId)); int mpCount; - CUDA_CHECK(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); return mpCount; } @@ -352,7 +361,7 @@ std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t strea T* arr_h = (T*)malloc(size * sizeof(T)); update_host(arr_h, arr, size, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); ss << name << " = [ "; for (int i = 0; i < size; i++) { diff --git a/cpp/include/raft/distance/detail/canberra.cuh b/cpp/include/raft/distance/detail/canberra.cuh index 46edf0bf47..6be994b80a 100644 --- a/cpp/include/raft/distance/detail/canberra.cuh +++ b/cpp/include/raft/distance/detail/canberra.cuh @@ -118,7 +118,7 @@ static void canberraImpl(const DataT* x, x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } template distImpl; distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } /** @@ -635,7 +635,7 @@ void distance(const InType* x, auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) { return d_val; }; distance( x, y, dist, m, n, k, workspace, worksize, default_fin_op, stream, isRowMajor, metric_arg); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } /** diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh index 1166543f8c..a8deb8df24 100644 --- a/cpp/include/raft/distance/detail/euclidean.cuh +++ b/cpp/include/raft/distance/detail/euclidean.cuh @@ -138,7 +138,7 @@ void euclideanExpImpl(const DataT* x, x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } template <<>>(min, m, maxVal, redOp); - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } auto fin_op = [] __device__(DataT d_val, int g_d_idx) { return d_val; }; @@ -328,7 +328,7 @@ void fusedL2NNImpl(OutT* min, min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op); } - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } } // namespace detail diff --git a/cpp/include/raft/distance/detail/hamming.cuh b/cpp/include/raft/distance/detail/hamming.cuh index 886b9d1426..bed9d09e3e 100644 --- a/cpp/include/raft/distance/detail/hamming.cuh +++ b/cpp/include/raft/distance/detail/hamming.cuh @@ -123,7 +123,7 @@ static void hammingUnexpandedImpl(const DataT* x, x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } template (m, P::Mblk); diff --git a/cpp/include/raft/distance/detail/russell_rao.cuh b/cpp/include/raft/distance/detail/russell_rao.cuh index d4fbb039e7..5d516e7830 100644 --- a/cpp/include/raft/distance/detail/russell_rao.cuh +++ b/cpp/include/raft/distance/detail/russell_rao.cuh @@ -124,7 +124,7 @@ static void russellRaoImpl(const DataT* x, x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } template int { int cur_dev = -1; - CUDA_CHECK(cudaGetDevice(&cur_dev)); + RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); return cur_dev; }()) { @@ -104,7 +104,7 @@ class handle_t { { std::lock_guard _(mutex_); if (!cublas_initialized_) { - CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + RAFT_CUBLAS_TRY(cublasCreate(&cublas_handle_)); cublas_initialized_ = true; } return cublas_handle_; @@ -114,7 +114,7 @@ class handle_t { { std::lock_guard _(mutex_); if (!cusolver_dn_initialized_) { - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_dn_handle_)); + RAFT_CUSOLVER_TRY(cusolverDnCreate(&cusolver_dn_handle_)); cusolver_dn_initialized_ = true; } return cusolver_dn_handle_; @@ -124,7 +124,7 @@ class handle_t { { std::lock_guard _(mutex_); if (!cusolver_sp_initialized_) { - CUSOLVER_CHECK(cusolverSpCreate(&cusolver_sp_handle_)); + RAFT_CUSOLVER_TRY(cusolverSpCreate(&cusolver_sp_handle_)); cusolver_sp_initialized_ = true; } return cusolver_sp_handle_; @@ -134,7 +134,7 @@ class handle_t { { std::lock_guard _(mutex_); if (!cusparse_initialized_) { - CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); + RAFT_CUSPARSE_TRY(cusparseCreate(&cusparse_handle_)); cusparse_initialized_ = true; } return cusparse_handle_; @@ -173,17 +173,17 @@ class handle_t { void wait_on_user_stream() const { - CUDA_CHECK(cudaEventRecord(event_, user_stream_)); + RAFT_CUDA_TRY(cudaEventRecord(event_, user_stream_)); for (int i = 0; i < get_num_internal_streams(); i++) { - CUDA_CHECK(cudaStreamWaitEvent(get_internal_stream(i), event_, 0)); + RAFT_CUDA_TRY(cudaStreamWaitEvent(get_internal_stream(i), event_, 0)); } } void wait_on_internal_streams() const { for (int i = 0; i < get_num_internal_streams(); i++) { - CUDA_CHECK(cudaEventRecord(event_, get_internal_stream(i))); - CUDA_CHECK(cudaStreamWaitEvent(user_stream_, event_, 0)); + RAFT_CUDA_TRY(cudaEventRecord(event_, get_internal_stream(i))); + RAFT_CUDA_TRY(cudaStreamWaitEvent(user_stream_, event_, 0)); } } @@ -218,7 +218,7 @@ class handle_t { { std::lock_guard _(mutex_); if (!device_prop_initialized_) { - CUDA_CHECK(cudaGetDeviceProperties(&prop_, dev_id_)); + RAFT_CUDA_TRY(cudaGetDeviceProperties(&prop_, dev_id_)); device_prop_initialized_ = true; } return prop_; @@ -245,29 +245,19 @@ class handle_t { mutable bool device_prop_initialized_{false}; mutable std::mutex mutex_; - void create_resources() { CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); } + void create_resources() + { + RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } void destroy_resources() { ///@todo: enable *_NO_THROW variants once we have enabled logging - if (cusparse_initialized_) { - // CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_)); - CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_)); - } - if (cusolver_dn_initialized_) { - // CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_dn_handle_)); - } - if (cusolver_sp_initialized_) { - // CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); - CUSOLVER_CHECK(cusolverSpDestroy(cusolver_sp_handle_)); - } - if (cublas_initialized_) { - // CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); - CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - } - // CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); - CUDA_CHECK(cudaEventDestroy(event_)); + if (cusparse_initialized_) { RAFT_CUSPARSE_TRY(cusparseDestroy(cusparse_handle_)); } + if (cusolver_dn_initialized_) { RAFT_CUSOLVER_TRY(cusolverDnDestroy(cusolver_dn_handle_)); } + if (cusolver_sp_initialized_) { RAFT_CUSOLVER_TRY(cusolverSpDestroy(cusolver_sp_handle_)); } + if (cublas_initialized_) { RAFT_CUBLAS_TRY(cublasDestroy(cublas_handle_)); } + RAFT_CUDA_TRY(cudaEventDestroy(event_)); } }; // class handle_t diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index a2e29952d7..4e9e993b78 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -101,7 +101,7 @@ void getOvrlabels( n, [idx, y_unique] __device__(value_t y) { return y == y_unique[idx] ? +1 : -1; }, stream); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } // TODO: add one-versus-one selection: select two classes, relabel them to diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh index 1ee0659b0d..9818b5d71b 100644 --- a/cpp/include/raft/label/merge_labels.cuh +++ b/cpp/include/raft/label/merge_labels.cuh @@ -137,20 +137,20 @@ void merge_labels(value_idx* labels_a, // Step 1: compute connected components in the label equivalence graph bool host_m; do { - CUDA_CHECK(cudaMemsetAsync(m, false, sizeof(bool), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(m, false, sizeof(bool), stream)); propagate_label_kernel <<>>(labels_a, labels_b, R, mask, m, N); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); raft::update_host(&host_m, m, 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } while (host_m); // Step 2: re-assign minimum equivalent label reassign_label_kernel <<>>(labels_a, labels_b, R, N, MAX_LABEL); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } }; // namespace label diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh index 11d3174951..926cc44197 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.cuh @@ -94,7 +94,7 @@ void addDevScalar(math_t* outDev, dim3 block(256); dim3 grid(raft::ceildiv(len, (IdxType)block.x)); add_dev_scalar_kernel<<>>(outDev, inDev, singleScalarDev, len); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh index a49a433941..00a2af0014 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -49,7 +49,7 @@ void binaryOpImpl( const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); binaryOpKernel <<>>(out, in1, in2, len, op); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } /** diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh index 4b58133ac5..31e3a99a81 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh @@ -73,7 +73,7 @@ namespace linalg { * // Calculate a new row/column of matrix A into A_new * // ... * // Copy new row to L[rank-1,:] - * CUBLAS_CHECK(cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, + * RAFT_CUBLAS_TRY(cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, * L + n - 1, ld_L, stream)); * // Update Cholesky factorization * MLCommon::LinAlg::choleskyRank1Update( @@ -171,38 +171,38 @@ void choleskyRank1Update(const raft::handle_t& handle, // contiguous. We copy elements from A_row to a contiguous workspace A_new. A_row = L + n - 1; A_new = reinterpret_cast(workspace); - CUBLAS_CHECK( + RAFT_CUBLAS_TRY( raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream)); } cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; if (n > 1) { // Calculate L_12 = x by solving equation L_11 x = A_12 math_t alpha = 1; - CUBLAS_CHECK(raft::linalg::cublastrsm(handle.get_cublas_handle(), - CUBLAS_SIDE_LEFT, - uplo, - op, - CUBLAS_DIAG_NON_UNIT, - n - 1, - 1, - &alpha, - L, - ld, - A_new, - n - 1, - stream)); + RAFT_CUBLAS_TRY(raft::linalg::cublastrsm(handle.get_cublas_handle(), + CUBLAS_SIDE_LEFT, + uplo, + op, + CUBLAS_DIAG_NON_UNIT, + n - 1, + 1, + &alpha, + L, + ld, + A_new, + n - 1, + stream)); // A_new now stores L_12, we calculate s = L_12 * L_12 - CUBLAS_CHECK( + RAFT_CUBLAS_TRY( raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream)); if (uplo == CUBLAS_FILL_MODE_LOWER) { // Copy back the L_12 elements as the n-th row of L - CUBLAS_CHECK( + RAFT_CUBLAS_TRY( raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream)); } } else { // n == 1 case - CUDA_CHECK(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); } // L_22 = sqrt(A_22 - L_12 * L_12) @@ -210,7 +210,7 @@ void choleskyRank1Update(const raft::handle_t& handle, math_t L_22_host; raft::update_host(&s_host, s, 1, stream); raft::update_host(&L_22_host, L_22, 1, stream); // L_22 stores A_22 - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); L_22_host = std::sqrt(L_22_host - s_host); // Check for numeric error with sqrt. If the matrix is not positive definit or diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh index 7e0744f98a..717e2c42b2 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/coalesced_reduction.cuh @@ -120,7 +120,7 @@ void coalescedReduction(OutType* dots, coalescedReductionKernel <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index 3616d54506..d125aa40dd 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -69,7 +69,7 @@ inline const char* cublas_error_to_string(cublasStatus_t err) * Invokes a cuBLAS runtime API function call, if the call does not return * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred */ -#define CUBLAS_TRY(call) \ +#define RAFT_CUBLAS_TRY(call) \ do { \ cublasStatus_t const status = (call); \ if (CUBLAS_STATUS_SUCCESS != status) { \ @@ -84,20 +84,13 @@ inline const char* cublas_error_to_string(cublasStatus_t err) } \ } while (0) -/** FIXME: temporary alias for cuML compatibility */ -#define CUBLAS_CHECK(call) CUBLAS_TRY(call) +// FIXME: Remove after consumers rename +#ifndef CUBLAS_TRY +#define CUBLAS_TRY(call) RAFT_CUBLAS_TRY(call) +#endif -/** check for cublas runtime API errors but do not assert */ -#define CUBLAS_CHECK_NO_THROW(call) \ - do { \ - cublasStatus_t err = call; \ - if (err != CUBLAS_STATUS_SUCCESS) { \ - CUML_LOG_ERROR("CUBLAS call='%s' got errorcode=%d err=%s", \ - #call, \ - err, \ - raft::linalg::detail::cublas_error_to_string(err)); \ - } \ - } while (0) +/** FIXME: remove after cuml rename */ +#define CUBLAS_CHECK(call) CUBLAS_TRY(call) namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h index 85f2740647..0c94804111 100644 --- a/cpp/include/raft/linalg/cusolver_wrappers.h +++ b/cpp/include/raft/linalg/cusolver_wrappers.h @@ -68,7 +68,7 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err) * Invokes a cuSOLVER runtime API function call, if the call does not return * CUSolver_STATUS_SUCCESS, throws an exception detailing the cuSOLVER error that occurred */ -#define CUSOLVER_TRY(call) \ +#define RAFT_CUSOLVER_TRY(call) \ do { \ cusolverStatus_t const status = (call); \ if (CUSOLVER_STATUS_SUCCESS != status) { \ @@ -83,20 +83,14 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err) } \ } while (0) -/** FIXME: temporary alias for cuML compatibility */ -#define CUSOLVER_CHECK(call) CUSOLVER_TRY(call) +// FIXME: remove after consumer rename +#ifndef CUSOLVER_TRY +#define CUSOLVER_TRY(call) RAFT_CUSOLVER_TRY(call) +#endif -//@todo: enable this once logging is enabled -#if 0 -** check for cusolver runtime API errors but do not assert */ -define CUSOLVER_CHECK_NO_THROW(call) \ - do { \ - cusolverStatus_t err = call; \ - if (err != CUSOLVER_STATUS_SUCCESS) { \ - CUML_LOG_ERROR("CUSOLVER call='%s' got errorcode=%d err=%s", #call, err, \ - raft::linalg::detail::cusolver_error_to_string(err)); \ - } \ - } while (0) +// FIXME: remove after cuml rename +#ifndef CUSOLVER_CHECK +#define CUSOLVER_CHECK(call) CUSOLVER_TRY(call) #endif namespace raft { diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index 288d379dac..b67c9d494a 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -40,32 +40,32 @@ void eigDC_legacy(const raft::handle_t& handle, cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; - CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, - n_rows, - in, - n_cols, - eig_vals, - &lwork)); + RAFT_CUSOLVER_TRY(cusolverDnsyevd_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + in, + n_cols, + eig_vals, + &lwork)); rmm::device_uvector d_work(lwork, stream); rmm::device_scalar d_dev_info(stream); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, - n_rows, - eig_vectors, - n_cols, - eig_vals, - d_work.data(), - lwork, - d_dev_info.data(), - stream)); - CUDA_CHECK(cudaGetLastError()); + RAFT_CUSOLVER_TRY(cusolverDnsyevd(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + eig_vals, + d_work.data(), + lwork, + d_dev_info.data(), + stream)); + RAFT_CUDA_TRY(cudaGetLastError()); auto dev_info = d_dev_info.value(stream); ASSERT(dev_info == 0, @@ -101,21 +101,21 @@ void eigDC(const raft::handle_t& handle, cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); cusolverDnParams_t dn_params = nullptr; - CUSOLVER_CHECK(cusolverDnCreateParams(&dn_params)); + RAFT_CUSOLVER_TRY(cusolverDnCreateParams(&dn_params)); size_t workspaceDevice = 0; size_t workspaceHost = 0; - CUSOLVER_CHECK(cusolverDnxsyevd_bufferSize(cusolverH, - dn_params, - CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, - static_cast(n_rows), - eig_vectors, - static_cast(n_cols), - eig_vals, - &workspaceDevice, - &workspaceHost, - stream)); + RAFT_CUSOLVER_TRY(cusolverDnxsyevd_bufferSize(cusolverH, + dn_params, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + static_cast(n_rows), + eig_vectors, + static_cast(n_cols), + eig_vals, + &workspaceDevice, + &workspaceHost, + stream)); rmm::device_uvector d_work(workspaceDevice / sizeof(math_t), stream); rmm::device_scalar d_dev_info(stream); @@ -123,23 +123,23 @@ void eigDC(const raft::handle_t& handle, raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnxsyevd(cusolverH, - dn_params, - CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, - static_cast(n_rows), - eig_vectors, - static_cast(n_cols), - eig_vals, - d_work.data(), - workspaceDevice, - h_work.data(), - workspaceHost, - d_dev_info.data(), - stream)); - - CUDA_CHECK(cudaGetLastError()); - CUSOLVER_CHECK(cusolverDnDestroyParams(dn_params)); + RAFT_CUSOLVER_TRY(cusolverDnxsyevd(cusolverH, + dn_params, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + static_cast(n_rows), + eig_vectors, + static_cast(n_cols), + eig_vals, + d_work.data(), + workspaceDevice, + h_work.data(), + workspaceHost, + d_dev_info.data(), + stream)); + + RAFT_CUDA_TRY(cudaGetLastError()); + RAFT_CUSOLVER_TRY(cusolverDnDestroyParams(dn_params)); int dev_info = d_dev_info.value(stream); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. " @@ -181,67 +181,67 @@ void eigSelDC(const raft::handle_t& handle, int lwork; int h_meig; - CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, - n_rows, - in, - n_cols, - math_t(0.0), - math_t(0.0), - n_cols - n_eig_vals + 1, - n_cols, - &h_meig, - eig_vals, - &lwork)); + RAFT_CUSOLVER_TRY(cusolverDnsyevdx_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, + n_rows, + in, + n_cols, + math_t(0.0), + math_t(0.0), + n_cols - n_eig_vals + 1, + n_cols, + &h_meig, + eig_vals, + &lwork)); rmm::device_uvector d_work(lwork, stream); rmm::device_scalar d_dev_info(stream); rmm::device_uvector d_eig_vectors(0, stream); if (memUsage == OVERWRITE_INPUT) { - CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, - n_rows, - in, - n_cols, - math_t(0.0), - math_t(0.0), - n_cols - n_eig_vals + 1, - n_cols, - &h_meig, - eig_vals, - d_work.data(), - lwork, - d_dev_info.data(), - stream)); + RAFT_CUSOLVER_TRY(cusolverDnsyevdx(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, + n_rows, + in, + n_cols, + math_t(0.0), + math_t(0.0), + n_cols - n_eig_vals + 1, + n_cols, + &h_meig, + eig_vals, + d_work.data(), + lwork, + d_dev_info.data(), + stream)); } else if (memUsage == COPY_INPUT) { d_eig_vectors.resize(n_rows * n_cols, stream); raft::matrix::copy(in, d_eig_vectors.data(), n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, - n_rows, - eig_vectors, - n_cols, - math_t(0.0), - math_t(0.0), - n_cols - n_eig_vals + 1, - n_cols, - &h_meig, - eig_vals, - d_work.data(), - lwork, - d_dev_info.data(), - stream)); + RAFT_CUSOLVER_TRY(cusolverDnsyevdx(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + math_t(0.0), + math_t(0.0), + n_cols - n_eig_vals + 1, + n_cols, + &h_meig, + eig_vals, + d_work.data(), + lwork, + d_dev_info.data(), + stream)); } - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); int dev_info = d_dev_info.value(stream); ASSERT(dev_info == 0, @@ -286,44 +286,44 @@ void eigJacobi(const raft::handle_t& handle, cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); syevjInfo_t syevj_params = nullptr; - CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&syevj_params)); - CUSOLVER_CHECK(cusolverDnXsyevjSetTolerance(syevj_params, tol)); - CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, static_cast(sweeps))); + RAFT_CUSOLVER_TRY(cusolverDnCreateSyevjInfo(&syevj_params)); + RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetTolerance(syevj_params, tol)); + RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetMaxSweeps(syevj_params, static_cast(sweeps))); int lwork; - CUSOLVER_CHECK(cusolverDnsyevj_bufferSize(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, - n_rows, - eig_vectors, - n_cols, - eig_vals, - &lwork, - syevj_params)); + RAFT_CUSOLVER_TRY(cusolverDnsyevj_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + eig_vals, + &lwork, + syevj_params)); rmm::device_uvector d_work(lwork, stream); rmm::device_scalar dev_info(stream); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, - n_rows, - eig_vectors, - n_cols, - eig_vals, - d_work.data(), - lwork, - dev_info.data(), - syevj_params, - stream)); + RAFT_CUSOLVER_TRY(cusolverDnsyevj(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + eig_vals, + d_work.data(), + lwork, + dev_info.data(), + syevj_params, + stream)); int executed_sweeps; - CUSOLVER_CHECK(cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); + RAFT_CUSOLVER_TRY(cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); - CUDA_CHECK(cudaGetLastError()); - CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params)); + RAFT_CUDA_TRY(cudaGetLastError()); + RAFT_CUSOLVER_TRY(cusolverDnDestroySyevjInfo(syevj_params)); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index d5942b7446..959f74ee2b 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -65,7 +65,7 @@ void gemm(const raft::handle_t& handle, int lda = trans_a == CUBLAS_OP_T ? k : m; int ldb = trans_b == CUBLAS_OP_T ? n : k; int ldc = m; - CUBLAS_CHECK( + RAFT_CUBLAS_TRY( cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); } @@ -191,7 +191,7 @@ void gemm(const raft::handle_t& handle, K = _K; } // Actual cuBLAS call - CUBLAS_CHECK( + RAFT_CUBLAS_TRY( cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); } diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.h index ac0547e30a..965cd32a57 100644 --- a/cpp/include/raft/linalg/gemv.h +++ b/cpp/include/raft/linalg/gemv.h @@ -41,7 +41,7 @@ void gemv(const raft::handle_t& handle, { cublasHandle_t cublas_h = handle.get_cublas_handle(); cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK( + RAFT_CUBLAS_TRY( cublasgemv(cublas_h, op_a, n_rows, n_cols, &alpha, A, n_rows, x, incx, &beta, y, incy, stream)); } @@ -139,7 +139,7 @@ void gemv(const raft::handle_t& handle, { cublasHandle_t cublas_h = handle.get_cublas_handle(); cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK( + RAFT_CUBLAS_TRY( cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x, 1, &beta, y, 1, stream)); } diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 39089473e3..ef2b6cc941 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -130,20 +130,20 @@ int performLanczosIteration(handle_t const& handle, A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); // Orthogonalize Lanczos vector - CUBLAS_CHECK(cublasdot( + RAFT_CUBLAS_TRY(cublasdot( cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); alpha = -alpha_host[0]; - CUBLAS_CHECK(cublasaxpy( + RAFT_CUBLAS_TRY(cublasaxpy( cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); + RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); // Check if Lanczos has converged if (beta_host[0] <= tol) return 0; // Normalize Lanczos vector alpha = 1 / beta_host[0]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); } // ------------------------------------------------------- @@ -165,33 +165,33 @@ int performLanczosIteration(handle_t const& handle, // Full reorthogonalization // "Twice is enough" algorithm per Kahan and Parlett if (reorthogonalize) { - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_T, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1, - stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1), @@ -199,69 +199,69 @@ int performLanczosIteration(handle_t const& handle, cudaMemcpyDeviceToHost, stream)); - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_T, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1, - stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Orthogonalization with 3-term recurrence relation else { - CUBLAS_CHECK(cublasdot(cublas_h, - n, - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - alpha_host + (*iter - 1), - stream)); + RAFT_CUBLAS_TRY(cublasdot(cublas_h, + n, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + alpha_host + (*iter - 1), + stream)); auto alpha = -alpha_host[*iter - 1]; - CUBLAS_CHECK(cublasaxpy(cublas_h, - n, - &alpha, - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + RAFT_CUBLAS_TRY(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); alpha = -beta_host[*iter - 2]; - CUBLAS_CHECK(cublasaxpy(cublas_h, - n, - &alpha, - lanczosVecs_dev + IDX(0, *iter - 2, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + RAFT_CUBLAS_TRY(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Compute residual - CUBLAS_CHECK(cublasnrm2( + RAFT_CUBLAS_TRY(cublasnrm2( cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); // Check if Lanczos has converged @@ -269,7 +269,7 @@ int performLanczosIteration(handle_t const& handle, // Normalize Lanczos vector alpha = 1 / beta_host[*iter - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } CUDA_TRY(cudaStreamSynchronize(stream)); @@ -641,36 +641,36 @@ static int lanczosRestart(handle_t const& handle, V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - iter, - beta_host + iter_new - 1, - lanczosVecs_dev, - n, - V_dev + IDX(0, iter_new, iter), - 1, - beta_host + iter - 1, - lanczosVecs_dev + IDX(0, iter, n), - 1, - stream)); + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + iter, + beta_host + iter_new - 1, + lanczosVecs_dev, + n, + V_dev + IDX(0, iter_new, iter), + 1, + beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), + 1, + stream)); // Obtain new Lanczos vectors - CUBLAS_CHECK(cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - iter_new, - iter, - &one, - lanczosVecs_dev, - n, - V_dev, - iter, - &zero, - work_dev, - n, - stream)); + RAFT_CUBLAS_TRY(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + iter_new, + iter, + &one, + lanczosVecs_dev, + n, + V_dev, + iter, + &zero, + work_dev, + n, + stream)); CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, @@ -685,11 +685,12 @@ static int lanczosRestart(handle_t const& handle, cudaMemcpyDeviceToDevice, stream)); - CUBLAS_CHECK(cublasnrm2( + RAFT_CUBLAS_TRY(cublasnrm2( cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream)); auto h_beta = 1 / beta_host[iter_new - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); + RAFT_CUBLAS_TRY( + cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); return 0; } @@ -821,7 +822,7 @@ int computeSmallestEigenvectors(handle_t const& handle, work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue to determine shift @@ -837,10 +838,10 @@ int computeSmallestEigenvectors(handle_t const& handle, // Initialize initial Lanczos vector curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); value_type_t normQ1; - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); auto h_val = 1 / normQ1; - CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); // Obtain tridiagonal matrix with Lanczos *effIter = 0; @@ -970,21 +971,21 @@ int computeSmallestEigenvectors(handle_t const& handle, CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - CUBLAS_CHECK(cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - nEigVecs, - *effIter, - &one, - lanczosVecs_dev, - n, - work_dev, - *effIter, - &zero, - eigVecs_dev, - n, - stream)); + RAFT_CUBLAS_TRY(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); // Clean up and exit curandDestroyGenerator(randGen); @@ -1208,7 +1209,7 @@ int computeLargestEigenvectors(handle_t const& handle, work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue @@ -1222,10 +1223,10 @@ int computeLargestEigenvectors(handle_t const& handle, // Initialize initial Lanczos vector curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); value_type_t normQ1; - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); auto h_val = 1 / normQ1; - CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); // Obtain tridiagonal matrix with Lanczos *effIter = 0; @@ -1360,21 +1361,21 @@ int computeLargestEigenvectors(handle_t const& handle, CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - CUBLAS_CHECK(cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - nEigVecs, - *effIter, - &one, - lanczosVecs_dev, - n, - work_dev, - *effIter, - &zero, - eigVecs_dev, - n, - stream)); + RAFT_CUBLAS_TRY(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); // Clean up and exit curandDestroyGenerator(randGen); diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh index 200818fdc3..4facc5e72c 100644 --- a/cpp/include/raft/linalg/map.cuh +++ b/cpp/include/raft/linalg/map.cuh @@ -39,7 +39,7 @@ void mapImpl( const int nblks = raft::ceildiv(len, (size_t)TPB); mapKernel <<>>(out, len, map, in, args...); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } /** diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh index 78a7017c5c..2fa3ae72de 100644 --- a/cpp/include/raft/linalg/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/map_then_reduce.cuh @@ -88,7 +88,7 @@ void mapThenReduceImpl(OutType* out, const int nblks = raft::ceildiv(len, (size_t)TPB); mapThenReduceKernel <<>>(out, len, neutral, map, op, in, args...); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } /** diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh index c85cfda934..2870d6d072 100644 --- a/cpp/include/raft/linalg/qr.cuh +++ b/cpp/include/raft/linalg/qr.cuh @@ -52,25 +52,25 @@ void qrGetQ(const raft::handle_t& handle, int m = n_rows, n = n_cols; int k = min(m, n); - CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); rmm::device_uvector tau(k, stream); - CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); rmm::device_scalar devInfo(stream); int Lwork; - CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); + RAFT_CUSOLVER_TRY(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); rmm::device_uvector workspace(Lwork, stream); - CUSOLVER_CHECK(cusolverDngeqrf( + RAFT_CUSOLVER_TRY(cusolverDngeqrf( cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 - CUDA_CHECK(cudaDeviceSynchronize()); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif - CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); + RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); workspace.resize(Lwork, stream); - CUSOLVER_CHECK(cusolverDnorgqr( + RAFT_CUSOLVER_TRY(cusolverDnorgqr( cusolverH, m, n, k, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); } @@ -98,52 +98,52 @@ void qrGetQR(const raft::handle_t& handle, int m = n_rows, n = n_cols; rmm::device_uvector R_full(m * n, stream); rmm::device_uvector tau(min(m, n), stream); - CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); int R_full_nrows = m, R_full_ncols = n; - CUDA_CHECK( + RAFT_CUDA_TRY( cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); int Lwork; rmm::device_scalar devInfo(stream); - CUSOLVER_CHECK(cusolverDngeqrf_bufferSize( + RAFT_CUSOLVER_TRY(cusolverDngeqrf_bufferSize( cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork)); rmm::device_uvector workspace(Lwork, stream); - CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, - R_full_nrows, - R_full_ncols, - R_full.data(), - R_full_nrows, - tau.data(), - workspace.data(), - Lwork, - devInfo.data(), - stream)); + RAFT_CUSOLVER_TRY(cusolverDngeqrf(cusolverH, + R_full_nrows, + R_full_ncols, + R_full.data(), + R_full_nrows, + tau.data(), + workspace.data(), + Lwork, + devInfo.data(), + stream)); // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 - CUDA_CHECK(cudaDeviceSynchronize()); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream); - CUDA_CHECK( + RAFT_CUDA_TRY( cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); int Q_nrows = m, Q_ncols = n; - CUSOLVER_CHECK(cusolverDnorgqr_bufferSize( + RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize( cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork)); workspace.resize(Lwork, stream); - CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, - Q_nrows, - Q_ncols, - min(Q_ncols, Q_nrows), - Q, - Q_nrows, - tau.data(), - workspace.data(), - Lwork, - devInfo.data(), - stream)); + RAFT_CUSOLVER_TRY(cusolverDnorgqr(cusolverH, + Q_nrows, + Q_ncols, + min(Q_ncols, Q_nrows), + Q, + Q_nrows, + tau.data(), + workspace.data(), + Lwork, + devInfo.data(), + stream)); } /** @} */ diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh index 43060d0818..7ffcb734f8 100644 --- a/cpp/include/raft/linalg/subtract.cuh +++ b/cpp/include/raft/linalg/subtract.cuh @@ -98,7 +98,7 @@ void subtractDevScalar(math_t* outDev, const IdxType nblks = raft::ceildiv(len, (IdxType)TPB); subtract_dev_scalar_kernel <<>>(outDev, inDev, singleScalarDev, len); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh index e14a5b6a50..f83ba83c9b 100644 --- a/cpp/include/raft/linalg/svd.cuh +++ b/cpp/include/raft/linalg/svd.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,7 @@ void svdQR(const raft::handle_t& handle, bool gen_right_vec, cudaStream_t stream) { + RAFT_USING_RANGE("raft::linalg::svdQR(%d, %d)", n_rows, n_cols); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); cublasHandle_t cublasH = handle.get_cublas_handle(); @@ -82,7 +84,7 @@ void svdQR(const raft::handle_t& handle, T* d_rwork = nullptr; int lwork = 0; - CUSOLVER_CHECK(cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); + RAFT_CUSOLVER_TRY(cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); rmm::device_uvector d_work(lwork, stream); char jobu = 'S'; @@ -98,32 +100,32 @@ void svdQR(const raft::handle_t& handle, strcpy(&jobvt, &new_vt); } - CUSOLVER_CHECK(cusolverDngesvd(cusolverH, - jobu, - jobvt, - m, - n, - in, - m, - sing_vals, - left_sing_vecs, - m, - right_sing_vecs, - n, - d_work.data(), - lwork, - d_rwork, - devInfo.data(), - stream)); + RAFT_CUSOLVER_TRY(cusolverDngesvd(cusolverH, + jobu, + jobvt, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + d_work.data(), + lwork, + d_rwork, + devInfo.data(), + stream)); // Transpose the right singular vector back if (trans_right) raft::linalg::transpose(right_sing_vecs, n_cols, stream); - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); int dev_info; raft::update_host(&dev_info, devInfo.data(), 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); ASSERT(dev_info == 0, "svd.cuh: svd couldn't converge to a solution. " "This usually occurs when some of the features do not vary enough."); @@ -140,6 +142,7 @@ void svdEig(const raft::handle_t& handle, bool gen_left_vec, cudaStream_t stream) { + RAFT_USING_RANGE("raft::linalg::svdEig(%d, %d)", n_rows, n_cols); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); cublasHandle_t cublasH = handle.get_cublas_handle(); @@ -218,13 +221,14 @@ void svdJacobi(const raft::handle_t& handle, int max_sweeps, cudaStream_t stream) { + RAFT_USING_RANGE("raft::linalg::svdJacobi(%d, %d)", n_rows, n_cols); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); gesvdjInfo_t gesvdj_params = NULL; - CUSOLVER_CHECK(cusolverDnCreateGesvdjInfo(&gesvdj_params)); - CUSOLVER_CHECK(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol)); - CUSOLVER_CHECK(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, max_sweeps)); + RAFT_CUSOLVER_TRY(cusolverDnCreateGesvdjInfo(&gesvdj_params)); + RAFT_CUSOLVER_TRY(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol)); + RAFT_CUSOLVER_TRY(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, max_sweeps)); int m = n_rows; int n = n_cols; @@ -234,42 +238,42 @@ void svdJacobi(const raft::handle_t& handle, int lwork = 0; int econ = 1; - CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - econ, - m, - n, - in, - m, - sing_vals, - left_sing_vecs, - m, - right_sing_vecs, - n, - &lwork, - gesvdj_params)); + RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngesvdj_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + &lwork, + gesvdj_params)); rmm::device_uvector d_work(lwork, stream); - CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - econ, - m, - n, - in, - m, - sing_vals, - left_sing_vecs, - m, - right_sing_vecs, - n, - d_work.data(), - lwork, - devInfo.data(), - gesvdj_params, - stream)); - - CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params)); + RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngesvdj(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + d_work.data(), + lwork, + devInfo.data(), + gesvdj_params, + stream)); + + RAFT_CUSOLVER_TRY(cusolverDnDestroyGesvdjInfo(gesvdj_params)); } /** @@ -349,8 +353,8 @@ bool evaluateSVDByL2Norm(const raft::handle_t& handle, // form product matrix rmm::device_uvector P_d(m * n, stream); rmm::device_uvector S_mat(k * k, stream); - CUDA_CHECK(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream)); - CUDA_CHECK(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream)); raft::matrix::initializeDiagonalMatrix(S_vec, S_mat.data(), k, k, stream); svdReconstruction(handle, U, S_mat.data(), V, P_d.data(), m, n, k, stream); @@ -365,22 +369,22 @@ bool evaluateSVDByL2Norm(const raft::handle_t& handle, // calculate percent error const math_t alpha = 1.0, beta = -1.0; rmm::device_uvector A_minus_P(m * n, stream); - CUDA_CHECK(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); - - CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, - CUBLAS_OP_N, - CUBLAS_OP_N, - m, - n, - &alpha, - A_d, - m, - &beta, - P_d.data(), - m, - A_minus_P.data(), - m, - stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); + + RAFT_CUBLAS_TRY(raft::linalg::cublasgeam(cublasH, + CUBLAS_OP_N, + CUBLAS_OP_N, + m, + n, + &alpha, + A_d, + m, + &beta, + P_d.data(), + m, + A_minus_P.data(), + m, + stream)); math_t norm_A_minus_P = raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); math_t percent_error = 100.0 * norm_A_minus_P / normA; diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h index e84ddd1166..63dbae1c8a 100644 --- a/cpp/include/raft/linalg/transpose.h +++ b/cpp/include/raft/linalg/transpose.h @@ -47,20 +47,20 @@ void transpose(const raft::handle_t& handle, const math_t alpha = 1.0; const math_t beta = 0.0; - CUBLAS_CHECK(raft::linalg::cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - out_n_rows, - out_n_cols, - &alpha, - in, - n_rows, - &beta, - out, - out_n_rows, - out, - out_n_rows, - stream)); + RAFT_CUBLAS_TRY(raft::linalg::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_n_rows, + out_n_cols, + &alpha, + in, + n_rows, + &beta, + out, + out_n_rows, + out, + out_n_rows, + stream)); } /** diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index 198b9b2b10..d10bc859fe 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -47,7 +47,7 @@ void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStr const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); unaryOpKernel <<>>(out, in, len, op); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } /** @@ -122,7 +122,7 @@ void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) if (len <= 0) return; // silently skip in case of 0 length input auto nblks = raft::ceildiv(len, TPB); writeOnlyUnaryOpKernel<<>>(out, len, op); - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } }; // end namespace linalg diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh index 4b56f3986f..aa0947b3f0 100644 --- a/cpp/include/raft/matrix/detail/math.cuh +++ b/cpp/include/raft/matrix/detail/math.cuh @@ -59,7 +59,7 @@ void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t } else { argmaxKernel<<>>(in, D, N, out); } - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } // Utility kernel needed for signFlip. @@ -109,7 +109,7 @@ void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream) } else { signFlipKernel<<>>(data, D, N); } - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } } // end namespace detail diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index 709570ae56..cf908c5e6d 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -41,7 +41,7 @@ void copyRows(const m_t* in, const idx_t TPB = 256; cache::get_vecs<<>>( in, n_cols, indices, n_rows_indices, out); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); return; } diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp index f5827bf4bd..5ebd61559e 100644 --- a/cpp/include/raft/matrix/matrix.hpp +++ b/cpp/include/raft/matrix/matrix.hpp @@ -286,7 +286,7 @@ m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t st { cublasHandle_t cublasH = handle.get_cublas_handle(); m_t normval = 0; - CUBLAS_CHECK(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); + RAFT_CUBLAS_TRY(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); return normval; } diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp index 38ef59aadf..6998c1f186 100644 --- a/cpp/include/raft/mr/buffer_base.hpp +++ b/cpp/include/raft/mr/buffer_base.hpp @@ -64,7 +64,7 @@ class buffer_base { if (capacity_ > 0) { data_ = static_cast(allocator_->allocate(capacity_ * sizeof(value_type), stream_)); - CUDA_CHECK(cudaStreamSynchronize(stream_)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_)); } } @@ -198,11 +198,11 @@ class buffer_base { { if (stream_ != stream) { cudaEvent_t event; - CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - CUDA_CHECK(cudaEventRecord(event, stream_)); - CUDA_CHECK(cudaStreamWaitEvent(stream, event, 0)); + RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + RAFT_CUDA_TRY(cudaEventRecord(event, stream_)); + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, event, 0)); stream_ = stream; - CUDA_CHECK(cudaEventDestroy(event)); + RAFT_CUDA_TRY(cudaEventDestroy(event)); } } }; // class buffer_base diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp index 7d31248e7f..71b5465451 100644 --- a/cpp/include/raft/mr/host/allocator.hpp +++ b/cpp/include/raft/mr/host/allocator.hpp @@ -44,7 +44,7 @@ class default_allocator : public allocator { void* allocate(std::size_t n, cudaStream_t stream) override { void* ptr = nullptr; - CUDA_CHECK(cudaMallocHost(&ptr, n)); + RAFT_CUDA_TRY(cudaMallocHost(&ptr, n)); return ptr; } @@ -52,7 +52,7 @@ class default_allocator : public allocator { { // Must call _NO_THROW here since this is called frequently from object // destructors which are "nothrow" by default - CUDA_CHECK_NO_THROW(cudaFreeHost(p)); + RAFT_CUDA_TRY_NO_THROW(cudaFreeHost(p)); } }; // class default_allocator diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh index 0f3b58975e..cdebd650f9 100644 --- a/cpp/include/raft/random/detail/rng_impl.cuh +++ b/cpp/include/raft/random/detail/rng_impl.cuh @@ -505,7 +505,7 @@ class RngImpl { void fill(Type* ptr, LenType len, Type val, cudaStream_t stream) { detail::constFillKernel<<>>(ptr, len, val); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -651,7 +651,7 @@ class RngImpl { rmm::device_uvector workspace(0, stream); sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream); if (outIdx != nullptr) { - CUDA_CHECK(cudaMemcpyAsync( + RAFT_CUDA_TRY(cudaMemcpyAsync( outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, cudaMemcpyDeviceToDevice, stream)); } raft::scatter(out, in, outIdxPtr, sampledLen, stream); @@ -734,7 +734,7 @@ class RngImpl { break; default: ASSERT(false, "randImpl: Incorrect generator type! %d", type); }; - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); offset = newOffset; } @@ -766,7 +766,7 @@ class RngImpl { break; default: ASSERT(false, "rand2Impl: Incorrect generator type! %d", type); }; - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); offset = newOffset; } }; diff --git a/cpp/include/raft/sparse/convert/detail/coo.cuh b/cpp/include/raft/sparse/convert/detail/coo.cuh index 7ad24496ab..9a2eef89d2 100644 --- a/cpp/include/raft/sparse/convert/detail/coo.cuh +++ b/cpp/include/raft/sparse/convert/detail/coo.cuh @@ -71,7 +71,7 @@ void csr_to_coo( csr_to_coo_kernel<<>>(row_ind, m, coo_rows, nnz); - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } }; // end NAMESPACE detail diff --git a/cpp/include/raft/sparse/convert/detail/csr.cuh b/cpp/include/raft/sparse/convert/detail/csr.cuh index 241b5730c0..2641fae0b8 100644 --- a/cpp/include/raft/sparse/convert/detail/csr.cuh +++ b/cpp/include/raft/sparse/convert/detail/csr.cuh @@ -57,20 +57,20 @@ void coo_to_csr(const raft::handle_t& handle, auto stream = handle.get_stream(); auto cusparseHandle = handle.get_cusparse_handle(); rmm::device_uvector dstRows(nnz, stream); - CUDA_CHECK( + RAFT_CUDA_TRY( cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK( + RAFT_CUDA_TRY( cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt( cusparseHandle, m, m, nnz, srcRows, srcCols, stream); rmm::device_uvector pBuffer(buffSize, stream); rmm::device_uvector P(nnz, stream); - CUSPARSE_CHECK(cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); + RAFT_CUSPARSE_TRY(cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); raft::sparse::cusparsecoosortByRow( cusparseHandle, m, m, nnz, dstRows.data(), dstCols, P.data(), pBuffer.data(), stream); raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), stream); raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, dst_offsets, stream); - CUDA_CHECK(cudaDeviceSynchronize()); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); } /** @@ -176,7 +176,7 @@ void sorted_coo_to_csr(const T* rows, int nnz, T* row_ind, int m, cudaStream_t s { rmm::device_uvector row_counts(m, stream); - CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream)); linalg::coo_degree(rows, nnz, row_counts.data(), stream); diff --git a/cpp/include/raft/sparse/convert/detail/dense.cuh b/cpp/include/raft/sparse/convert/detail/dense.cuh index ca4a567355..1f3e170b33 100644 --- a/cpp/include/raft/sparse/convert/detail/dense.cuh +++ b/cpp/include/raft/sparse/convert/detail/dense.cuh @@ -92,18 +92,18 @@ void csr_to_dense(cusparseHandle_t handle, * If we need col-major, use cusparse. */ cusparseMatDescr_t out_mat; - CUSPARSE_CHECK(cusparseCreateMatDescr(&out_mat)); - CUSPARSE_CHECK(cusparseSetMatIndexBase(out_mat, CUSPARSE_INDEX_BASE_ZERO)); - CUSPARSE_CHECK(cusparseSetMatType(out_mat, CUSPARSE_MATRIX_TYPE_GENERAL)); + RAFT_CUSPARSE_TRY(cusparseCreateMatDescr(&out_mat)); + RAFT_CUSPARSE_TRY(cusparseSetMatIndexBase(out_mat, CUSPARSE_INDEX_BASE_ZERO)); + RAFT_CUSPARSE_TRY(cusparseSetMatType(out_mat, CUSPARSE_MATRIX_TYPE_GENERAL)); - CUSPARSE_CHECK(raft::sparse::cusparsecsr2dense( + RAFT_CUSPARSE_TRY(raft::sparse::cusparsecsr2dense( handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, lda, stream)); - CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(out_mat)); + RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyMatDescr(out_mat)); } else { int blockdim = block_dim(ncols); - CUDA_CHECK(cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream)); csr_to_dense_warp_per_row_kernel<<>>( ncols, csr_data, csr_indptr, csr_indices, out); } diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 29a244a962..e2306686ce 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -80,7 +80,7 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) * Invokes a cuSparse runtime API function call, if the call does not return * CUSPARSE_STATUS_SUCCESS, throws an exception detailing the cuSparse error that occurred */ -#define CUSPARSE_TRY(call) \ +#define RAFT_CUSPARSE_TRY(call) \ do { \ cusparseStatus_t const status = (call); \ if (CUSPARSE_STATUS_SUCCESS != status) { \ @@ -95,12 +95,19 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) } \ } while (0) -/** FIXME: temporary alias for cuML compatibility */ +// FIXME: Remove after consumer rename +#ifndef CUSPARSE_TRY +#define CUSPARSE_TRY(call) RAFT_CUSPARSE_TRY(call) +#endif + +// FIXME: Remove after consumer rename +#ifndef CUSPARSE_CHECK #define CUSPARSE_CHECK(call) CUSPARSE_TRY(call) +#endif //@todo: use logger here once logging is enabled /** check for cusparse runtime API errors but do not assert */ -#define CUSPARSE_CHECK_NO_THROW(call) \ +#define RAFT_CUSPARSE_TRY_NO_THROW(call) \ do { \ cusparseStatus_t err = call; \ if (err != CUSPARSE_STATUS_SUCCESS) { \ @@ -111,6 +118,11 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) } \ } while (0) +// FIXME: Remove after consumer rename +#ifndef CUSPARSE_CHECK_NO_THROW +#define CUSPARSE_CHECK_NO_THROW(call) RAFT_CUSPARSE_TRY_NO_THROW(call) +#endif + namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/detail/coo.cuh b/cpp/include/raft/sparse/detail/coo.cuh index ded0b2c36a..ccf9a1dd0a 100644 --- a/cpp/include/raft/sparse/detail/coo.cuh +++ b/cpp/include/raft/sparse/detail/coo.cuh @@ -104,9 +104,11 @@ class COO { void init_arrays(cudaStream_t stream) { - CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0, this->nnz * sizeof(Index_Type), stream)); - CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0, this->nnz * sizeof(Index_Type), stream)); - CUDA_CHECK(cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream)); + RAFT_CUDA_TRY( + cudaMemsetAsync(this->rows_arr.data(), 0, this->nnz * sizeof(Index_Type), stream)); + RAFT_CUDA_TRY( + cudaMemsetAsync(this->cols_arr.data(), 0, this->nnz * sizeof(Index_Type), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream)); } ~COO() {} @@ -156,7 +158,7 @@ class COO { { if (c.validate_size() && c.validate_mem()) { cudaStream_t stream; - CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + RAFT_CUDA_TRY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) << std::endl; out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) << std::endl; @@ -165,7 +167,7 @@ class COO { out << "n_rows=" << c.n_rows << std::endl; out << "n_cols=" << c.n_cols << std::endl; - CUDA_CHECK(cudaStreamDestroy(stream)); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } else { out << "Cannot print COO object: Uninitialized or invalid." << std::endl; } diff --git a/cpp/include/raft/sparse/detail/csr.cuh b/cpp/include/raft/sparse/detail/csr.cuh index 4c98b27318..62835e3bc2 100644 --- a/cpp/include/raft/sparse/detail/csr.cuh +++ b/cpp/include/raft/sparse/detail/csr.cuh @@ -151,20 +151,20 @@ void weak_cc_batched(Index_* labels, Index_ MAX_LABEL = std::numeric_limits::max(); weak_cc_init_all_kernel <<>>(labels, N, MAX_LABEL, filter_op); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); int n_iters = 0; do { - CUDA_CHECK(cudaMemsetAsync(state->m, false, sizeof(bool), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(state->m, false, sizeof(bool), stream)); weak_cc_label_device <<>>( labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id, batch_size, N, filter_op); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); //** Updating m * raft::update_host(&host_m, state->m, 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); n_iters++; } while (host_m); diff --git a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh index 7527e876ec..ad97e0853a 100644 --- a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh @@ -99,8 +99,8 @@ void compute_bin_distance(value_t* out, { rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); - CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); - CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); compute_binary_row_norm_kernel<<>>( Q_norms.data(), Q_coo_rows, Q_data, Q_nnz); diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh index e69a292ef1..fe5ce9c67a 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh @@ -56,10 +56,10 @@ inline void balanced_coo_pairwise_generalized_spmv( strategy_t strategy, int chunk_size = 500000) { - CUDA_CHECK(cudaMemsetAsync(out_dists, - 0, - sizeof(value_t) * config_.a_nrows * config_.b_nrows, - config_.handle.get_stream())); + RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, + 0, + sizeof(value_t) * config_.a_nrows * config_.b_nrows, + config_.handle.get_stream())); strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); }; @@ -112,10 +112,10 @@ inline void balanced_coo_pairwise_generalized_spmv( write_f write_func, int chunk_size = 500000) { - CUDA_CHECK(cudaMemsetAsync(out_dists, - 0, - sizeof(value_t) * config_.a_nrows * config_.b_nrows, - config_.handle.get_stream())); + RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, + 0, + sizeof(value_t) * config_.a_nrows * config_.b_nrows, + config_.handle.get_stream())); int max_cols = max_cols_per_block(); diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh index 9b1dfff022..c4e39c11a0 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh @@ -53,16 +53,16 @@ class coo_spmv_strategy { int n_blocks, int n_blocks_per_row) { - CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, - cudaFuncCachePreferShared)); + RAFT_CUDA_TRY(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); balanced_coo_generalized_spmv_kernel <<>>(strategy, @@ -103,16 +103,16 @@ class coo_spmv_strategy { int n_blocks, int n_blocks_per_row) { - CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, - cudaFuncCachePreferShared)); + RAFT_CUDA_TRY(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); balanced_coo_generalized_spmv_kernel <<>>(strategy, diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh index 5ee2d250fb..2a9c5363dd 100644 --- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh @@ -142,8 +142,8 @@ void compute_l2(value_t* out, { rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); - CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); compute_row_norm_kernel<<>>( Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); @@ -190,11 +190,11 @@ void compute_corr(value_t* out, rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); - CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); - CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); compute_row_norm_kernel<<>>( Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 207cca7287..1952f19900 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -119,7 +119,7 @@ void build_dendrogram_host(const handle_t& handle, update_host(mst_dst_h.data(), cols, n_edges, stream); update_host(mst_weights_h.data(), data, n_edges, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); std::vector children_h(n_edges * 2); std::vector out_size_h(n_edges); diff --git a/cpp/include/raft/sparse/linalg/detail/add.cuh b/cpp/include/raft/sparse/linalg/detail/add.cuh index 6ef619108a..61b72596b5 100644 --- a/cpp/include/raft/sparse/linalg/detail/add.cuh +++ b/cpp/include/raft/sparse/linalg/detail/add.cuh @@ -189,14 +189,14 @@ size_t csr_add_calc_inds(const int* a_ind, dim3 blk(TPB_X, 1, 1); rmm::device_uvector row_counts(m + 1, stream); - CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream)); csr_add_calc_row_counts_kernel<<>>( a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, row_counts.data()); int cnnz = 0; raft::update_host(&cnnz, row_counts.data() + m, 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); // create csr compressed row index from row counts thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); @@ -243,7 +243,7 @@ void csr_add_finalize(const int* a_ind, csr_add_kernel<<>>( a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, c_ind, c_indptr, c_val); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } }; // end NAMESPACE detail diff --git a/cpp/include/raft/sparse/linalg/detail/degree.cuh b/cpp/include/raft/sparse/linalg/detail/degree.cuh index 08a140d420..dfbeb09a5b 100644 --- a/cpp/include/raft/sparse/linalg/detail/degree.cuh +++ b/cpp/include/raft/sparse/linalg/detail/degree.cuh @@ -61,7 +61,7 @@ void coo_degree(const T* rows, int nnz, T* results, cudaStream_t stream) dim3 blk_rc(TPB_X, 1, 1); coo_degree_kernel<<>>(rows, nnz, results); - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } template diff --git a/cpp/include/raft/sparse/linalg/detail/norm.cuh b/cpp/include/raft/sparse/linalg/detail/norm.cuh index 742d914951..2ba661c938 100644 --- a/cpp/include/raft/sparse/linalg/detail/norm.cuh +++ b/cpp/include/raft/sparse/linalg/detail/norm.cuh @@ -100,7 +100,7 @@ void csr_row_normalize_l1(const int* ia, // csr row ex_scan (sorted by row) dim3 blk(TPB_X, 1, 1); csr_row_normalize_l1_kernel<<>>(ia, vals, nnz, m, result); - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } template @@ -167,7 +167,7 @@ void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) dim3 blk(TPB_X, 1, 1); csr_row_normalize_max_kernel<<>>(ia, vals, nnz, m, result); - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } }; // end NAMESPACE detail diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh index 521b1ea7ec..016dccd161 100644 --- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh @@ -51,7 +51,7 @@ void fit_embedding(const raft::handle_t& handle, rmm::device_uvector eigVecs(n * (n_components + 1), stream); rmm::device_uvector labels(n, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); /** * Raft spectral clustering @@ -105,7 +105,7 @@ void fit_embedding(const raft::handle_t& handle, raft::copy(out, eigVecs.data() + n, n * n_components, stream); - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } }; // namespace detail diff --git a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh index ea7f2f2fad..85c47ef97b 100644 --- a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh @@ -158,7 +158,7 @@ void coo_symmetrize(COO* in, in->n_rows, in->nnz, reduction_op); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } /** @@ -284,18 +284,18 @@ void from_knn_symmetrize_matrix(const value_idx* __restrict__ knn_indices, // Notice n+1 since we can reuse these arrays for transpose_edges, original_edges in step (4) rmm::device_uvector row_sizes(n, stream); - CUDA_CHECK(cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); rmm::device_uvector row_sizes2(n, stream); - CUDA_CHECK(cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); symmetric_find_size<<>>( knn_dists, knn_indices, n, k, row_sizes.data(), row_sizes2.data()); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); reduce_find_size<<>>( n, k, row_sizes.data(), row_sizes2.data()); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); // (2) Compute final space needed (n*k + sum(row_sizes)) == 2*n*k // Notice we don't do any merging and leave the result as 2*NNZ @@ -318,7 +318,7 @@ void from_knn_symmetrize_matrix(const value_idx* __restrict__ knn_indices, // (5) Perform final data + data.T operation in tandem with memcpying symmetric_sum<<>>( edges, knn_dists, knn_indices, out->vals(), out->cols(), out->rows(), n, k); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } /** diff --git a/cpp/include/raft/sparse/linalg/detail/transpose.h b/cpp/include/raft/sparse/linalg/detail/transpose.h index ae22a93d15..55652e2275 100644 --- a/cpp/include/raft/sparse/linalg/detail/transpose.h +++ b/cpp/include/raft/sparse/linalg/detail/transpose.h @@ -70,39 +70,39 @@ void csr_transpose(cusparseHandle_t handle, { size_t convert_csc_workspace_size = 0; - CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize(handle, - csr_nrows, - csr_ncols, - nnz, - csr_data, - csr_indptr, - csr_indices, - csc_data, - csc_indptr, - csc_indices, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - &convert_csc_workspace_size, - stream)); + RAFT_CUSPARSE_TRY(raft::sparse::cusparsecsr2csc_bufferSize(handle, + csr_nrows, + csr_ncols, + nnz, + csr_data, + csr_indptr, + csr_indices, + csc_data, + csc_indptr, + csc_indices, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + &convert_csc_workspace_size, + stream)); rmm::device_uvector convert_csc_workspace(convert_csc_workspace_size, stream); - CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc(handle, - csr_nrows, - csr_ncols, - nnz, - csr_data, - csr_indptr, - csr_indices, - csc_data, - csc_indptr, - csc_indices, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - convert_csc_workspace.data(), - stream)); + RAFT_CUSPARSE_TRY(raft::sparse::cusparsecsr2csc(handle, + csr_nrows, + csr_ncols, + nnz, + csr_data, + csr_indptr, + csr_indices, + csc_data, + csc_indptr, + csc_indices, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + convert_csc_workspace.data(), + stream)); } }; // end NAMESPACE detail diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh index 5591e15b19..5397b3fb95 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh @@ -93,7 +93,7 @@ MST_solver::MST_solver(const raft::han mst_edge_count.set_value_to_zero_async(stream); prev_mst_edge_count.set_value_to_zero_async(stream); - CUDA_CHECK(cudaMemsetAsync(mst_edge.data(), 0, mst_edge.size() * sizeof(bool), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(mst_edge.data(), 0, mst_edge.size() * sizeof(bool), stream)); // Initially, color holds the vertex id as color auto policy = handle.get_thrust_policy(); diff --git a/cpp/include/raft/sparse/op/detail/filter.cuh b/cpp/include/raft/sparse/op/detail/filter.cuh index 31ec1eed22..b5d819ebac 100644 --- a/cpp/include/raft/sparse/op/detail/filter.cuh +++ b/cpp/include/raft/sparse/op/detail/filter.cuh @@ -107,18 +107,18 @@ void coo_remove_scalar(const int* rows, rmm::device_uvector ex_scan(n, stream); rmm::device_uvector cur_ex_scan(n, stream); - CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream)); - CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream)); thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); thrust::device_ptr dev_ex_scan = thrust::device_pointer_cast(ex_scan.data()); thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); thrust::device_ptr dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data()); thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); dim3 grid(raft::ceildiv(n, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); @@ -134,7 +134,7 @@ void coo_remove_scalar(const int* rows, dev_cur_ex_scan.get(), n, scalar); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } /** @@ -151,14 +151,14 @@ void coo_remove_scalar(COO* in, COO* out, T scalar, cudaStream_t stream) rmm::device_uvector row_count_nz(in->n_rows, stream); rmm::device_uvector row_count(in->n_rows, stream); - CUDA_CHECK(cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); - CUDA_CHECK(cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream)); linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); linalg::coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, row_count_nz.data(), stream); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); thrust::device_ptr d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data()); int out_nnz = @@ -178,7 +178,7 @@ void coo_remove_scalar(COO* in, COO* out, T scalar, cudaStream_t stream) scalar, in->n_rows, stream); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } /** diff --git a/cpp/include/raft/sparse/op/detail/reduce.cuh b/cpp/include/raft/sparse/op/detail/reduce.cuh index 55a8ee2948..a959e4a3f7 100644 --- a/cpp/include/raft/sparse/op/detail/reduce.cuh +++ b/cpp/include/raft/sparse/op/detail/reduce.cuh @@ -102,7 +102,7 @@ template void compute_duplicates_mask( value_idx* mask, const value_idx* rows, const value_idx* cols, size_t nnz, cudaStream_t stream) { - CUDA_CHECK(cudaMemsetAsync(mask, 0, nnz * sizeof(value_idx), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(mask, 0, nnz * sizeof(value_idx), stream)); compute_duplicates_diffs_kernel<<>>( rows, cols, mask, nnz); @@ -147,7 +147,7 @@ void max_duplicates(const raft::handle_t& handle, // compute final size value_idx size = 0; raft::update_host(&size, diff.data() + (diff.size() - 1), 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); size++; out.allocate(size, m, n, true, stream); diff --git a/cpp/include/raft/sparse/op/detail/row_op.cuh b/cpp/include/raft/sparse/op/detail/row_op.cuh index 4fd76a0202..402e8dcce5 100644 --- a/cpp/include/raft/sparse/op/detail/row_op.cuh +++ b/cpp/include/raft/sparse/op/detail/row_op.cuh @@ -67,7 +67,7 @@ void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cud dim3 blk(TPB_X, 1, 1); csr_row_op_kernel<<>>(row_ind, n_rows, nnz, op); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } }; // namespace detail diff --git a/cpp/include/raft/sparse/op/detail/slice.h b/cpp/include/raft/sparse/op/detail/slice.h index 4a09f4af7f..366f37bc46 100644 --- a/cpp/include/raft/sparse/op/detail/slice.h +++ b/cpp/include/raft/sparse/op/detail/slice.h @@ -62,7 +62,7 @@ void csr_row_slice_indptr(value_idx start_row, raft::update_host(start_offset, indptr + start_row, 1, stream); raft::update_host(stop_offset, indptr + stop_row + 1, 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); value_idx s_offset = *start_offset; diff --git a/cpp/include/raft/sparse/selection/detail/connect_components.cuh b/cpp/include/raft/sparse/selection/detail/connect_components.cuh index 35101f1714..8f420a67f4 100644 --- a/cpp/include/raft/sparse/selection/detail/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/detail/connect_components.cuh @@ -415,7 +415,7 @@ void connect_components( // compute final size value_idx size = 0; raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); size++; diff --git a/cpp/include/raft/sparse/selection/detail/knn.cuh b/cpp/include/raft/sparse/selection/detail/knn.cuh index d29fd59a88..efb8d0201d 100644 --- a/cpp/include/raft/sparse/selection/detail/knn.cuh +++ b/cpp/include/raft/sparse/selection/detail/knn.cuh @@ -232,7 +232,7 @@ class sparse_knn_t { size_t dense_size = idx_batcher.batch_rows() * query_batcher.batch_rows(); rmm::device_uvector batch_dists(dense_size, handle.get_stream()); - CUDA_CHECK(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t))); compute_distances(idx_batcher, query_batcher, diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 7f4e4511d2..b7f124c51e 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -124,7 +124,7 @@ void approx_knn_build_index(raft::handle_t& handle, IntType D) { int device; - CUDA_CHECK(cudaGetDevice(&device)); + RAFT_CUDA_TRY(cudaGetDevice(&device)); faiss::gpu::StandardGpuResources* gpu_res = new faiss::gpu::StandardGpuResources(); gpu_res->noTempMemory(); diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh index 47fc62066d..27a23034c5 100644 --- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh +++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh @@ -621,7 +621,7 @@ void fusedL2UnexpKnnImpl(const DataT* x, worksize = sizeof(int32_t) * numMutexes; return; } else { - CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int32_t) * numMutexes, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(workspace, 0, sizeof(int32_t) * numMutexes, stream)); } } @@ -645,7 +645,7 @@ void fusedL2UnexpKnnImpl(const DataT* x, } else { } - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } template <<>>( inK, inV, outK, outV, n_samples, n_parts, kInit, vInit, k, translations); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } /** @@ -265,7 +265,7 @@ void brute_force_knn_impl( } int device; - CUDA_CHECK(cudaGetDevice(&device)); + RAFT_CUDA_TRY(cudaGetDevice(&device)); rmm::device_uvector trans(id_ranges->size(), userStream); raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream); @@ -285,7 +285,7 @@ void brute_force_knn_impl( } // Sync user stream only if using other streams to parallelize query - if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream)); + if (n_int_streams > 0) RAFT_CUDA_TRY(cudaStreamSynchronize(userStream)); for (size_t i = 0; i < input.size(); i++) { float* out_d_ptr = out_D + (i * k * n); @@ -294,73 +294,72 @@ void brute_force_knn_impl( cudaStream_t stream = raft::select_stream(userStream, internalStreams, n_int_streams, i); // // TODO: Enable this once we figure out why it's causing pytest failures in cuml. - // if (k <= 64 && rowMajorQuery == rowMajorIndex && rowMajorQuery == true && - // (metric == raft::distance::DistanceType::L2Unexpanded || - // metric == raft::distance::DistanceType::L2SqrtUnexpanded //|| - // // metric == raft::distance::DistanceType::L2Expanded || - // // metric == raft::distance::DistanceType::L2SqrtExpanded) - // )) { - // fusedL2Knn(D, - // out_i_ptr, - // out_d_ptr, - // input[i], - // search_items, - // sizes[i], - // n, - // k, - // rowMajorIndex, - // rowMajorQuery, - // stream, - // metric); - // } else { - switch (metric) { - case raft::distance::DistanceType::Haversine: - - ASSERT(D == 2, - "Haversine distance requires 2 dimensions " - "(latitude / longitude)."); - - haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, stream); - break; - default: - faiss::MetricType m = build_faiss_metric(metric); - - faiss::gpu::StandardGpuResources gpu_res; - - gpu_res.noTempMemory(); - gpu_res.setDefaultStream(device, stream); - - faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.metricArg = metricArg; - args.k = k; - args.dims = D; - args.vectors = input[i]; - args.vectorsRowMajor = rowMajorIndex; - args.numVectors = sizes[i]; - args.queries = search_items; - args.queriesRowMajor = rowMajorQuery; - args.numQueries = n; - args.outDistances = out_d_ptr; - args.outIndices = out_i_ptr; - - /** - * @todo: Until FAISS supports pluggable allocation strategies, - * we will not reap the benefits of the pool allocator for - * avoiding device-wide synchronizations from cudaMalloc/cudaFree - */ - bfKnn(&gpu_res, args); + if (k <= 64 && rowMajorQuery == rowMajorIndex && rowMajorQuery == true && + (metric == raft::distance::DistanceType::L2Unexpanded || + metric == raft::distance::DistanceType::L2SqrtUnexpanded || + metric == raft::distance::DistanceType::L2Expanded || + metric == raft::distance::DistanceType::L2SqrtExpanded)) { + fusedL2Knn(D, + out_i_ptr, + out_d_ptr, + input[i], + search_items, + sizes[i], + n, + k, + rowMajorIndex, + rowMajorQuery, + stream, + metric); + } else { + switch (metric) { + case raft::distance::DistanceType::Haversine: + + ASSERT(D == 2, + "Haversine distance requires 2 dimensions " + "(latitude / longitude)."); + + haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, stream); + break; + default: + faiss::MetricType m = build_faiss_metric(metric); + + faiss::gpu::StandardGpuResources gpu_res; + + gpu_res.noTempMemory(); + gpu_res.setDefaultStream(device, stream); + + faiss::gpu::GpuDistanceParams args; + args.metric = m; + args.metricArg = metricArg; + args.k = k; + args.dims = D; + args.vectors = input[i]; + args.vectorsRowMajor = rowMajorIndex; + args.numVectors = sizes[i]; + args.queries = search_items; + args.queriesRowMajor = rowMajorQuery; + args.numQueries = n; + args.outDistances = out_d_ptr; + args.outIndices = out_i_ptr; + + /** + * @todo: Until FAISS supports pluggable allocation strategies, + * we will not reap the benefits of the pool allocator for + * avoiding device-wide synchronizations from cudaMalloc/cudaFree + */ + bfKnn(&gpu_res, args); + } } - } - CUDA_CHECK(cudaPeekAtLastError()); - // } + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } // Sync internal streams if used. We don't need to // sync the user stream because we'll already have // fully serial execution. for (int i = 0; i < n_int_streams; i++) { - CUDA_CHECK(cudaStreamSynchronize(internalStreams[i])); + RAFT_CUDA_TRY(cudaStreamSynchronize(internalStreams[i])); } if (input.size() > 1 || translations != nullptr) { @@ -379,7 +378,11 @@ void brute_force_knn_impl( float p = 0.5; // standard l2 if (metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / metricArg; raft::linalg::unaryOp( - res_D, res_D, n * k, [p] __device__(float input) { return powf(input, p); }, userStream); + res_D, + res_D, + n * k, + [p] __device__(float input) { return powf(fabsf(input), p); }, + userStream); } query_metric_processor->revert(search_items); diff --git a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh index 88fa58a4d7..327efe49bb 100644 --- a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh @@ -110,7 +110,7 @@ inline void select_k_impl(value_t* inK, select_k_kernel <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k); } - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } /** diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 18b23bea55..549dd4917c 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -392,9 +392,9 @@ static int chooseNewCentroid(handle_t const& handle, // linear interpolation logic: //{ value_type_t minSum{0}; - CUDA_TRY( + RAFT_CUDA_TRY( cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); if (distsSum > minSum) { value_type_t vIndex = static_cast(n - 1); @@ -404,16 +404,16 @@ static int chooseNewCentroid(handle_t const& handle, } //} - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); obsIndex = max(obsIndex, 0); obsIndex = min(obsIndex, n - 1); // Record new centroid position - CUDA_TRY(cudaMemcpyAsync(centroid, - obs + IDX(0, obsIndex, d), - d * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(centroid, + obs + IDX(0, obsIndex, d), + d * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); return 0; } @@ -486,21 +486,21 @@ static int initializeCentroids(handle_t const& handle, dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1}; // Assign observation vectors to code 0 - CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); // Choose first centroid thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid - CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); computeDistances<<>>(n, d, 1, obs, centroids, dists); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Choose remaining centroids for (i = 1; i < k; ++i) { @@ -512,17 +512,17 @@ static int initializeCentroids(handle_t const& handle, CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); computeDistances<<>>( n, d, 1, obs, centroids + IDX(0, i, d), dists + n); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Recompute minimum distances minDistances2<<>>(n, dists, dists + n, codes, i); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); } // Compute cluster sizes CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); computeClusterSizes<<>>(n, codes, clusterSizes); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); return 0; } @@ -569,7 +569,7 @@ static int assignCentroids(handle_t const& handle, auto thrust_exec_policy = handle.get_thrust_policy(); // Compute distance between centroids and observation vectors - CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); // CUDA grid dimensions dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; @@ -581,7 +581,7 @@ static int assignCentroids(handle_t const& handle, gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); computeDistances<<>>(n, d, k, obs, centroids, dists); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Find centroid closest to each observation vector CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); @@ -657,59 +657,59 @@ static int updateCentroids(handle_t const& handle, thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix - CUBLAS_CHECK(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - d, - &one, - obs, - d, - &zero, - (value_type_t*)NULL, - n, - thrust::raw_pointer_cast(obs_copy), - n, - stream)); + RAFT_CUBLAS_TRY(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + n, + d, + &one, + obs, + d, + &zero, + (value_type_t*)NULL, + n, + thrust::raw_pointer_cast(obs_copy), + n, + stream)); // Cluster assigned to each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); thrust::transform(thrust_exec_policy, rows, rows + d * n, thrust::make_constant_iterator(n), rows, thrust::modulus()); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); thrust::gather( thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Row associated with each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); thrust::transform(thrust_exec_policy, rows, rows + d * n, thrust::make_constant_iterator(n), rows, thrust::divides()); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Sort and reduce to add observation vectors in same cluster thrust::stable_sort_by_key(thrust_exec_policy, codes_copy, codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); thrust::reduce_by_key(thrust_exec_policy, rows, rows + d * n, obs_copy, codes_copy, // Output to codes_copy is ignored thrust::device_pointer_cast(centroids)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Divide sums by cluster size to get centroid matrix // @@ -722,7 +722,7 @@ static int updateCentroids(handle_t const& handle, 1}; divideCentroids<<>>(d, k, clusterSizes, centroids); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); return 0; } @@ -829,30 +829,30 @@ int kmeans(handle_t const& handle, CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); computeDistances<<>>(n, d, 1, obs, centroids, work); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); *residual_host = thrust::reduce( thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); return 0; } if (n <= k) { thrust::sequence(thrust_exec_policy, thrust::device_pointer_cast(codes), thrust::device_pointer_cast(codes + n)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); if (n < k) - CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream)); - CUDA_TRY(cudaMemcpyAsync( + RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync( centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); *residual_host = 0; return 0; } // Initialize cuBLAS - CUBLAS_CHECK(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + RAFT_CUBLAS_TRY(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // k-means++ algorithm @@ -895,7 +895,7 @@ int kmeans(handle_t const& handle, thrust::device_pointer_cast(clusterSizes + k), 0) - thrust::device_pointer_cast(clusterSizes)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); } // Check for convergence diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 9d1f899d66..0d79904707 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -208,24 +208,24 @@ struct sparse_matrix_t { // void*; the casts should be harmless) // cusparseSpMatDescr_t matA; - CUSPARSE_CHECK(cusparsecreatecsr(&matA, - nrows_, - ncols_, - nnz_, - const_cast(row_offsets_), - const_cast(col_indices_), - const_cast(values_))); + RAFT_CUSPARSE_TRY(cusparsecreatecsr(&matA, + nrows_, + ncols_, + nnz_, + const_cast(row_offsets_), + const_cast(col_indices_), + const_cast(values_))); cusparseDnVecDescr_t vecX; - CUSPARSE_CHECK(cusparsecreatednvec(&vecX, size_x, x)); + RAFT_CUSPARSE_TRY(cusparsecreatednvec(&vecX, size_x, x)); cusparseDnVecDescr_t vecY; - CUSPARSE_CHECK(cusparsecreatednvec(&vecY, size_y, y)); + RAFT_CUSPARSE_TRY(cusparsecreatednvec(&vecY, size_y, y)); // get (scratch) external device buffer size: // size_t bufferSize; - CUSPARSE_CHECK(cusparsespmv_buffersize( + RAFT_CUSPARSE_TRY(cusparsespmv_buffersize( cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, &bufferSize, stream)); // allocate external buffer: @@ -234,40 +234,40 @@ struct sparse_matrix_t { // finally perform SpMV: // - CUSPARSE_CHECK(cusparsespmv( + RAFT_CUSPARSE_TRY(cusparsespmv( cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, external_buffer.raw(), stream)); // free descriptors: //(TODO: maybe wrap them in a RAII struct?) // - CUSPARSE_CHECK(cusparseDestroyDnVec(vecY)); - CUSPARSE_CHECK(cusparseDestroyDnVec(vecX)); - CUSPARSE_CHECK(cusparseDestroySpMat(matA)); + RAFT_CUSPARSE_TRY(cusparseDestroyDnVec(vecY)); + RAFT_CUSPARSE_TRY(cusparseDestroyDnVec(vecX)); + RAFT_CUSPARSE_TRY(cusparseDestroySpMat(matA)); #else - CUSPARSE_CHECK(cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); + RAFT_CUSPARSE_TRY(cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); cusparseMatDescr_t descr = 0; - CUSPARSE_CHECK(cusparseCreateMatDescr(&descr)); + RAFT_CUSPARSE_TRY(cusparseCreateMatDescr(&descr)); if (symmetric) { - CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_SYMMETRIC)); + RAFT_CUSPARSE_TRY(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_SYMMETRIC)); } else { - CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + RAFT_CUSPARSE_TRY(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); } - CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); - CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, - trans, - nrows_, - ncols_, - nnz_, - &alpha, - descr, - values_, - row_offsets_, - col_indices_, - x, - &beta, - y, - stream)); - CUSPARSE_CHECK(cusparseDestroyMatDescr(descr)); + RAFT_CUSPARSE_TRY(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + RAFT_CUSPARSE_TRY(cusparsecsrmv(cusparse_h, + trans, + nrows_, + ncols_, + nnz_, + &alpha, + descr, + values_, + row_offsets_, + col_indices_, + x, + &beta, + y, + stream)); + RAFT_CUSPARSE_TRY(cusparseDestroyMatDescr(descr)); #endif } @@ -349,7 +349,7 @@ struct laplacian_matrix_t : sparse_matrix_t { if (beta == 0) { CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream)); } else if (beta != 1) { - CUBLAS_CHECK(linalg::cublasscal(cublas_h, n, &beta, y, 1, stream)); + RAFT_CUBLAS_TRY(linalg::cublasscal(cublas_h, n, &beta, y, 1, stream)); } // Apply diagonal matrix @@ -358,7 +358,7 @@ struct laplacian_matrix_t : sparse_matrix_t { dim3 blockDim{BLOCK_SIZE, 1, 1}; diagmv<<>>(n, alpha, diagonal_.raw(), x, y); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Apply adjacency matrix // @@ -412,26 +412,26 @@ struct modularity_matrix_t : laplacian_matrix_t { // gamma = d'*x // // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); - CUBLAS_CHECK(linalg::cublasdot(cublas_h, - n, - laplacian_matrix_t::diagonal_.raw(), - 1, - x, - 1, - &dot_res, - stream)); + RAFT_CUBLAS_TRY(linalg::cublasdot(cublas_h, + n, + laplacian_matrix_t::diagonal_.raw(), + 1, + x, + 1, + &dot_res, + stream)); // y = y -(gamma/edge_sum)*d // value_type gamma_ = -dot_res / edge_sum_; - CUBLAS_CHECK(linalg::cublasaxpy(cublas_h, - n, - &gamma_, - laplacian_matrix_t::diagonal_.raw(), - 1, - y, - 1, - stream)); + RAFT_CUBLAS_TRY(linalg::cublasaxpy(cublas_h, + n, + &gamma_, + laplacian_matrix_t::diagonal_.raw(), + 1, + y, + 1, + stream)); } value_type edge_sum_; diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 0e0e47ddf3..c61b5f1458 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -118,7 +118,7 @@ std::tuple modularity_maximization( // notice that at this point the matrix has already been transposed, so we are scaling // columns scale_obs(nEigVecs, n, eigVecs); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Find partition clustering auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); @@ -160,7 +160,7 @@ void analyzeModularity(handle_t const& handle, vector_t Bx(handle, n); // Initialize cuBLAS - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity modularity_matrix_t B{handle, csr_m}; diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index b52bfcc0d6..5b1478baa9 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -152,7 +152,7 @@ void analyzePartition(handle_t const& handle, vector_t Lx(handle, n); // Initialize cuBLAS - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Laplacian /// sparse_matrix_t A{handle, graph}; diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 44b4af4bdc..a30906de10 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -122,7 +122,7 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, mean = thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); mean /= n; thrust::transform(thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), @@ -130,9 +130,9 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, thrust::make_constant_iterator(mean), thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::minus()); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); std /= std::sqrt(static_cast(n)); @@ -142,31 +142,31 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, thrust::make_constant_iterator(std), thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::divides()); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); } // Transpose eigenvector matrix // TODO: in-place transpose { vector_t work(handle, nEigVecs * n); - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - CUBLAS_CHECK(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - nEigVecs, - n, - &one, - eigVecs, - n, - &zero, - (weight_t*)NULL, - nEigVecs, - work.raw(), - nEigVecs, - stream)); - - CUDA_TRY(cudaMemcpyAsync( + RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + RAFT_CUBLAS_TRY(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + nEigVecs, + n, + &one, + eigVecs, + n, + &zero, + (weight_t*)NULL, + nEigVecs, + work.raw(), + nEigVecs, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync( eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream)); } } @@ -213,17 +213,17 @@ bool construct_indicator(handle_t const& handle, thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters + n), thrust::device_pointer_cast(part_i.raw() + n))), equal_to_i_op(index)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Compute size of ith partition - CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); + RAFT_CUBLAS_TRY(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); clustersize = round(clustersize); if (clustersize < 0.5) { return false; } // Compute part stats B.mv(1, part_i.raw(), 0, Bx.raw()); - CUBLAS_CHECK(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + RAFT_CUBLAS_TRY(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); return true; } diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh index e8e6bea4dd..899e378d38 100644 --- a/cpp/include/raft/stats/detail/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -71,15 +71,15 @@ void mean( static const int ColsPerBlk = 32; static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - CUDA_CHECK(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream)); meanKernelRowMajor<<>>(mu, data, D, N); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::scalarMultiply(mu, mu, ratio, D, stream); } else { meanKernelColMajor<<>>(mu, data, D, N); } - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } } // namespace detail diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh index 42351269ea..229eb34a7d 100644 --- a/cpp/include/raft/stats/detail/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -118,7 +118,7 @@ void stddev(Type* std, static const int ColsPerBlk = 32; static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - CUDA_CHECK(cudaMemset(std, 0, sizeof(Type) * D)); + RAFT_CUDA_TRY(cudaMemset(std, 0, sizeof(Type) * D)); stddevKernelRowMajor<<>>(std, data, D, N); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( @@ -131,7 +131,7 @@ void stddev(Type* std, } else { stddevKernelColMajor<<>>(std, data, mu, D, N); } - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } /** @@ -168,7 +168,7 @@ void vars(Type* var, static const int ColsPerBlk = 32; static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - CUDA_CHECK(cudaMemset(var, 0, sizeof(Type) * D)); + RAFT_CUDA_TRY(cudaMemset(var, 0, sizeof(Type) * D)); stddevKernelRowMajor<<>>(var, data, D, N); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( @@ -176,7 +176,7 @@ void vars(Type* var, } else { varsKernelColMajor<<>>(var, data, mu, D, N); } - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } } // namespace detail diff --git a/cpp/include/raft/stats/detail/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh index b7f5cc8ff7..1db504965c 100644 --- a/cpp/include/raft/stats/detail/sum.cuh +++ b/cpp/include/raft/stats/detail/sum.cuh @@ -70,13 +70,13 @@ void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, c static const int ColsPerBlk = 32; static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - CUDA_CHECK(cudaMemset(output, 0, sizeof(Type) * D)); + RAFT_CUDA_TRY(cudaMemset(output, 0, sizeof(Type) * D)); sumKernelRowMajor <<>>(output, input, D, N); } else { sumKernelColMajor<<>>(output, input, D, N); } - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } } // namespace detail diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/cudart_utils.cpp index 150767992f..d9c69ce519 100644 --- a/cpp/test/cudart_utils.cpp +++ b/cpp/test/cudart_utils.cpp @@ -25,7 +25,7 @@ TEST(Raft, Utils) ASSERT_NO_THROW(ASSERT(1 == 1, "Should not assert!")); ASSERT_THROW(ASSERT(1 != 1, "Should assert!"), exception); ASSERT_THROW(THROW("Should throw!"), exception); - ASSERT_NO_THROW(CUDA_CHECK(cudaFree(nullptr))); + ASSERT_NO_THROW(RAFT_CUDA_TRY(cudaFree(nullptr))); } } // namespace raft diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu index 21d7e9d753..a748b0ef0e 100644 --- a/cpp/test/distance/dist_adj.cu +++ b/cpp/test/distance/dist_adj.cu @@ -62,7 +62,7 @@ void naiveDistanceAdj(bool* dist, static const dim3 TPB(16, 32, 1); dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1); naiveDistanceAdjKernel<<>>(dist, x, y, m, n, k, eps, isRowMajor); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -127,7 +127,7 @@ class DistanceAdjTest : public ::testing::TestWithParam #include +#include #include #include #include @@ -352,7 +353,7 @@ void naiveDistance(DataType* dist, break; default: FAIL() << "should be here\n"; } - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -410,6 +411,9 @@ class DistanceTest : public ::testing::TestWithParam> { void SetUp() override { + auto testInfo = testing::UnitTest::GetInstance()->current_test_info(); + RAFT_USING_RANGE("test::%s/%s", testInfo->test_suite_name(), testInfo->name()); + raft::random::Rng r(params.seed); int m = params.m; int n = params.n; @@ -453,7 +457,7 @@ class DistanceTest : public ::testing::TestWithParam> { stream, isRowMajor, metric_arg); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 932857c536..68ad220734 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -86,15 +86,15 @@ void naive(cub::KeyValuePair* min, { static const dim3 TPB(32, 16, 1); dim3 nblks(raft::ceildiv(n, (int)TPB.x), raft::ceildiv(m, (int)TPB.y), 1); - CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); auto blks = raft::ceildiv(m, 256); MinAndDistanceReduceOp op; detail::initKernel, int> <<>>(min, m, std::numeric_limits::max(), op); - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); naiveKernel, 16> <<>>(min, x, y, m, n, k, workspace, std::numeric_limits::max()); - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); } template @@ -132,7 +132,7 @@ class FusedL2NNTest : public ::testing::TestWithParam> { generateGoldenResult(); raft::linalg::rowNorm(xn.data(), x.data(), k, m, raft::linalg::L2Norm, true, stream); raft::linalg::rowNorm(yn.data(), y.data(), k, n, raft::linalg::L2Norm, true, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: @@ -175,7 +175,7 @@ class FusedL2NNTest : public ::testing::TestWithParam> { Sqrt, true, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } }; @@ -217,7 +217,7 @@ template std::shared_ptr act_h(new KVP[size]); raft::update_host(exp_h.get(), expected, size, stream); raft::update_host(act_h.get(), actual, size, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); for (size_t i(0); i < size; ++i) { auto exp = exp_h.get()[i]; auto act = act_h.get()[i]; @@ -308,7 +308,7 @@ class FusedL2NNDetTest : public FusedL2NNTest { FusedL2NNTest::SetUp(); int m = this->params.m; min1.resize(m, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } void TearDown() override { FusedL2NNTest::TearDown(); } diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index dc7de92eb8..1354124d6a 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -27,6 +28,7 @@ namespace raft { TEST(Raft, EigenSolvers) { + RAFT_USING_RANGE("test::EigenSolvers"); using namespace matrix; using index_type = int; using value_type = double; @@ -67,6 +69,7 @@ TEST(Raft, EigenSolvers) TEST(Raft, SpectralSolvers) { + RAFT_USING_RANGE("test::SpectralSolvers"); using namespace matrix; using index_type = int; using value_type = double; diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp index 698a601e85..81b8bb6c6c 100644 --- a/cpp/test/handle.cpp +++ b/cpp/test/handle.cpp @@ -38,11 +38,11 @@ TEST(Raft, Handle) handle_t h(4); ASSERT_EQ(4, h.get_num_internal_streams()); cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); h.set_stream(stream); ASSERT_EQ(stream, h.get_stream()); - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaStreamDestroy(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } TEST(Raft, GetInternalStreams) diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu index d983ec1162..4b56a9ad6f 100644 --- a/cpp/test/label/label.cu +++ b/cpp/test/label/label.cu @@ -38,7 +38,7 @@ typedef labelTest MakeMonotonicTest; TEST_F(MakeMonotonicTest, Result) { cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); int m = 12; @@ -55,7 +55,7 @@ TEST_F(MakeMonotonicTest, Result) make_monotonic(actual.data(), data.data(), m, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); ASSERT_TRUE(devArrMatch(actual.data(), expected.data(), m, raft::Compare(), stream)); @@ -66,7 +66,7 @@ TEST_F(MakeMonotonicTest, Result) TEST(labelTest, Classlabels) { cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); int n_rows = 6; rmm::device_uvector y_d(n_rows, stream); diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu index 17b000044e..2b51f4640a 100644 --- a/cpp/test/linalg/add.cu +++ b/cpp/test/linalg/add.cu @@ -47,7 +47,7 @@ class AddTest : public ::testing::TestWithParam> { r.uniform(in2.data(), len, InT(-1.0), InT(1.0), stream); naiveAddElem(out_ref.data(), in1.data(), in2.data(), len); add(out.data(), in1.data(), in2.data(), len, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } void compare() diff --git a/cpp/test/linalg/add.cuh b/cpp/test/linalg/add.cuh index 1d9352bfc1..5e887e0040 100644 --- a/cpp/test/linalg/add.cuh +++ b/cpp/test/linalg/add.cuh @@ -35,7 +35,7 @@ void naiveAddElem(OutT* out, const InT* in1, const InT* in2, int len) static const int TPB = 64; int nblks = raft::ceildiv(len, TPB); naiveAddElemKernel<<>>(out, in1, in2, len); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu index c833faa0b2..bb62ddced3 100644 --- a/cpp/test/linalg/binary_op.cu +++ b/cpp/test/linalg/binary_op.cu @@ -58,7 +58,7 @@ class BinaryOpTest : public ::testing::TestWithParam x(n, stream); rmm::device_uvector y(n, stream); rmm::device_uvector z(n, stream); - CUDA_CHECK(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream)); - CUDA_CHECK(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream)); raft::linalg::binaryOp( z.data() + 9, x.data() + 137, diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/test/linalg/binary_op.cuh index 97cb3ecb24..60450695e7 100644 --- a/cpp/test/linalg/binary_op.cuh +++ b/cpp/test/linalg/binary_op.cuh @@ -36,7 +36,7 @@ void naiveAdd(OutType* out, const InType* in1, const InType* in2, IdxType len) static const IdxType TPB = 64; IdxType nblks = raft::ceildiv(len, TPB); naiveAddKernel<<>>(out, in1, in2, len); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 6c7bbd1232..1c3d99a883 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -38,13 +38,13 @@ class CholeskyR1Test : public ::testing::Test { devInfo(handle.get_stream()), workspace(0, handle.get_stream()) { - CUDA_CHECK(cudaStreamCreate(&stream)); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(G.data(), G_host, n_rows * n_rows, stream); // Allocate workspace solver_handle = handle.get_cusolver_dn_handle(); - CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf_bufferSize( + RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnpotrf_bufferSize( solver_handle, CUBLAS_FILL_MODE_LOWER, n_rows, L.data(), n_rows, &Lwork)); int n_bytes = 0; // Initializing in CUBLAS_FILL_MODE_LOWER, because that has larger workspace @@ -55,7 +55,7 @@ class CholeskyR1Test : public ::testing::Test { workspace.resize(Lwork, stream); } - void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } + void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } void testR1Update() { @@ -70,15 +70,15 @@ class CholeskyR1Test : public ::testing::Test { // Expected solution using Cholesky factorization from scratch raft::copy(L_exp.data(), G.data(), n, stream); - CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf(solver_handle, - uplo, - rank, - L_exp.data(), - n_rows, - (math_t*)workspace.data(), - Lwork, - devInfo.data(), - stream)); + RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnpotrf(solver_handle, + uplo, + rank, + L_exp.data(), + n_rows, + (math_t*)workspace.data(), + Lwork, + devInfo.data(), + stream)); // Incremental Cholesky factorization using rank one updates. raft::linalg::choleskyRank1Update( diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu index 9bb84e1eb7..910e6a2365 100644 --- a/cpp/test/linalg/coalesced_reduction.cu +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -75,7 +75,7 @@ class coalescedReductionTest : public ::testing::TestWithParam<<>>(out, in, scalar, len); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -57,11 +57,11 @@ class DivideTest : public ::testing::TestWithParam> { stream, tol, sweeps); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu index b1e88c91dd..518dce4048 100644 --- a/cpp/test/linalg/eig_sel.cu +++ b/cpp/test/linalg/eig_sel.cu @@ -92,7 +92,7 @@ class EigSelTest : public ::testing::TestWithParam> { eig_vals.data(), EigVecMemUsage::OVERWRITE_INPUT, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu index 5ecca16be6..023b04f8ed 100644 --- a/cpp/test/linalg/eltwise.cu +++ b/cpp/test/linalg/eltwise.cu @@ -38,7 +38,7 @@ void naiveScale(Type* out, const Type* in, Type scalar, int len, cudaStream_t st static const int TPB = 64; int nblks = raft::ceildiv(len, TPB); naiveScaleKernel<<>>(out, in, scalar, len); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -76,7 +76,7 @@ class ScalarMultiplyTest : public ::testing::TestWithParam<<>>(out, in1, in2, len); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -164,7 +164,7 @@ class EltwiseAddTest : public ::testing::TestWithParam> { r.uniform(in2, len, T(-1.0), T(1.0), stream); naiveAdd(out_ref, in1, in2, len, stream); eltwiseAdd(out, in1, in2, len, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu index 6231715c8a..da07ed797e 100644 --- a/cpp/test/linalg/gemm_layout.cu +++ b/cpp/test/linalg/gemm_layout.cu @@ -80,10 +80,10 @@ class GemmLayoutTest : public ::testing::TestWithParam> { size_t yElems = params.K * params.N; size_t zElems = params.M * params.N; - CUDA_CHECK(cudaMalloc(&X, xElems * sizeof(T))); - CUDA_CHECK(cudaMalloc(&Y, yElems * sizeof(T))); - CUDA_CHECK(cudaMalloc(&refZ, zElems * sizeof(T))); - CUDA_CHECK(cudaMalloc(&Z, zElems * sizeof(T))); + RAFT_CUDA_TRY(cudaMalloc(&X, xElems * sizeof(T))); + RAFT_CUDA_TRY(cudaMalloc(&Y, yElems * sizeof(T))); + RAFT_CUDA_TRY(cudaMalloc(&refZ, zElems * sizeof(T))); + RAFT_CUDA_TRY(cudaMalloc(&Z, zElems * sizeof(T))); r.uniform(X, xElems, T(-10.0), T(10.0), stream); r.uniform(Y, yElems, T(-10.0), T(10.0), stream); @@ -109,8 +109,8 @@ class GemmLayoutTest : public ::testing::TestWithParam> { void TearDown() override { - CUDA_CHECK(cudaFree(refZ)); - CUDA_CHECK(cudaFree(Z)); + RAFT_CUDA_TRY(cudaFree(refZ)); + RAFT_CUDA_TRY(cudaFree(Z)); } protected: diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 787d9ba415..f79aac9b7f 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -64,7 +64,7 @@ void create_ref(OutType* out_ref, eltwiseAdd(tmp.data(), in1, in2, len, stream); eltwiseAdd(out_ref, tmp.data(), in3, len, stream); scalarAdd(out_ref, out_ref, (OutType)scalar, len, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } template @@ -93,7 +93,7 @@ class MapTest : public ::testing::TestWithParam<<>>(out, in, len, map); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -87,7 +87,7 @@ class MapReduceTest : public ::testing::TestWithParam> { auto len = params.len; r.uniform(in.data(), len, InType(-1.0), InType(1.0), stream); mapReduceLaunch(out_ref.data(), out.data(), in.data(), len, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: @@ -133,12 +133,12 @@ class MapGenericReduceTest : public ::testing::Test { protected: MapGenericReduceTest() : input(n, handle.get_stream()), output(handle.get_stream()) { - CUDA_CHECK(cudaStreamCreate(&stream)); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); handle.set_stream(stream); initInput(input.data(), input.size(), stream); } - void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } + void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } public: void initInput(InType* input, int n, cudaStream_t stream) diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index 3db7c53041..9f2a1ac78f 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -132,7 +132,7 @@ class MatVecOpTest : public ::testing::TestWithParam> params.bcastAlongRows, params.useTwoVectors, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh index 5f9c6f1ef3..70a68fb542 100644 --- a/cpp/test/linalg/matrix_vector_op.cuh +++ b/cpp/test/linalg/matrix_vector_op.cuh @@ -60,7 +60,7 @@ void naiveMatVec(Type* out, IdxType len = N * D; IdxType nblks = raft::ceildiv(len, TPB); naiveMatVecKernel<<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -105,7 +105,7 @@ void naiveMatVec(Type* out, IdxType nblks = raft::ceildiv(len, TPB); naiveMatVecKernel <<>>(out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, scalar); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } } // end namespace linalg diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu index 2a632d55b2..a4ad0f1d4f 100644 --- a/cpp/test/linalg/multiply.cu +++ b/cpp/test/linalg/multiply.cu @@ -45,7 +45,7 @@ class MultiplyTest : public ::testing::TestWithParam> { r.uniform(in.data(), len, T(-1.0), T(1.0), stream); naiveScale(out_ref.data(), in.data(), params.scalar, len, stream); multiplyScalar(out.data(), in.data(), params.scalar, len, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index 6dae606f18..3fdedc1814 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -67,7 +67,7 @@ void naiveRowNorm( static const int TPB = 64; int nblks = raft::ceildiv(N, TPB); naiveRowNormKernel<<>>(dots, data, D, N, type, do_sqrt); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -95,7 +95,7 @@ class RowNormTest : public ::testing::TestWithParam> { } else { rowNorm(dots_act.data(), data.data(), cols, rows, params.type, params.rowMajor, stream); } - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: @@ -130,7 +130,7 @@ void naiveColNorm( static const int TPB = 64; int nblks = raft::ceildiv(D, TPB); naiveColNormKernel<<>>(dots, data, D, N, type, do_sqrt); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -159,7 +159,7 @@ class ColNormTest : public ::testing::TestWithParam> { } else { colNorm(dots_act.data(), data.data(), cols, rows, params.type, params.rowMajor, stream); } - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index 25ee0a7b77..ba354de2f1 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -96,7 +96,7 @@ class ReduceTest : public ::testing::TestWithParam reduceLaunch( dots_act.data(), data.data(), cols, rows, params.rowMajor, params.alongRows, true, stream); } - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index 82ddfd4661..e74af2c6fe 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -44,7 +44,7 @@ void naiveCoalescedReduction(OutType* dots, const InType* data, int D, int N, cu static const int TPB = 64; int nblks = raft::ceildiv(N, TPB); naiveCoalescedReductionKernel<<>>(dots, data, D, N); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -60,15 +60,15 @@ void unaryAndGemv(OutType* dots, const InType* data, int D, int N, cudaStream_t [] __device__(InType v) { return static_cast(v * v); }, stream); cublasHandle_t handle; - CUBLAS_CHECK(cublasCreate(&handle)); + RAFT_CUBLAS_TRY(cublasCreate(&handle)); rmm::device_uvector ones(N, stream); // column vector [1...1] raft::linalg::unaryOp( ones.data(), ones.data(), ones.size(), [=] __device__(OutType input) { return 1; }, stream); OutType alpha = 1, beta = 0; - CUBLAS_CHECK(raft::linalg::cublasgemv( + RAFT_CUBLAS_TRY(raft::linalg::cublasgemv( handle, CUBLAS_OP_N, D, N, &alpha, sq.data(), D, ones.data(), 1, &beta, dots, 1, stream)); - CUDA_CHECK(cudaDeviceSynchronize()); - CUBLAS_CHECK(cublasDestroy(handle)); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + RAFT_CUBLAS_TRY(cublasDestroy(handle)); } template @@ -89,7 +89,7 @@ void naiveReduction(OutType* dots, } else { naiveCoalescedReduction(dots, data, N, D, stream); } - CUDA_CHECK(cudaDeviceSynchronize()); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); } } // end namespace linalg diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index ac387c16bb..6f3671540e 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -61,7 +61,7 @@ class stridedReductionTest : public ::testing::TestWithParam<<>>(out, in1, in2, len); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -52,7 +52,7 @@ void naiveSubtractScalar(Type* out, const Type* in1, const Type in2, int len, cu static const int TPB = 64; int nblks = raft::ceildiv(len, TPB); naiveSubtractScalarKernel<<>>(out, in1, in2, len); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -96,7 +96,7 @@ class SubtractTest : public ::testing::TestWithParam> { subtractScalar(out.data(), out.data(), T(1), len, stream); subtract(in1.data(), in1.data(), in2.data(), len, stream); subtractScalar(in1.data(), in1.data(), T(1), len, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu index 61c2c2e3db..801067dc96 100644 --- a/cpp/test/linalg/svd.cu +++ b/cpp/test/linalg/svd.cu @@ -91,7 +91,7 @@ class SvdTest : public ::testing::TestWithParam> { true, true, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index fde5599bc1..a63b08e970 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -63,7 +63,7 @@ class TransposeTest : public ::testing::TestWithParam> { transpose(handle, data.data(), data_trans.data(), params.n_row, params.n_col, stream); transpose(data.data(), params.n_row, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu index ff6723973d..333eebe830 100644 --- a/cpp/test/linalg/unary_op.cu +++ b/cpp/test/linalg/unary_op.cu @@ -59,7 +59,7 @@ class UnaryOpTest : public ::testing::TestWithParam(params.tolerance))); } @@ -91,7 +91,7 @@ class WriteOnlyUnaryOpTest : public UnaryOpTest { auto scalar = this->params.scalar; naiveScale(this->out_ref.data(), (OutType*)nullptr, scalar, len, this->stream); unaryOpLaunch(this->out.data(), (OutType*)nullptr, scalar, len, this->stream); - CUDA_CHECK(cudaStreamSynchronize(this->stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(this->stream)); ASSERT_TRUE(devArrMatch(this->out_ref.data(), this->out.data(), this->params.len, diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/test/linalg/unary_op.cuh index 3343389af8..d8ab6fa90a 100644 --- a/cpp/test/linalg/unary_op.cuh +++ b/cpp/test/linalg/unary_op.cuh @@ -43,7 +43,7 @@ void naiveScale(OutType* out, const InType* in, InType scalar, int len, cudaStre static const int TPB = 64; int nblks = raft::ceildiv(len, TPB); naiveScaleKernel<<>>(out, in, scalar, len); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template diff --git a/cpp/test/matrix/linewise_op.cu b/cpp/test/matrix/linewise_op.cu index 26bfa13148..930c3537e3 100644 --- a/cpp/test/matrix/linewise_op.cu +++ b/cpp/test/matrix/linewise_op.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -24,49 +25,15 @@ #include "../linalg/matrix_vector_op.cuh" #include "../test_utils.h" -#ifdef NVTX_ENABLED -#include -#endif - namespace raft { namespace matrix { constexpr std::size_t PTR_PADDING = 128; -template -void PUSH_RANGE(rmm::cuda_stream_view stream, const char* name, Args... args) -{ - int length = std::snprintf(nullptr, 0, name, args...); - assert(length >= 0); - auto buf = std::make_unique(length + 1); - std::snprintf(buf.get(), length + 1, name, args...); - stream.synchronize(); -#ifdef NVTX_ENABLED - nvtxRangePushA(buf.get()); -#endif -} -template <> -void PUSH_RANGE(rmm::cuda_stream_view stream, const char* name) -{ - stream.synchronize(); -#ifdef NVTX_ENABLED - nvtxRangePushA(name); -#endif -} - -void POP_RANGE(rmm::cuda_stream_view stream) -{ - stream.synchronize(); -#ifdef NVTX_ENABLED - nvtxRangePop(); -#endif -} - struct LinewiseTestParams { double tolerance; std::size_t workSizeBytes; uint64_t seed; - bool useVanillaMatrixVectorOp; bool checkCorrectness; int inAlignOffset; int outAlignOffset; @@ -91,10 +58,7 @@ struct LinewiseTest : public ::testing::TestWithParam T { return a + b; }; - if (params.useVanillaMatrixVectorOp) - linalg::matrixVectorOp(out, in, vec, lineLen, nLines, true, alongLines, f, stream); - else - matrix::linewiseOp(out, in, lineLen, nLines, alongLines, f, stream, vec); + matrix::linewiseOp(out, in, lineLen, nLines, alongLines, f, stream, vec); } void runLinewiseSum(T* out, @@ -106,10 +70,7 @@ struct LinewiseTest : public ::testing::TestWithParam T { return a + b + c; }; - if (params.useVanillaMatrixVectorOp) - linalg::matrixVectorOp(out, in, vec1, vec2, lineLen, nLines, true, alongLines, f, stream); - else - matrix::linewiseOp(out, in, lineLen, nLines, alongLines, f, stream, vec1, vec2); + matrix::linewiseOp(out, in, lineLen, nLines, alongLines, f, stream, vec1, vec2); } rmm::device_uvector genData(size_t workSizeBytes) @@ -177,19 +138,19 @@ struct LinewiseTest : public ::testing::TestWithParam(params.tolerance)) @@ -197,9 +158,10 @@ struct LinewiseTest : public ::testing::TestWithParam Params; + typedef std::tuple Params; static LinewiseTestParams read(Params ps) { return {/** .tolerance */ 0.00001, /** .workSizeBytes */ 0 /* not used anyway */, /** .seed */ 42ULL, - /** .useVanillaMatrixVectorOp */ std::get<0>(ps), /** .checkCorrectness */ true, - /** .inAlignOffset */ std::get<1>(ps), - /** .outAlignOffset */ std::get<2>(ps)}; + /** .inAlignOffset */ std::get<0>(ps), + /** .outAlignOffset */ std::get<1>(ps)}; } }; auto MegabyteParams = TinyParams; struct Megabyte { - typedef std::tuple Params; + typedef std::tuple Params; static LinewiseTestParams read(Params ps) { return {/** .tolerance */ 0.00001, /** .workSizeBytes */ 1024 * 1024, /** .seed */ 42ULL, - /** .useVanillaMatrixVectorOp */ std::get<0>(ps), /** .checkCorrectness */ true, - /** .inAlignOffset */ std::get<1>(ps), - /** .outAlignOffset */ std::get<2>(ps)}; + /** .inAlignOffset */ std::get<0>(ps), + /** .outAlignOffset */ std::get<1>(ps)}; } }; -auto GigabyteParams = - ::testing::Combine(::testing::Bool(), ::testing::Values(0, 1, 2), ::testing::Values(0, 1, 2)); +auto GigabyteParams = ::testing::Combine(::testing::Values(0, 1, 2), ::testing::Values(0, 1, 2)); struct Gigabyte { - typedef std::tuple Params; + typedef std::tuple Params; static LinewiseTestParams read(Params ps) { return {/** .tolerance */ 0.00001, /** .workSizeBytes */ 1024 * 1024 * 1024, /** .seed */ 42ULL, - /** .useVanillaMatrixVectorOp */ std::get<0>(ps), /** .checkCorrectness */ false, - /** .inAlignOffset */ std::get<1>(ps), - /** .outAlignOffset */ std::get<2>(ps)}; + /** .inAlignOffset */ std::get<0>(ps), + /** .outAlignOffset */ std::get<1>(ps)}; } }; auto TenGigsParams = GigabyteParams; struct TenGigs { - typedef std::tuple Params; + typedef std::tuple Params; static LinewiseTestParams read(Params ps) { return {/** .tolerance */ 0.00001, /** .workSizeBytes */ 10ULL * 1024ULL * 1024ULL * 1024ULL, /** .seed */ 42ULL, - /** .useVanillaMatrixVectorOp */ std::get<0>(ps), /** .checkCorrectness */ false, - /** .inAlignOffset */ std::get<1>(ps), - /** .outAlignOffset */ std::get<2>(ps)}; + /** .inAlignOffset */ std::get<0>(ps), + /** .outAlignOffset */ std::get<1>(ps)}; } }; diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu index 7042f5b48d..a1001f3816 100644 --- a/cpp/test/matrix/math.cu +++ b/cpp/test/matrix/math.cu @@ -36,7 +36,7 @@ void naivePower(Type* in, Type* out, int len, cudaStream_t stream) static const int TPB = 64; int nblks = raft::ceildiv(len, TPB); nativePowerKernel<<>>(in, out, len); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -52,7 +52,7 @@ void naiveSqrt(Type* in, Type* out, int len) static const int TPB = 64; int nblks = raft::ceildiv(len, TPB); nativeSqrtKernel<<>>(in, out, len); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -89,7 +89,7 @@ template void naiveSignFlip(Type* in, Type* out, int rowCount, int colCount) { naiveSignFlipKernel<<>>(in, out, rowCount, colCount); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } template @@ -176,7 +176,7 @@ class MathTest : public ::testing::TestWithParam> { update_device(out_smallzero_ref.data(), in_small_val_zero_ref_h.data(), 4, stream); setSmallValuesZero(out_smallzero.data(), in_smallzero.data(), 4, stream); setSmallValuesZero(in_smallzero.data(), 4, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu index 6f052f7b46..696ef2dd08 100644 --- a/cpp/test/matrix/matrix.cu +++ b/cpp/test/matrix/matrix.cu @@ -63,7 +63,7 @@ class MatrixTest : public ::testing::TestWithParam> { rmm::device_uvector outTrunc(6, stream); truncZeroOrigin(in1.data(), params.n_row, outTrunc.data(), 3, 2, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: @@ -112,7 +112,7 @@ class MatrixCopyRowsTest : public ::testing::Test { indices(n_selected, handle.get_stream()), output(n_cols * n_selected, handle.get_stream()) { - CUDA_CHECK(cudaStreamCreate(&stream)); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(indices.data(), indices_host, n_selected, stream); // Init input array @@ -121,7 +121,7 @@ class MatrixCopyRowsTest : public ::testing::Test { thrust::copy(handle.get_thrust_policy(), first, first + n_cols * n_rows, ptr); } - void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } + void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } void testCopyRows() { diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp index 5cfcc910fd..7f4dfb8702 100644 --- a/cpp/test/mr/device/buffer.cpp +++ b/cpp/test/mr/device/buffer.cpp @@ -28,7 +28,7 @@ namespace device { TEST(Raft, DeviceBufferAlloc) { cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); // no allocation at construction rmm::device_uvector buff(0, stream); ASSERT_EQ(0, buff.size()); @@ -48,8 +48,8 @@ TEST(Raft, DeviceBufferAlloc) ASSERT_EQ(10, buff.size()); buff.release(); ASSERT_EQ(0, buff.size()); - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaStreamDestroy(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } TEST(Raft, DeviceBufferZeroResize) @@ -64,7 +64,7 @@ TEST(Raft, DeviceBufferZeroResize) rmm::mr::set_current_device_resource(limit_mr.get()); cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); // no allocation at construction rmm::device_uvector buff(10, stream); ASSERT_EQ(10, buff.size()); @@ -83,8 +83,8 @@ TEST(Raft, DeviceBufferZeroResize) rmm::mr::set_current_device_resource(curr_mr); - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaStreamDestroy(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } } // namespace device diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/test/mr/host/buffer.cpp index aadf05285c..d645ffa0e0 100644 --- a/cpp/test/mr/host/buffer.cpp +++ b/cpp/test/mr/host/buffer.cpp @@ -28,7 +28,7 @@ TEST(Raft, HostBuffer) { auto alloc = std::make_shared(); cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); // no allocation at construction buffer buff(alloc, stream); ASSERT_EQ(0, buff.size()); @@ -48,8 +48,8 @@ TEST(Raft, HostBuffer) ASSERT_EQ(10, buff.size()); buff.release(); ASSERT_EQ(0, buff.size()); - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaStreamDestroy(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } TEST(Raft, DeviceToHostBuffer) @@ -57,13 +57,13 @@ TEST(Raft, DeviceToHostBuffer) auto d_alloc = std::make_shared(); auto h_alloc = std::make_shared(); cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); device::buffer d_buff(d_alloc, stream, 32); - CUDA_CHECK(cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream)); buffer h_buff(h_alloc, d_buff); ASSERT_EQ(d_buff.size(), h_buff.size()); - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaStreamDestroy(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } } // namespace host diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu index 90a6d7bd87..88b34cfb85 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/mst.cu @@ -137,15 +137,15 @@ class MSTTest : public ::testing::TestWithParam mst_dst(2 * v - 2, handle.get_stream()); rmm::device_uvector color(v, handle.get_stream()); - CUDA_CHECK(cudaMemsetAsync(mst_src.data(), - std::numeric_limits::max(), - mst_src.size() * sizeof(vertex_t), - handle.get_stream())); - CUDA_CHECK(cudaMemsetAsync(mst_dst.data(), - std::numeric_limits::max(), - mst_dst.size() * sizeof(vertex_t), - handle.get_stream())); - CUDA_CHECK( + RAFT_CUDA_TRY(cudaMemsetAsync(mst_src.data(), + std::numeric_limits::max(), + mst_src.size() * sizeof(vertex_t), + handle.get_stream())); + RAFT_CUDA_TRY(cudaMemsetAsync(mst_dst.data(), + std::numeric_limits::max(), + mst_dst.size() * sizeof(vertex_t), + handle.get_stream())); + RAFT_CUDA_TRY( cudaMemsetAsync(color.data(), 0, color.size() * sizeof(vertex_t), handle.get_stream())); vertex_t* color_ptr = thrust::raw_pointer_cast(color.data()); diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu index 69dc146486..eb5e8c0ae5 100644 --- a/cpp/test/random/rng.cu +++ b/cpp/test/random/rng.cu @@ -91,7 +91,7 @@ class RngTest : public ::testing::TestWithParam> { stats(2, stream) { data.resize(params.len, stream); - CUDA_CHECK(cudaMemsetAsync(stats.data(), 0, 2 * sizeof(T), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(stats.data(), 0, 2 * sizeof(T), stream)); } protected: @@ -119,10 +119,10 @@ class RngTest : public ::testing::TestWithParam> { meanKernel<<>>( stats.data(), data.data(), params.len); update_host(h_stats, stats.data(), 2, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } void getExpectedMeanVar(T meanvar[2]) @@ -375,7 +375,7 @@ TEST(Rng, MeanError) int len = num_samples * num_experiments; cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); rmm::device_uvector data(len, stream); rmm::device_uvector mean_result(num_experiments, stream); @@ -399,7 +399,7 @@ TEST(Rng, MeanError) std::vector h_std_result(num_experiments); update_host(h_mean_result.data(), mean_result.data(), num_experiments, stream); update_host(h_std_result.data(), std_result.data(), num_experiments, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); auto d_mean = quick_mean(h_mean_result); // std-dev of mean; also known as mean error @@ -415,7 +415,7 @@ TEST(Rng, MeanError) ASSERT_TRUE((diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); } - CUDA_CHECK(cudaStreamDestroy(stream)); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); // std::cout << "mean_res:" << h_mean_result << "\n"; } @@ -428,7 +428,7 @@ class ScaledBernoulliTest : public ::testing::Test { protected: void SetUp() override { - CUDA_CHECK(cudaStreamCreate(&stream)); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); Rng r(42); r.scaled_bernoulli(data.data(), len, T(0.5), T(scale), stream); } @@ -464,7 +464,7 @@ class BernoulliTest : public ::testing::Test { { Rng r(42); r.bernoulli(data.data(), len, T(0.5), stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } void trueFalseCheck() @@ -515,7 +515,7 @@ class RngNormalTableTest : public ::testing::TestWithParam <<>>(stats.data(), data.data(), len); update_host(h_stats, stats.data(), 2, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); h_stats[0] /= len; h_stats[1] = (h_stats[1] / len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } void getExpectedMeanVar(T meanvar[2]) diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu index f0331b7746..d5701e1708 100644 --- a/cpp/test/random/rng_int.cu +++ b/cpp/test/random/rng_int.cu @@ -77,7 +77,7 @@ class RngTest : public ::testing::TestWithParam> { stats(2, stream) { data.resize(params.len, stream); - CUDA_CHECK(cudaMemsetAsync(stats.data(), 0, 2 * sizeof(float), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(stats.data(), 0, 2 * sizeof(float), stream)); } protected: @@ -94,10 +94,10 @@ class RngTest : public ::testing::TestWithParam> { meanKernel<<>>( stats.data(), data.data(), params.len); update_host(h_stats, stats.data(), 2, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } void getExpectedMeanVar(float meanvar[2]) diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu index a681bbb07d..710049cbce 100644 --- a/cpp/test/random/sample_without_replacement.cu +++ b/cpp/test/random/sample_without_replacement.cu @@ -77,7 +77,7 @@ class SWoRTest : public ::testing::TestWithParam> { params.len, stream); update_host(&(h_outIdx[0]), outIdx.data(), params.sampledLen, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu index 57e7414861..df138e2bdb 100644 --- a/cpp/test/sparse/connect_components.cu +++ b/cpp/test/sparse/connect_components.cu @@ -127,7 +127,7 @@ class ConnectComponentsTest false, false); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); // The sum of edges for both MST runs should be n_rows - 1 final_edges = output_mst.n_edges + mst_coo.n_edges; diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index dd774c1d79..3b69c9240c 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -68,9 +68,9 @@ TEST_P(SortedCOOToCSR, Result) rmm::device_uvector in(nnz, stream); rmm::device_uvector exp(4, stream); rmm::device_uvector out(4, stream); - CUDA_CHECK(cudaMemsetAsync(in.data(), 0, in.size() * sizeof(int), stream)); - CUDA_CHECK(cudaMemsetAsync(exp.data(), 0, exp.size() * sizeof(int), stream)); - CUDA_CHECK(cudaMemsetAsync(out.data(), 0, out.size() * sizeof(int), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(in.data(), 0, in.size() * sizeof(int), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(exp.data(), 0, exp.size() * sizeof(int), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(out.data(), 0, out.size() * sizeof(int), stream)); raft::update_device(in.data(), in_h, nnz, stream); raft::update_device(exp.data(), exp_h, 4, stream); diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu index 768397e617..c8c593790f 100644 --- a/cpp/test/sparse/csr_row_slice.cu +++ b/cpp/test/sparse/csr_row_slice.cu @@ -98,7 +98,7 @@ class CSRRowSliceTest : public ::testing::TestWithParam out_ref_h = params.out_ref_h; update_device(out_ref.data(), out_ref_h.data(), out_ref_h.size(), stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CHECK_CUDA(cudaStreamSynchronize(stream)); } void SetUp() override { - CUSPARSE_CHECK(cusparseCreate(&handle)); + RAFT_CUSPARSE_TRY(cusparseCreate(&handle)); make_data(); @@ -103,8 +103,8 @@ class CSRToDenseTest : public ::testing::TestWithParam in_rows(5, stream); rmm::device_uvector verify(5, stream); rmm::device_uvector results(5, stream); - CUDA_CHECK(cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream)); - CUDA_CHECK(cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream)); raft::update_device(in_rows.data(), *&in_rows_h, 5, stream); raft::update_device(verify.data(), *&verify_h, 5, stream); @@ -68,7 +68,7 @@ TEST_P(COODegree, Result) ASSERT_TRUE(raft::devArrMatch(verify.data(), results.data(), 5, raft::Compare())); - CUDA_CHECK(cudaStreamDestroy(stream)); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } typedef SparseDegreeTests COODegreeNonzero; @@ -85,9 +85,9 @@ TEST_P(COODegreeNonzero, Result) rmm::device_uvector verify(5, stream); rmm::device_uvector results(5, stream); rmm::device_uvector in_vals(5, stream); - CUDA_CHECK(cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream)); - CUDA_CHECK(cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream)); - CUDA_CHECK(cudaMemsetAsync(in_vals.data(), 0, in_vals.size() * sizeof(float), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(in_vals.data(), 0, in_vals.size() * sizeof(float), stream)); raft::update_device(in_rows.data(), *&in_rows_h, 5, stream); raft::update_device(verify.data(), *&verify_h, 5, stream); @@ -98,7 +98,7 @@ TEST_P(COODegreeNonzero, Result) ASSERT_TRUE(raft::devArrMatch(verify.data(), results.data(), 5, raft::Compare())); - CUDA_CHECK(cudaStreamDestroy(stream)); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree, ::testing::ValuesIn(inputsf)); diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index d0de8705ab..2c8a91b8b8 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -222,7 +222,7 @@ class SparseDistanceCOOSPMVTest run_spmv(); - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); + RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); } void compare() diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index 8538c9cf39..f4f346561c 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -92,7 +92,7 @@ class SparseDistanceTest pairwiseDistance(out_dists.data(), dist_config, params.metric, params.metric_arg); - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); + RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); } void compare() diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu index efa399acdb..77c66e2133 100644 --- a/cpp/test/sparse/filter.cu +++ b/cpp/test/sparse/filter.cu @@ -101,7 +101,7 @@ TEST_P(COORemoveZeros, Result) ASSERT_TRUE(raft::devArrMatch(out_ref.cols(), out.cols(), 2, raft::Compare())); ASSERT_TRUE(raft::devArrMatch(out_ref.vals(), out.vals(), 2, raft::Compare())); - CUDA_CHECK(cudaStreamDestroy(stream)); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); free(out_vals_ref_h); delete[] in_h_rows; diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index d4f57a381f..389e8c4b9c 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -101,7 +101,7 @@ class SparseKNNTest : public ::testing::TestWithParamrows(), out->cols(), out->vals(), out->nnz, sum.data()); sum_h = sum.value(stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } void TearDown() override { delete out; } diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index d506e3e54d..81e6dc4768 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -118,7 +118,7 @@ double compute_rand_index(T* firstClusterArray, // allocating and initializing memory for a and b in the GPU rmm::device_uvector arr_buf(2, stream); - CUDA_CHECK(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream)); // kernel configuration static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16; @@ -133,10 +133,10 @@ double compute_rand_index(T* firstClusterArray, // synchronizing and updating the calculated values of a and b from device to host uint64_t ab_host[2] = {0}; raft::update_host(ab_host, arr_buf.data(), 2, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); // error handling - CUDA_CHECK(cudaGetLastError()); + RAFT_CUDA_TRY(cudaGetLastError()); // denominator uint64_t nChooseTwo = size * (size - 1) / 2; @@ -188,7 +188,7 @@ class LinkageTest : public ::testing::TestWithParam> { params.c, params.n_clusters); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); score = compute_rand_index(labels.data(), labels_ref.data(), params.n_row, stream); } diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu index b2658f37ca..0a0864ce15 100644 --- a/cpp/test/sparse/sort.cu +++ b/cpp/test/sparse/sort.cu @@ -51,7 +51,7 @@ TEST_P(COOSort, Result) params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); rmm::device_uvector in_rows(params.nnz, stream); rmm::device_uvector in_cols(params.nnz, stream); @@ -85,7 +85,7 @@ TEST_P(COOSort, Result) delete[] in_cols_h; delete[] verify_h; - CUDA_CHECK(cudaStreamDestroy(stream)); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } INSTANTIATE_TEST_CASE_P(SparseSortTest, COOSort, ::testing::ValuesIn(inputsf)); diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index b9fc868cf0..9c766d2d05 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -111,7 +111,7 @@ class SparseSymmetrizeTest out.rows(), out.cols(), out.vals(), out.nnz, sum.data()); sum_h = sum.value(stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: @@ -173,7 +173,7 @@ TEST_P(COOSymmetrize, Result) [] __device__(int row, int col, float val, float trans) { return val + trans; }, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); std::cout << out << std::endl; ASSERT_TRUE(out.nnz == nnz * 2); diff --git a/cpp/test/spatial/ball_cover.cu b/cpp/test/spatial/ball_cover.cu index 0a1680badc..00f83254c3 100644 --- a/cpp/test/spatial/ball_cover.cu +++ b/cpp/test/spatial/ball_cover.cu @@ -176,7 +176,7 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam { d_ref_D.data(), d_ref_I.data()); - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); + RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); // Allocate predicted arrays rmm::device_uvector d_pred_I(n * k, handle.get_stream()); @@ -188,7 +188,7 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam { raft::spatial::knn::rbc_knn_query( handle, index, k, d_train_inputs.data(), n, d_pred_I.data(), d_pred_D.data(), true, weight); - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); + RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); // What we really want are for the distances to match exactly. The // indices may or may not match exactly, depending upon the ordering which // can be nondeterministic. @@ -273,7 +273,7 @@ class BallCoverAllKNNTest : public ::testing::TestWithParam { translations, metric); - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); + RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); // Allocate predicted arrays rmm::device_uvector d_pred_I(n * k, handle.get_stream()); @@ -284,7 +284,7 @@ class BallCoverAllKNNTest : public ::testing::TestWithParam { raft::spatial::knn::rbc_all_knn_query( handle, index, k, d_pred_I.data(), d_pred_D.data(), true, weight); - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); + RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); // What we really want are for the distances to match exactly. The // indices may or may not match exactly, depending upon the ordering which // can be nondeterministic. diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu index e48a3c6657..078d5e0eec 100644 --- a/cpp/test/spatial/fused_l2_knn.cu +++ b/cpp/test/spatial/fused_l2_knn.cu @@ -77,7 +77,7 @@ testing::AssertionResult devArrMatchKnnPair(const T* expected_idx, raft::update_host(act_idx_h.get(), actual_idx, size, stream); raft::update_host(exp_dist_h.get(), expected_dist, size, stream); raft::update_host(act_dist_h.get(), actual_dist, size, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); for (size_t i(0); i < rows; ++i) { for (size_t j(0); j < cols; ++j) { auto idx = i * cols + j; // row major assumption! @@ -168,7 +168,7 @@ class FusedL2KNNTest : public ::testing::TestWithParam { gpu_res.noTempMemory(); int device; - CUDA_CHECK(cudaGetDevice(&device)); + RAFT_CUDA_TRY(cudaGetDevice(&device)); gpu_res.setDefaultStream(device, handle_.get_stream()); faiss::gpu::GpuDistanceParams args; diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index bff7665f83..e268dc0c55 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -94,7 +94,7 @@ class HaversineKNNTest : public ::testing::Test { k, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } void SetUp() override { basicTest(); } diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 49e5aaab4b..8ab33745f3 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -120,16 +120,16 @@ class KNNTest : public ::testing::TestWithParam { distances_.resize(rows_ * k_, stream); search_labels_.resize(rows_, stream); - CUDA_CHECK( + RAFT_CUDA_TRY( cudaMemsetAsync(actual_labels_.data(), 0, actual_labels_.size() * sizeof(int), stream)); - CUDA_CHECK( + RAFT_CUDA_TRY( cudaMemsetAsync(expected_labels_.data(), 0, expected_labels_.size() * sizeof(int), stream)); - CUDA_CHECK(cudaMemsetAsync(input_.data(), 0, input_.size() * sizeof(float), stream)); - CUDA_CHECK( + RAFT_CUDA_TRY(cudaMemsetAsync(input_.data(), 0, input_.size() * sizeof(float), stream)); + RAFT_CUDA_TRY( cudaMemsetAsync(search_data_.data(), 0, search_data_.size() * sizeof(float), stream)); - CUDA_CHECK(cudaMemsetAsync(indices_.data(), 0, indices_.size() * sizeof(int64_t), stream)); - CUDA_CHECK(cudaMemsetAsync(distances_.data(), 0, distances_.size() * sizeof(float), stream)); - CUDA_CHECK( + RAFT_CUDA_TRY(cudaMemsetAsync(indices_.data(), 0, indices_.size() * sizeof(int64_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(distances_.data(), 0, distances_.size() * sizeof(float), stream)); + RAFT_CUDA_TRY( cudaMemsetAsync(search_labels_.data(), 0, search_labels_.size() * sizeof(int), stream)); std::vector row_major_input; @@ -149,7 +149,7 @@ class KNNTest : public ::testing::TestWithParam { raft::copy(input_.data(), input_ptr, rows_ * cols_, stream); raft::copy(search_data_.data(), input_ptr, rows_ * cols_, stream); raft::copy(search_labels_.data(), labels_ptr, rows_, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } private: diff --git a/cpp/test/spatial/selection.cu b/cpp/test/spatial/selection.cu index 69f6a47978..5069b4f256 100644 --- a/cpp/test/spatial/selection.cu +++ b/cpp/test/spatial/selection.cu @@ -108,7 +108,7 @@ class SparseSelectionTest k, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } void compare() diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu index 6a76a289d7..8f2e2ecef1 100644 --- a/cpp/test/stats/mean_center.cu +++ b/cpp/test/stats/mean_center.cu @@ -78,7 +78,7 @@ class MeanCenterTest : public ::testing::TestWithParam> { vars_act.resize(cols, stream); r.normal(data.data(), len, params.mean, params.stddev, stream); stdVarSGtest(data.data(), stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } void stdVarSGtest(T* data, cudaStream_t stream) diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index ecb1171ea5..82766f6109 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -62,7 +62,7 @@ class SumTest : public ::testing::TestWithParam> { raft::update_device(data.data(), data_h, len, stream); sum(sum_act.data(), data.data(), cols, rows, false, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: diff --git a/cpp/test/test_utils.h b/cpp/test/test_utils.h index 58b9ae42ae..f2573f132b 100644 --- a/cpp/test/test_utils.h +++ b/cpp/test/test_utils.h @@ -91,7 +91,7 @@ testing::AssertionResult devArrMatch( std::unique_ptr act_h(new T[size]); raft::update_host(exp_h.get(), expected, size, stream); raft::update_host(act_h.get(), actual, size, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); for (size_t i(0); i < size; ++i) { auto exp = exp_h.get()[i]; auto act = act_h.get()[i]; @@ -108,7 +108,7 @@ testing::AssertionResult devArrMatch( { std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); for (size_t i(0); i < size; ++i) { auto act = act_h.get()[i]; if (!eq_compare(expected, act)) { @@ -132,7 +132,7 @@ testing::AssertionResult devArrMatch(const T* expected, std::unique_ptr act_h(new T[size]); raft::update_host(exp_h.get(), expected, size, stream); raft::update_host(act_h.get(), actual, size, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); for (size_t i(0); i < rows; ++i) { for (size_t j(0); j < cols; ++j) { auto idx = i * cols + j; // row major assumption! @@ -154,7 +154,7 @@ testing::AssertionResult devArrMatch( size_t size = rows * cols; std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); for (size_t i(0); i < rows; ++i) { for (size_t j(0); j < cols; ++j) { auto idx = i * cols + j; // row major assumption! @@ -185,7 +185,7 @@ testing::AssertionResult devArrMatchHost( { std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual_d, size, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); bool ok = true; auto fail = testing::AssertionFailure(); for (size_t i(0); i < size; ++i) { @@ -217,7 +217,7 @@ testing::AssertionResult diagonalMatch( size_t size = rows * cols; std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); for (size_t i(0); i < rows; ++i) { for (size_t j(0); j < cols; ++j) { if (i != j) continue; @@ -244,20 +244,20 @@ testing::AssertionResult match(const T expected, T actual, L eq_compare) /** @} */ /** time the function call 'func' using cuda events */ -#define TIMEIT_LOOP(ms, count, func) \ - do { \ - cudaEvent_t start, stop; \ - CUDA_CHECK(cudaEventCreate(&start)); \ - CUDA_CHECK(cudaEventCreate(&stop)); \ - CUDA_CHECK(cudaEventRecord(start)); \ - for (int i = 0; i < count; ++i) { \ - func; \ - } \ - CUDA_CHECK(cudaEventRecord(stop)); \ - CUDA_CHECK(cudaEventSynchronize(stop)); \ - ms = 0.f; \ - CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop)); \ - ms /= args.runs; \ +#define TIMEIT_LOOP(ms, count, func) \ + do { \ + cudaEvent_t start, stop; \ + RAFT_CUDA_TRY(cudaEventCreate(&start)); \ + RAFT_CUDA_TRY(cudaEventCreate(&stop)); \ + RAFT_CUDA_TRY(cudaEventRecord(start)); \ + for (int i = 0; i < count; ++i) { \ + func; \ + } \ + RAFT_CUDA_TRY(cudaEventRecord(stop)); \ + RAFT_CUDA_TRY(cudaEventSynchronize(stop)); \ + ms = 0.f; \ + RAFT_CUDA_TRY(cudaEventElapsedTime(&ms, start, stop)); \ + ms /= args.runs; \ } while (0) inline std::vector read_csv(std::string filename, bool skip_first_n_columns = 1) diff --git a/python/setup.py b/python/setup.py index b10ca783b0..f5b1e8bace 100644 --- a/python/setup.py +++ b/python/setup.py @@ -133,8 +133,13 @@ def remove_flags(compiler, *flags): ) except Exception: pass + # Full optimization self.compiler.compiler_so.append("-O3") + + # Ignore deprecation declaration warnings + self.compiler.compiler_so.append("-Wno-deprecated-declarations") + # No debug symbols, full optimization, no '-Wstrict-prototypes' warning remove_flags( self.compiler, "-g", "-G", "-O1", "-O2", "-Wstrict-prototypes"