Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add error check utilities #15

Merged
merged 30 commits into from
Jun 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
a928468
copy error.hpp from cuDF, add license statement, and initial update
seunghwak Jun 3, 2020
328462f
add CUML_EXPECTS, CUML_FAIL, CUGRAPH_EXPECTS, and CUGRAPH_FAIL
seunghwak Jun 3, 2020
187e12a
add NCCL_TRY
seunghwak Jun 3, 2020
4ce8f37
fix compile/clang-tidy errors
seunghwak Jun 3, 2020
086abd3
fix an error in a comment
seunghwak Jun 3, 2020
4f72257
add CUSPARSE_TRY
seunghwak Jun 3, 2020
a428c6e
add CURAND_TRY
seunghwak Jun 3, 2020
b9cee2b
address clang-tidy warnings
seunghwak Jun 3, 2020
b373267
update change log
seunghwak Jun 3, 2020
54bdc8e
resolve error conflicts
seunghwak Jun 3, 2020
e471f1d
clang-format fixes
seunghwak Jun 3, 2020
035dc00
another try to make clang-format happy
seunghwak Jun 3, 2020
f8455b9
Merge branch 'branch-0.15' of github.com:rapidsai/raft into fea_ext_e…
seunghwak Jun 5, 2020
2566b24
move common error handling utilities from cuda_utils.h to error.hpp
seunghwak Jun 10, 2020
3656125
update raft error classes to inherit raft::exception (instead of std:…
seunghwak Jun 10, 2020
0e62cea
move macros out from the raft namespace
seunghwak Jun 10, 2020
55922cb
remove CUML(GRAPH)_EXPECTS(FAIL)
seunghwak Jun 11, 2020
acd5824
update RAFT_EXPECTS and RAFT_FAIL
seunghwak Jun 11, 2020
4a48b57
compile error fix (namespace)
seunghwak Jun 11, 2020
059f1ec
minor fixes to RAFT_EXPECTS(FAIL)
seunghwak Jun 11, 2020
125911c
move error check macros from error.hpp to relevant headers
seunghwak Jun 11, 2020
d3192f4
clang-format
seunghwak Jun 11, 2020
ec0cf97
cosmetic updates
seunghwak Jun 11, 2020
f8f8d32
cosmetic updates
seunghwak Jun 11, 2020
c3f153d
stifle some warnings
seunghwak Jun 11, 2020
85c9b7d
clang-format error
seunghwak Jun 11, 2020
6d9e392
fix unused location_prefix in error handling macro
seunghwak Jun 16, 2020
4ebc0af
remove NCCL_CHECK (replaced with NCCL_TRY)
seunghwak Jun 16, 2020
851b401
clang-format
seunghwak Jun 16, 2020
07a51a4
another clang format
seunghwak Jun 16, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## New Features
- PR #7: Migrating cuml comms -> raft comms_t
- PR #15: add exception based error handling macros

## Improvements
- PR #13: Add RMM_INCLUDE and RMM_LIBRARY options to allow linking to non-conda RMM
Expand Down
83 changes: 52 additions & 31 deletions cpp/include/raft/comms/std_comms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,18 +44,42 @@
#include <cuda_runtime.h>

#include <raft/cudart_utils.h>
#include <raft/error.hpp>

#define NCCL_CHECK(call) \
do { \
ncclResult_t status = call; \
ASSERT(ncclSuccess == status, "ERROR: NCCL call='%s'. Reason:%s\n", #call, \
ncclGetErrorString(status)); \
} while (0)
namespace raft {

/**
 * @brief Exception type raised when an NCCL call fails.
 *
 * Carries the formatted NCCL failure message so callers can distinguish
 * NCCL problems from other raft::exception subclasses.
 */
struct nccl_error : public raft::exception {
  explicit nccl_error(const char *message) : raft::exception(message) {}
  explicit nccl_error(const std::string &message) : raft::exception(message) {}
};

}  // namespace raft

/**
 * @brief Error checking macro for NCCL runtime API functions.
 *
 * Invokes a NCCL runtime API function call; if the call does not return
 * ncclSuccess, throws a raft::nccl_error exception detailing the NCCL error
 * that occurred.
 *
 * Note: expands to a do-while(0) statement so the caller supplies the
 * terminating semicolon — no semicolon after `while (0)`, otherwise the
 * macro becomes two statements and breaks braceless if/else usage.
 */
#define NCCL_TRY(call)                                                        \
  do {                                                                        \
    ncclResult_t const status = (call);                                       \
    if (ncclSuccess != status) {                                              \
      std::string msg{};                                                      \
      SET_ERROR_MSG(msg,                                                      \
                    "NCCL error encountered at: ", "call='%s', Reason=%d:%s", \
                    #call, status, ncclGetErrorString(status));               \
      throw raft::nccl_error(msg);                                            \
    }                                                                         \
  } while (0)

#define NCCL_CHECK_NO_THROW(call) \
do { \
ncclResult_t status = call; \
if (status != ncclSuccess) { \
if (ncclSuccess != status) { \
printf("NCCL call='%s' failed. Reason:%s\n", #call, \
ncclGetErrorString(status)); \
} \
Expand All @@ -65,8 +89,6 @@ namespace raft {
namespace comms {

static size_t get_datatype_size(const datatype_t datatype) {
size_t ret = -1;

switch (datatype) {
case datatype_t::CHAR:
return sizeof(char);
Expand All @@ -85,7 +107,7 @@ static size_t get_datatype_size(const datatype_t datatype) {
case datatype_t::FLOAT64:
return sizeof(double);
default:
throw "Unsupported";
RAFT_FAIL("Unsupported datatype.");
}
}

Expand Down Expand Up @@ -145,13 +167,13 @@ class std_comms : public comms_iface {
const std::shared_ptr<mr::device::allocator> device_allocator,
cudaStream_t stream)
: nccl_comm_(nccl_comm),
ucp_worker_(ucp_worker),
ucp_eps_(eps),
stream_(stream),
num_ranks_(num_ranks),
rank_(rank),
device_allocator_(device_allocator),
stream_(stream),
next_request_id_(0) {
ucp_worker_(ucp_worker),
ucp_eps_(eps),
next_request_id_(0),
device_allocator_(device_allocator) {
initialize();
};

Expand All @@ -165,10 +187,10 @@ class std_comms : public comms_iface {
const std::shared_ptr<mr::device::allocator> device_allocator,
cudaStream_t stream)
: nccl_comm_(nccl_comm),
stream_(stream),
num_ranks_(num_ranks),
rank_(rank),
device_allocator_(device_allocator),
stream_(stream) {
device_allocator_(device_allocator) {
initialize();
};

Expand Down Expand Up @@ -324,29 +346,28 @@ class std_comms : public comms_iface {

void allreduce(const void *sendbuff, void *recvbuff, size_t count,
datatype_t datatype, op_t op, cudaStream_t stream) const {
NCCL_CHECK(ncclAllReduce(sendbuff, recvbuff, count,
get_nccl_datatype(datatype), get_nccl_op(op),
nccl_comm_, stream));
NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count,
get_nccl_datatype(datatype), get_nccl_op(op),
nccl_comm_, stream));
}

void bcast(void *buff, size_t count, datatype_t datatype, int root,
cudaStream_t stream) const {
NCCL_CHECK(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype),
root, nccl_comm_, stream));
NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root,
nccl_comm_, stream));
}

void reduce(const void *sendbuff, void *recvbuff, size_t count,
datatype_t datatype, op_t op, int root,
cudaStream_t stream) const {
NCCL_CHECK(ncclReduce(sendbuff, recvbuff, count,
get_nccl_datatype(datatype), get_nccl_op(op), root,
nccl_comm_, stream));
NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype),
get_nccl_op(op), root, nccl_comm_, stream));
}

void allgather(const void *sendbuff, void *recvbuff, size_t sendcount,
datatype_t datatype, cudaStream_t stream) const {
NCCL_CHECK(ncclAllGather(sendbuff, recvbuff, sendcount,
get_nccl_datatype(datatype), nccl_comm_, stream));
NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount,
get_nccl_datatype(datatype), nccl_comm_, stream));
}

void allgatherv(const void *sendbuf, void *recvbuf, const size_t recvcounts[],
Expand All @@ -356,7 +377,7 @@ class std_comms : public comms_iface {
//Listing 1 on page 4.
for (int root = 0; root < num_ranks_; ++root) {
size_t dtype_size = get_datatype_size(datatype);
NCCL_CHECK(ncclBroadcast(
NCCL_TRY(ncclBroadcast(
sendbuf, static_cast<char *>(recvbuf) + displs[root] * dtype_size,
recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_,
stream));
Expand All @@ -365,9 +386,9 @@ class std_comms : public comms_iface {

void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount,
datatype_t datatype, op_t op, cudaStream_t stream) const {
NCCL_CHECK(ncclReduceScatter(sendbuff, recvbuff, recvcount,
get_nccl_datatype(datatype), get_nccl_op(op),
nccl_comm_, stream));
NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount,
get_nccl_datatype(datatype), get_nccl_op(op),
nccl_comm_, stream));
}

status_t sync_stream(cudaStream_t stream) const {
Expand Down
136 changes: 56 additions & 80 deletions cpp/include/raft/cudart_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,96 +16,70 @@

#pragma once

#include <raft/error.hpp>

#include <cuda_runtime.h>

#include <execinfo.h>
#include <chrono>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>

///@todo: enable once logging has been enabled in raft
//#include "logger.hpp"

namespace raft {

/** base exception class for the whole of raft */
class exception : public std::exception {
public:
/** default ctor */
explicit exception() noexcept : std::exception(), msg_() {}

/** copy ctor */
exception(const exception& src) noexcept
: std::exception(), msg_(src.what()) {
collect_call_stack();
}

/** ctor from an input message */
explicit exception(const std::string _msg) noexcept
: std::exception(), msg_(std::move(_msg)) {
collect_call_stack();
}

/** get the message associated with this exception */
const char* what() const noexcept override { return msg_.c_str(); }

private:
/** message associated with this exception */
std::string msg_;

/** append call stack info to this exception's message for ease of debug */
// Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html
void collect_call_stack() noexcept {
#ifdef __GNUC__
constexpr int kMaxStackDepth = 64;
void* stack[kMaxStackDepth]; // NOLINT
auto depth = backtrace(stack, kMaxStackDepth);
std::ostringstream oss;
oss << std::endl << "Obtained " << depth << " stack frames" << std::endl;
char** strings = backtrace_symbols(stack, depth);
if (strings == nullptr) {
oss << "But no stack trace could be found!" << std::endl;
msg_ += oss.str();
return;
}
///@todo: support for demangling of C++ symbol names
for (int i = 0; i < depth; ++i) {
oss << "#" << i << " in " << strings[i] << std::endl;
}
free(strings);
msg_ += oss.str();
#endif // __GNUC__
}
/**
 * @brief Exception type raised when a CUDA runtime call fails.
 *
 * Carries the formatted CUDA failure message so callers can distinguish
 * CUDA problems from other raft::exception subclasses.
 */
struct cuda_error : public raft::exception {
  explicit cuda_error(const char* message) : raft::exception(message) {}
  explicit cuda_error(const std::string& message) : raft::exception(message) {}
};

/** macro to throw a runtime error */
#define THROW(fmt, ...) \
do { \
std::string msg; \
char errMsg[2048]; /* NOLINT */ \
std::snprintf(errMsg, sizeof(errMsg), \
"exception occured! file=%s line=%d: ", __FILE__, __LINE__); \
msg += errMsg; \
std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \
msg += errMsg; \
throw raft::exception(msg); \
} while (0)
} // namespace raft

/** macro to check for a conditional and assert on failure */
#define ASSERT(check, fmt, ...) \
do { \
if (!(check)) THROW(fmt, ##__VA_ARGS__); \
/**
 * @brief Error checking macro for CUDA runtime API functions.
 *
 * Invokes a CUDA runtime API function call; if the call does not return
 * cudaSuccess, invokes cudaGetLastError() to clear the sticky error state
 * and throws a raft::cuda_error exception detailing the CUDA error that
 * occurred.
 */
#define CUDA_TRY(call)                                                        \
  do {                                                                        \
    /* parenthesize the argument so comma/operator expressions expand safely, \
       matching NCCL_TRY */                                                   \
    cudaError_t const status = (call);                                        \
    if (status != cudaSuccess) {                                              \
      cudaGetLastError();                                                     \
      std::string msg{};                                                      \
      SET_ERROR_MSG(                                                          \
        msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", #call, \
        cudaGetErrorName(status), cudaGetErrorString(status));                \
      throw raft::cuda_error(msg);                                            \
    }                                                                         \
  } while (0)

/** check for cuda runtime API errors and assert accordingly */
#define CUDA_CHECK(call) \
do { \
cudaError_t status = call; \
ASSERT(status == cudaSuccess, "FAIL: call='%s'. Reason:%s", #call, \
cudaGetErrorString(status)); \
} while (0)
/**
 * @brief Debug macro to check for CUDA errors
 *
 * In a non-release build, this macro will synchronize the specified stream
 * before error checking. In both release and non-release builds, this macro
 * checks for any pending CUDA errors from previous calls. If an error is
 * reported, an exception is thrown detailing the CUDA error that occurred.
 *
 * The intent of this macro is to provide a mechanism for synchronous and
 * deterministic execution for debugging asynchronous CUDA execution. It should
 * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an
 * asynchronous kernel launch.
 *
 * No trailing semicolon in the expansion: CUDA_TRY already expands to a
 * single do-while(0) statement, so the caller's semicolon terminates it; an
 * embedded semicolon would produce two statements and break braceless
 * if/else usage.
 */
#ifndef NDEBUG
#define CHECK_CUDA(stream) CUDA_TRY(cudaStreamSynchronize(stream))
#else
#define CHECK_CUDA(stream) CUDA_TRY(cudaPeekAtLastError())
#endif

/**
 * FIXME: temporary alias kept for cuML source compatibility — forwards
 * directly to CUDA_TRY; prefer CUDA_TRY in new code and remove this once
 * downstream callers have migrated.
 */
#define CUDA_CHECK(call) CUDA_TRY(call)

///@todo: enable this only after we have added logging support in raft
// /**
Expand All @@ -114,13 +88,15 @@ class exception : public std::exception {
// */
#define CUDA_CHECK_NO_THROW(call) \
do { \
cudaError_t status = call; \
if (status != cudaSuccess) { \
cudaError_t const status = call; \
if (cudaSuccess != status) { \
printf("CUDA call='%s' at file=%s line=%d failed with %s\n", #call, \
__FILE__, __LINE__, cudaGetErrorString(status)); \
} \
} while (0)

namespace raft {

/** helper method to get max usable shared mem per block parameter */
inline int get_shared_memory_per_block() {
int dev_id;
Expand Down Expand Up @@ -211,4 +187,4 @@ void print_device_vector(const char* variable_name, const T* devMem,
}
/** @} */

}; // namespace raft
} // namespace raft
Loading