Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add error check utilities #15

Merged
merged 30 commits into from
Jun 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
a928468
copy error.hpp from cuDF, add license statement, and initial update
seunghwak Jun 3, 2020
328462f
add CUML_EXPECTS, CUML_FAIL, CUGRAPH_EXPECTS, and CUGRAPH_FAIL
seunghwak Jun 3, 2020
187e12a
add NCCL_TRY
seunghwak Jun 3, 2020
4ce8f37
fix compile/clang-tidy errors
seunghwak Jun 3, 2020
086abd3
fix an error in a comment
seunghwak Jun 3, 2020
4f72257
add CUSPARSE_TRY
seunghwak Jun 3, 2020
a428c6e
add CURAND_TRY
seunghwak Jun 3, 2020
b9cee2b
address clang-tidy warnings
seunghwak Jun 3, 2020
b373267
update change log
seunghwak Jun 3, 2020
54bdc8e
resolve error conflicts
seunghwak Jun 3, 2020
e471f1d
clang-format fixes
seunghwak Jun 3, 2020
035dc00
another try to make clang-format happy
seunghwak Jun 3, 2020
f8455b9
Merge branch 'branch-0.15' of github.com:rapidsai/raft into fea_ext_e…
seunghwak Jun 5, 2020
2566b24
move common error handling utilities from cuda_utils.h to error.hpp
seunghwak Jun 10, 2020
3656125
update raft error classes to inherit raft::exception (instead of std:…
seunghwak Jun 10, 2020
0e62cea
move macros out from the raft namespace
seunghwak Jun 10, 2020
55922cb
remove CUML(GRAPH)_EXPECTS(FAIL)
seunghwak Jun 11, 2020
acd5824
update RAFT_EXPECTS and RAFT_FAIL
seunghwak Jun 11, 2020
4a48b57
compile error fix (namespace)
seunghwak Jun 11, 2020
059f1ec
minor fixes to RAFT_EXPECTS(FAIL)
seunghwak Jun 11, 2020
125911c
move error check macros from error.hpp to relevant headers
seunghwak Jun 11, 2020
d3192f4
clang-format
seunghwak Jun 11, 2020
ec0cf97
cosmetic updates
seunghwak Jun 11, 2020
f8f8d32
cosmetic updates
seunghwak Jun 11, 2020
c3f153d
stifle some warnings
seunghwak Jun 11, 2020
85c9b7d
clang-format error
seunghwak Jun 11, 2020
6d9e392
fix unused location_prefix in error handling macro
seunghwak Jun 16, 2020
4ebc0af
remove NCCL_CHECK (replaced with NCCL_TRY)
seunghwak Jun 16, 2020
851b401
clang-format
seunghwak Jun 16, 2020
07a51a4
another clang format
seunghwak Jun 16, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## New Features
- PR #7: Migrating cuml comms -> raft comms_t
- PR #15: add exception based error handling macros

## Improvements
- PR #13: Add RMM_INCLUDE and RMM_LIBRARY options to allow linking to non-conda RMM
Expand Down
83 changes: 52 additions & 31 deletions cpp/include/raft/comms/std_comms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,18 +44,42 @@
#include <cuda_runtime.h>

#include <raft/cudart_utils.h>
#include <raft/error.hpp>

#define NCCL_CHECK(call) \
do { \
ncclResult_t status = call; \
ASSERT(ncclSuccess == status, "ERROR: NCCL call='%s'. Reason:%s\n", #call, \
ncclGetErrorString(status)); \
} while (0)
namespace raft {

/**
 * @brief Exception type raised when an NCCL call fails.
 *
 * Carries the formatted NCCL failure message so callers can distinguish
 * NCCL problems from other raft::exception subclasses.
 */
struct nccl_error : public raft::exception {
  explicit nccl_error(const char *message) : raft::exception(message) {}
  explicit nccl_error(const std::string &message) : raft::exception(message) {}
};

}  // namespace raft

/**
 * @brief Error checking macro for NCCL runtime API functions.
 *
 * Invokes a NCCL runtime API function call; if the call does not return
 * ncclSuccess, throws a raft::nccl_error exception detailing the NCCL error
 * that occurred.
 *
 * Note: expands to a do-while(0) statement so the caller supplies the
 * terminating semicolon — no semicolon after `while (0)`, otherwise the
 * macro becomes two statements and breaks braceless if/else usage.
 */
#define NCCL_TRY(call)                                                        \
  do {                                                                        \
    ncclResult_t const status = (call);                                       \
    if (ncclSuccess != status) {                                              \
      std::string msg{};                                                      \
      SET_ERROR_MSG(msg,                                                      \
                    "NCCL error encountered at: ", "call='%s', Reason=%d:%s", \
                    #call, status, ncclGetErrorString(status));               \
      throw raft::nccl_error(msg);                                            \
    }                                                                         \
  } while (0)

#define NCCL_CHECK_NO_THROW(call) \
do { \
ncclResult_t status = call; \
if (status != ncclSuccess) { \
if (ncclSuccess != status) { \
printf("NCCL call='%s' failed. Reason:%s\n", #call, \
ncclGetErrorString(status)); \
} \
Expand All @@ -65,8 +89,6 @@ namespace raft {
namespace comms {

static size_t get_datatype_size(const datatype_t datatype) {
size_t ret = -1;

switch (datatype) {
case datatype_t::CHAR:
return sizeof(char);
Expand All @@ -85,7 +107,7 @@ static size_t get_datatype_size(const datatype_t datatype) {
case datatype_t::FLOAT64:
return sizeof(double);
default:
throw "Unsupported";
RAFT_FAIL("Unsupported datatype.");
}
}

Expand Down Expand Up @@ -145,13 +167,13 @@ class std_comms : public comms_iface {
const std::shared_ptr<mr::device::allocator> device_allocator,
cudaStream_t stream)
: nccl_comm_(nccl_comm),
ucp_worker_(ucp_worker),
ucp_eps_(eps),
stream_(stream),
num_ranks_(num_ranks),
rank_(rank),
device_allocator_(device_allocator),
stream_(stream),
next_request_id_(0) {
ucp_worker_(ucp_worker),
ucp_eps_(eps),
next_request_id_(0),
device_allocator_(device_allocator) {
initialize();
};

Expand All @@ -165,10 +187,10 @@ class std_comms : public comms_iface {
const std::shared_ptr<mr::device::allocator> device_allocator,
cudaStream_t stream)
: nccl_comm_(nccl_comm),
stream_(stream),
num_ranks_(num_ranks),
rank_(rank),
device_allocator_(device_allocator),
stream_(stream) {
device_allocator_(device_allocator) {
initialize();
};

Expand Down Expand Up @@ -324,29 +346,28 @@ class std_comms : public comms_iface {

void allreduce(const void *sendbuff, void *recvbuff, size_t count,
datatype_t datatype, op_t op, cudaStream_t stream) const {
NCCL_CHECK(ncclAllReduce(sendbuff, recvbuff, count,
get_nccl_datatype(datatype), get_nccl_op(op),
nccl_comm_, stream));
NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count,
get_nccl_datatype(datatype), get_nccl_op(op),
nccl_comm_, stream));
}

void bcast(void *buff, size_t count, datatype_t datatype, int root,
cudaStream_t stream) const {
NCCL_CHECK(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype),
root, nccl_comm_, stream));
NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root,
nccl_comm_, stream));
}

void reduce(const void *sendbuff, void *recvbuff, size_t count,
datatype_t datatype, op_t op, int root,
cudaStream_t stream) const {
NCCL_CHECK(ncclReduce(sendbuff, recvbuff, count,
get_nccl_datatype(datatype), get_nccl_op(op), root,
nccl_comm_, stream));
NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype),
get_nccl_op(op), root, nccl_comm_, stream));
}

void allgather(const void *sendbuff, void *recvbuff, size_t sendcount,
datatype_t datatype, cudaStream_t stream) const {
NCCL_CHECK(ncclAllGather(sendbuff, recvbuff, sendcount,
get_nccl_datatype(datatype), nccl_comm_, stream));
NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount,
get_nccl_datatype(datatype), nccl_comm_, stream));
}

void allgatherv(const void *sendbuf, void *recvbuf, const size_t recvcounts[],
Expand All @@ -356,7 +377,7 @@ class std_comms : public comms_iface {
//Listing 1 on page 4.
for (int root = 0; root < num_ranks_; ++root) {
size_t dtype_size = get_datatype_size(datatype);
NCCL_CHECK(ncclBroadcast(
NCCL_TRY(ncclBroadcast(
sendbuf, static_cast<char *>(recvbuf) + displs[root] * dtype_size,
recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_,
stream));
Expand All @@ -365,9 +386,9 @@ class std_comms : public comms_iface {

void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount,
datatype_t datatype, op_t op, cudaStream_t stream) const {
NCCL_CHECK(ncclReduceScatter(sendbuff, recvbuff, recvcount,
get_nccl_datatype(datatype), get_nccl_op(op),
nccl_comm_, stream));
NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount,
get_nccl_datatype(datatype), get_nccl_op(op),
nccl_comm_, stream));
}

status_t sync_stream(cudaStream_t stream) const {
Expand Down
136 changes: 56 additions & 80 deletions cpp/include/raft/cudart_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,96 +16,70 @@

#pragma once

#include <raft/error.hpp>

#include <cuda_runtime.h>

#include <execinfo.h>
#include <chrono>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>

///@todo: enable once logging has been enabled in raft
//#include "logger.hpp"

namespace raft {

/** base exception class for the whole of raft */
class exception : public std::exception {
public:
/** default ctor */
explicit exception() noexcept : std::exception(), msg_() {}

/** copy ctor */
exception(const exception& src) noexcept
: std::exception(), msg_(src.what()) {
collect_call_stack();
}

/** ctor from an input message */
explicit exception(const std::string _msg) noexcept
: std::exception(), msg_(std::move(_msg)) {
collect_call_stack();
}

/** get the message associated with this exception */
const char* what() const noexcept override { return msg_.c_str(); }

private:
/** message associated with this exception */
std::string msg_;

/** append call stack info to this exception's message for ease of debug */
// Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html
void collect_call_stack() noexcept {
#ifdef __GNUC__
constexpr int kMaxStackDepth = 64;
void* stack[kMaxStackDepth]; // NOLINT
auto depth = backtrace(stack, kMaxStackDepth);
std::ostringstream oss;
oss << std::endl << "Obtained " << depth << " stack frames" << std::endl;
char** strings = backtrace_symbols(stack, depth);
if (strings == nullptr) {
oss << "But no stack trace could be found!" << std::endl;
msg_ += oss.str();
return;
}
///@todo: support for demangling of C++ symbol names
for (int i = 0; i < depth; ++i) {
oss << "#" << i << " in " << strings[i] << std::endl;
}
free(strings);
msg_ += oss.str();
#endif // __GNUC__
}
/**
 * @brief Exception type raised when a CUDA runtime call fails.
 *
 * Carries the formatted CUDA failure message so callers can distinguish
 * CUDA problems from other raft::exception subclasses.
 */
struct cuda_error : public raft::exception {
  explicit cuda_error(const char* message) : raft::exception(message) {}
  explicit cuda_error(const std::string& message) : raft::exception(message) {}
};

/** macro to throw a runtime error */
#define THROW(fmt, ...) \
do { \
std::string msg; \
char errMsg[2048]; /* NOLINT */ \
std::snprintf(errMsg, sizeof(errMsg), \
"exception occured! file=%s line=%d: ", __FILE__, __LINE__); \
msg += errMsg; \
std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \
msg += errMsg; \
throw raft::exception(msg); \
} while (0)
} // namespace raft

/** macro to check for a conditional and assert on failure */
#define ASSERT(check, fmt, ...) \
do { \
if (!(check)) THROW(fmt, ##__VA_ARGS__); \
/**
 * @brief Error checking macro for CUDA runtime API functions.
 *
 * Invokes a CUDA runtime API function call; if the call does not return
 * cudaSuccess, invokes cudaGetLastError() to clear the sticky error state
 * and throws a raft::cuda_error exception detailing the CUDA error that
 * occurred.
 */
#define CUDA_TRY(call)                                                        \
  do {                                                                        \
    /* parenthesize the argument so comma/operator expressions expand safely, \
       matching NCCL_TRY */                                                   \
    cudaError_t const status = (call);                                        \
    if (status != cudaSuccess) {                                              \
      cudaGetLastError();                                                     \
      std::string msg{};                                                      \
      SET_ERROR_MSG(                                                          \
        msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", #call, \
        cudaGetErrorName(status), cudaGetErrorString(status));                \
      throw raft::cuda_error(msg);                                            \
    }                                                                         \
  } while (0)

/** check for cuda runtime API errors and assert accordingly */
#define CUDA_CHECK(call) \
do { \
cudaError_t status = call; \
ASSERT(status == cudaSuccess, "FAIL: call='%s'. Reason:%s", #call, \
cudaGetErrorString(status)); \
} while (0)
/**
 * @brief Debug macro to check for CUDA errors
 *
 * In a non-release build, this macro will synchronize the specified stream
 * before error checking. In both release and non-release builds, this macro
 * checks for any pending CUDA errors from previous calls. If an error is
 * reported, an exception is thrown detailing the CUDA error that occurred.
 *
 * The intent of this macro is to provide a mechanism for synchronous and
 * deterministic execution for debugging asynchronous CUDA execution. It should
 * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an
 * asynchronous kernel launch.
 *
 * No trailing semicolon in the expansion: CUDA_TRY already expands to a
 * single do-while(0) statement, so the caller's semicolon terminates it; an
 * embedded semicolon would produce two statements and break braceless
 * if/else usage.
 */
#ifndef NDEBUG
#define CHECK_CUDA(stream) CUDA_TRY(cudaStreamSynchronize(stream))
#else
#define CHECK_CUDA(stream) CUDA_TRY(cudaPeekAtLastError())
#endif

/**
 * FIXME: temporary alias kept for cuML source compatibility — forwards
 * directly to CUDA_TRY; prefer CUDA_TRY in new code and remove this once
 * downstream callers have migrated.
 */
#define CUDA_CHECK(call) CUDA_TRY(call)

///@todo: enable this only after we have added logging support in raft
// /**
Expand All @@ -114,13 +88,15 @@ class exception : public std::exception {
// */
#define CUDA_CHECK_NO_THROW(call) \
do { \
cudaError_t status = call; \
if (status != cudaSuccess) { \
cudaError_t const status = call; \
if (cudaSuccess != status) { \
printf("CUDA call='%s' at file=%s line=%d failed with %s\n", #call, \
__FILE__, __LINE__, cudaGetErrorString(status)); \
} \
} while (0)

namespace raft {

/** helper method to get max usable shared mem per block parameter */
inline int get_shared_memory_per_block() {
int dev_id;
Expand Down Expand Up @@ -211,4 +187,4 @@ void print_device_vector(const char* variable_name, const T* devMem,
}
/** @} */

}; // namespace raft
} // namespace raft
Loading