Skip to content

Commit

Permalink
Improve cudf::cuda_error (#10630)
Browse files Browse the repository at this point in the history
Closes  #10553

Improves `cudf::cuda_error` in two aspects:
1. Adds a `cudaError_t` member to `cudf::cuda_error` and a corresponding `error_code()` function that returns the error code.
2. Breaks down `cudf::cuda_error` into `sticky_cuda_error` and `cudart_error` (the code in this commit implements this as `fatal_cuda_error`, derived from `cuda_error`). `sticky_cuda_error` refers to a fatal error on the device.

Authors:
  - Alfred Xu (https://github.com/sperlingxx)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: #10630
  • Loading branch information
sperlingxx authored Apr 14, 2022
1 parent 03e84ef commit 22a6679
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 12 deletions.
43 changes: 32 additions & 11 deletions cpp/include/cudf/utilities/error.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,20 @@ struct logic_error : public std::logic_error {
* @brief Exception thrown when a CUDA error is encountered.
*/
struct cuda_error : public std::runtime_error {
/**
 * @brief Constructs the exception from a message only.
 *
 * NOTE(review): this overload does not initialize `_cudaError`. In this diff view it appears to
 * be the pre-change constructor that the two-argument overload below replaces — confirm.
 *
 * @param message Text forwarded to std::runtime_error
 */
cuda_error(std::string const& message) : std::runtime_error(message) {}
/**
 * @brief Constructs the exception from a message and the CUDA error code it reports.
 *
 * @param message Text forwarded to std::runtime_error
 * @param error CUDA error code stored in `_cudaError`
 */
cuda_error(std::string const& message, cudaError_t const& error)
: std::runtime_error(message), _cudaError(error)
{
}

public:
/**
 * @brief Returns the stored CUDA error code.
 *
 * @return The value of `_cudaError`
 */
cudaError_t error_code() const { return _cudaError; }

protected:
/// CUDA error code set by the two-argument constructor and returned by error_code().
cudaError_t _cudaError;
};

/**
 * @brief Exception thrown when a CUDA error is classified as fatal.
 *
 * Derives from cuda_error and inherits its constructors unchanged, so handlers that catch
 * cuda_error also catch this type. It adds no members of its own.
 */
struct fatal_cuda_error : cuda_error {
  // Inherit every cuda_error constructor as-is.
  using cuda_error::cuda_error;
};
/** @} */

Expand Down Expand Up @@ -101,9 +114,20 @@ namespace detail {

/**
 * @brief Throws an exception describing the given CUDA error.
 *
 * The message is "CUDA error encountered at: <file>:<line>: <code> <name> <description>", where
 * name and description come from cudaGetErrorName and cudaGetErrorString. When the error is
 * classified as fatal (see the checks in the body), the message is prefixed with "Fatal " and
 * fatal_cuda_error is thrown; otherwise cuda_error is thrown.
 *
 * @param error CUDA error code to report; also stored in the thrown exception
 * @param file Source file name placed in the message
 * @param line Source line number placed in the message
 */
inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int line)
{
// NOTE(review): in this diff view the throw below appears to be the removed pre-change
// statement; if it were live, none of the statements after it would execute — confirm.
throw cudf::cuda_error(std::string{"CUDA error encountered at: " + std::string{file} + ":" +
std::to_string(line) + ": " + std::to_string(error) + " " +
cudaGetErrorName(error) + " " + cudaGetErrorString(error)});
// Calls cudaGetLastError twice; the first result is discarded. It is nearly certain that a fatal
// error occurred if the second call doesn't return with cudaSuccess.
cudaGetLastError();
auto const last = cudaGetLastError();
auto const msg = std::string{"CUDA error encountered at: " + std::string{file} + ":" +
std::to_string(line) + ": " + std::to_string(error) + " " +
cudaGetErrorName(error) + " " + cudaGetErrorString(error)};
// Call cudaDeviceSynchronize to ensure `last` did not result from an asynchronous error
// between the two calls. The error is treated as fatal only when `error`, `last`, and the
// cudaDeviceSynchronize result are all equal.
if (error == last && last == cudaDeviceSynchronize()) {
throw fatal_cuda_error{"Fatal " + msg, error};
} else {
throw cuda_error{msg, error};
}
}
} // namespace detail
} // namespace cudf
Expand All @@ -115,13 +139,10 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l
* cudaSuccess, invokes cudaGetLastError() to clear the error and throws an
* exception detailing the CUDA error that occurred
*/
// Evaluates `call` once and stores the result in `status`. If `status` is not cudaSuccess, the
// call site's __FILE__ and __LINE__ are passed to cudf::detail::throw_cuda_error along with it.
// NOTE: the `;` after `while (0)` is part of the macro, so an invocation followed by its own `;`
// expands to two statements and cannot be the unbraced body of an `if` that has an `else`.
#define CUDF_CUDA_TRY(call) \
do { \
cudaError_t const status = (call); \
if (cudaSuccess != status) { \
cudaGetLastError(); \
cudf::detail::throw_cuda_error(status, __FILE__, __LINE__); \
} \
#define CUDF_CUDA_TRY(call) \
do { \
cudaError_t const status = (call); \
if (cudaSuccess != status) { cudf::detail::throw_cuda_error(status, __FILE__, __LINE__); } \
} while (0);

/**
Expand Down
5 changes: 4 additions & 1 deletion cpp/include/cudf_test/cudf_gtest.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -120,6 +120,9 @@ struct TypeList<Types<TYPES...>> {
// Forwards `x` and `msg` to EXPECT_THROW_MESSAGE with exception type cudf::cuda_error and the
// fixed prefix "CUDA error encountered at:". Presumably this checks that `x` throws that type
// with a message made of the prefix and `msg` — confirm against EXPECT_THROW_MESSAGE.
#define CUDA_EXPECT_THROW_MESSAGE(x, msg) \
EXPECT_THROW_MESSAGE(x, cudf::cuda_error, "CUDA error encountered at:", msg)

// Forwards `x` and `msg` to EXPECT_THROW_MESSAGE with exception type cudf::fatal_cuda_error and
// the fixed prefix "Fatal CUDA error encountered at:". Presumably this checks that `x` throws
// that type with a message made of the prefix and `msg` — confirm against EXPECT_THROW_MESSAGE.
#define FATAL_CUDA_EXPECT_THROW_MESSAGE(x, msg) \
EXPECT_THROW_MESSAGE(x, cudf::fatal_cuda_error, "Fatal CUDA error encountered at:", msg)

/**
* @brief test macro to be expected as no exception.
* The testing is same with EXPECT_NO_THROW() in gtest.
Expand Down
1 change: 1 addition & 0 deletions cpp/tests/error/error_handling_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ TEST(CudaTryTest, Error)
CUDA_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorLaunchFailure),
"cudaErrorLaunchFailure unspecified launch failure");
}

// CUDF_CUDA_TRY must not throw when the wrapped expression evaluates to cudaSuccess.
TEST(CudaTryTest, Success)
{
  EXPECT_NO_THROW(CUDF_CUDA_TRY(cudaSuccess));
}

TEST(CudaTryTest, TryCatch)
Expand Down

0 comments on commit 22a6679

Please sign in to comment.