From 22a6679b5f5d36530a59c4cbfe1acfd530c2711b Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Thu, 14 Apr 2022 16:09:52 +0800 Subject: [PATCH] Improve cudf::cuda_error (#10630) Closes #10553 Improves `cudf::cuda_error` in two aspects: 1. Add a cudaError_t member to `cudf::cuda_error` and corresponding error_code() function that returns the error code 2. Breaks down `cuda::cuda_error` as `sticky_cuda_error` and `cudart_error`. `sticky_cuda_error` refers to fatal error on device. Authors: - Alfred Xu (https://github.com/sperlingxx) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/10630 --- cpp/include/cudf/utilities/error.hpp | 43 +++++++++++++++++++------- cpp/include/cudf_test/cudf_gtest.hpp | 5 ++- cpp/tests/error/error_handling_test.cu | 1 + 3 files changed, 37 insertions(+), 12 deletions(-) diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp index 8be1a7e3a32..8f6190bbaf7 100644 --- a/cpp/include/cudf/utilities/error.hpp +++ b/cpp/include/cudf/utilities/error.hpp @@ -46,7 +46,20 @@ struct logic_error : public std::logic_error { * @brief Exception thrown when a CUDA error is encountered. */ struct cuda_error : public std::runtime_error { - cuda_error(std::string const& message) : std::runtime_error(message) {} + cuda_error(std::string const& message, cudaError_t const& error) + : std::runtime_error(message), _cudaError(error) + { + } + + public: + cudaError_t error_code() const { return _cudaError; } + + protected: + cudaError_t _cudaError; +}; + +struct fatal_cuda_error : public cuda_error { + using cuda_error::cuda_error; }; /** @} */ @@ -101,9 +114,20 @@ namespace detail { inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int line) { - throw cudf::cuda_error(std::string{"CUDA error encountered at: " + std::string{file} + ":" + - std::to_string(line) + ": " + std::to_string(error) + " " + - cudaGetErrorName(error) + " " + cudaGetErrorString(error)}); + // Calls cudaGetLastError twice. It is nearly certain that a fatal error occurred if the second + // call doesn't return with cudaSuccess. + cudaGetLastError(); + auto const last = cudaGetLastError(); + auto const msg = std::string{"CUDA error encountered at: " + std::string{file} + ":" + + std::to_string(line) + ": " + std::to_string(error) + " " + + cudaGetErrorName(error) + " " + cudaGetErrorString(error)}; + // Call cudaDeviceSynchronize to ensure `last` did not result from an asynchronous error. + // between two calls. + if (error == last && last == cudaDeviceSynchronize()) { + throw fatal_cuda_error{"Fatal " + msg, error}; + } else { + throw cuda_error{msg, error}; + } } } // namespace detail } // namespace cudf @@ -115,13 +139,10 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an * exception detailing the CUDA error that occurred */ -#define CUDF_CUDA_TRY(call) \ - do { \ - cudaError_t const status = (call); \ - if (cudaSuccess != status) { \ - cudaGetLastError(); \ - cudf::detail::throw_cuda_error(status, __FILE__, __LINE__); \ - } \ +#define CUDF_CUDA_TRY(call) \ + do { \ + cudaError_t const status = (call); \ + if (cudaSuccess != status) { cudf::detail::throw_cuda_error(status, __FILE__, __LINE__); } \ } while (0); /** diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp index d078bf90a8a..7bd704a288d 100644 --- a/cpp/include/cudf_test/cudf_gtest.hpp +++ b/cpp/include/cudf_test/cudf_gtest.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -120,6 +120,9 @@ struct TypeList> { #define CUDA_EXPECT_THROW_MESSAGE(x, msg) \ EXPECT_THROW_MESSAGE(x, cudf::cuda_error, "CUDA error encountered at:", msg) +#define FATAL_CUDA_EXPECT_THROW_MESSAGE(x, msg) \ + EXPECT_THROW_MESSAGE(x, cudf::fatal_cuda_error, "Fatal CUDA error encountered at:", msg) + /** * @brief test macro to be expected as no exception. * The testing is same with EXPECT_NO_THROW() in gtest. diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 4327a8b694b..bde8ccc6de7 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -39,6 +39,7 @@ TEST(CudaTryTest, Error) CUDA_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorLaunchFailure), "cudaErrorLaunchFailure unspecified launch failure"); } + TEST(CudaTryTest, Success) { EXPECT_NO_THROW(CUDF_CUDA_TRY(cudaSuccess)); } TEST(CudaTryTest, TryCatch)