From c8b9e6cc08c2759c183eb0d04115b0fc66b53978 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Mon, 11 Apr 2022 17:29:08 +0800 Subject: [PATCH 01/11] enrich cuDF cuda_error Signed-off-by: sperlingxx --- cpp/include/cudf/utilities/error.hpp | 44 +++++++++++++++++++------- cpp/include/cudf_test/cudf_gtest.hpp | 10 ++++-- cpp/tests/error/error_handling_test.cu | 17 +++++----- 3 files changed, 50 insertions(+), 21 deletions(-) diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp index 8be1a7e3a32..d95079c5610 100644 --- a/cpp/include/cudf/utilities/error.hpp +++ b/cpp/include/cudf/utilities/error.hpp @@ -46,7 +46,25 @@ struct logic_error : public std::logic_error { * @brief Exception thrown when a CUDA error is encountered. */ struct cuda_error : public std::runtime_error { - cuda_error(std::string const& message) : std::runtime_error(message) {} + cuda_error(std::string const& message, cudaError_t const& error) + : std::runtime_error(message), _cudaError(error) + { + } + cudaError_t error_code() { return _cudaError; } + + protected: + cudaError_t _cudaError; +}; + +struct cudart_error : public cuda_error { + cudart_error(std::string const& message, cudaError_t const& error) : cuda_error(message, error) {} +}; + +struct sticky_cuda_error : public cuda_error { + sticky_cuda_error(std::string const& message, cudaError_t const& error) + : cuda_error(message, error) + { + } }; /** @} */ @@ -101,9 +119,16 @@ namespace detail { inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int line) { - throw cudf::cuda_error(std::string{"CUDA error encountered at: " + std::string{file} + ":" + - std::to_string(line) + ": " + std::to_string(error) + " " + - cudaGetErrorName(error) + " " + cudaGetErrorString(error)}); + cudaGetLastError(); + auto const last = cudaGetLastError(); + auto const msg = std::string{"CUDA error encountered at: " + std::string{file} + ":" + + std::to_string(line) + ": " + std::to_string(error) + " " + + cudaGetErrorName(error) + " " + cudaGetErrorString(error)}; + if (error == last && last == cudaDeviceSynchronize()) { + throw sticky_cuda_error{"Sticky " + msg, error}; + } else { + throw cudart_error{msg, error}; + } } } // namespace detail } // namespace cudf @@ -115,13 +140,10 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an * exception detailing the CUDA error that occurred */ -#define CUDF_CUDA_TRY(call) \ - do { \ - cudaError_t const status = (call); \ - if (cudaSuccess != status) { \ - cudaGetLastError(); \ - cudf::detail::throw_cuda_error(status, __FILE__, __LINE__); \ - } \ +#define CUDF_CUDA_TRY(call) \ + do { \ + cudaError_t const status = (call); \ + if (cudaSuccess != status) { cudf::detail::throw_cuda_error(status, __FILE__, __LINE__); } \ } while (0); /** diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp index d078bf90a8a..1c6dc7209bf 100644 --- a/cpp/include/cudf_test/cudf_gtest.hpp +++ b/cpp/include/cudf_test/cudf_gtest.hpp @@ -117,8 +117,14 @@ struct TypeList> { #define CUDF_EXPECT_THROW_MESSAGE(x, msg) \ EXPECT_THROW_MESSAGE(x, cudf::logic_error, "cuDF failure at:", msg) -#define CUDA_EXPECT_THROW_MESSAGE(x, msg) \ - EXPECT_THROW_MESSAGE(x, cudf::cuda_error, "CUDA error encountered at:", msg) +#define CUDART_EXPECT_THROW_MESSAGE(x, msg) \ + EXPECT_THROW_MESSAGE(x, cudf::cudart_error, "CUDA error encountered at:", msg) + +#define STICKY_CUDA_EXPECT_THROW_MESSAGE(x, msg) \ + EXPECT_THROW_MESSAGE(x, cudf::sticky_cuda_error, "Sticky CUDA error encountered at:", msg) + +#define STICKY_CUDA_EXPECT_THROW_MESSAGE_1(x, msg) \ + EXPECT_THROW_MESSAGE(x, cudf::cuda_error, "Sticky CUDA error encountered at:", msg) /** * @brief test macro to be expected as no exception. diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 4327a8b694b..315c5234a40 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -36,15 +36,16 @@ TEST(ExpectsTest, TryCatch) TEST(CudaTryTest, Error) { - CUDA_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorLaunchFailure), - "cudaErrorLaunchFailure unspecified launch failure"); + CUDART_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorLaunchFailure), + "cudaErrorLaunchFailure unspecified launch failure"); } + TEST(CudaTryTest, Success) { EXPECT_NO_THROW(CUDF_CUDA_TRY(cudaSuccess)); } TEST(CudaTryTest, TryCatch) { - CUDA_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorMemoryAllocation), - "cudaErrorMemoryAllocation out of memory"); + CUDART_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorMemoryAllocation), + "cudaErrorMemoryAllocation out of memory"); } TEST(StreamCheck, success) { EXPECT_NO_THROW(CUDF_CHECK_CUDA(0)); } @@ -67,7 +68,7 @@ TEST(StreamCheck, FailedKernel) #ifdef NDEBUG stream.synchronize(); #endif - EXPECT_THROW(CUDF_CHECK_CUDA(stream.value()), cudf::cuda_error); + EXPECT_THROW(CUDF_CHECK_CUDA(stream.value()), cudf::cudart_error); } TEST(StreamCheck, CatchFailedKernel) @@ -78,9 +79,9 @@ TEST(StreamCheck, CatchFailedKernel) #ifndef NDEBUG stream.synchronize(); #endif - CUDA_EXPECT_THROW_MESSAGE(CUDF_CHECK_CUDA(stream.value()), - "cudaErrorInvalidConfiguration " - "invalid configuration argument"); + CUDART_EXPECT_THROW_MESSAGE(CUDF_CHECK_CUDA(stream.value()), + "cudaErrorInvalidConfiguration " + "invalid configuration argument"); } #ifndef NDEBUG From df095e314ce586578eb21d28508501839bf99458 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Mon, 11 Apr 2022 18:04:08 +0800 Subject: [PATCH 02/11] update year range --- cpp/include/cudf_test/cudf_gtest.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp index 1c6dc7209bf..a3326a5f8f9 100644 --- a/cpp/include/cudf_test/cudf_gtest.hpp +++ b/cpp/include/cudf_test/cudf_gtest.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 9c61429d773487118c5cf3925b67936e584623b6 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Tue, 12 Apr 2022 14:32:06 +0800 Subject: [PATCH 03/11] update --- cpp/include/cudf/utilities/error.hpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp index d95079c5610..505fe15702b 100644 --- a/cpp/include/cudf/utilities/error.hpp +++ b/cpp/include/cudf/utilities/error.hpp @@ -57,14 +57,11 @@ struct cuda_error : public std::runtime_error { }; struct cudart_error : public cuda_error { - cudart_error(std::string const& message, cudaError_t const& error) : cuda_error(message, error) {} + using cuda_error::cuda_error; }; -struct sticky_cuda_error : public cuda_error { - sticky_cuda_error(std::string const& message, cudaError_t const& error) - : cuda_error(message, error) - { - } +struct fatal_cuda_error : public cuda_error { + using cuda_error::cuda_error; }; /** @} */ @@ -119,13 +116,17 @@ namespace detail { inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int line) { + // Calls cudaGetLastError twice. It is nearly certain that a fatal error occurred if the second + // call doesn't return with cudaSuccess. cudaGetLastError(); auto const last = cudaGetLastError(); auto const msg = std::string{"CUDA error encountered at: " + std::string{file} + ":" + std::to_string(line) + ": " + std::to_string(error) + " " + cudaGetErrorName(error) + " " + cudaGetErrorString(error)}; + // Calls cudaDeviceSynchronize to make sure that there is no other asynchronize error occurs + // between two calls. if (error == last && last == cudaDeviceSynchronize()) { - throw sticky_cuda_error{"Sticky " + msg, error}; + throw fatal_cuda_error{"Sticky " + msg, error}; } else { throw cudart_error{msg, error}; } From a4837fb7ad4b6112f3d290b1c5602007091be727 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Tue, 12 Apr 2022 17:52:22 +0800 Subject: [PATCH 04/11] update --- cpp/include/cudf/utilities/error.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp index 505fe15702b..64b40aac425 100644 --- a/cpp/include/cudf/utilities/error.hpp +++ b/cpp/include/cudf/utilities/error.hpp @@ -50,6 +50,8 @@ struct cuda_error : public std::runtime_error { : std::runtime_error(message), _cudaError(error) { } + + public: cudaError_t error_code() { return _cudaError; } protected: From 51bf915c11d31cb05fed33f910e5a04102f6ce56 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Wed, 13 Apr 2022 16:53:36 +0800 Subject: [PATCH 05/11] add --- cpp/include/cudf/utilities/error.hpp | 6 +++--- cpp/include/cudf_test/cudf_gtest.hpp | 11 ++++------- cpp/tests/error/error_handling_test.cu | 16 ++++++++-------- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp index 64b40aac425..8249f20f66b 100644 --- a/cpp/include/cudf/utilities/error.hpp +++ b/cpp/include/cudf/utilities/error.hpp @@ -52,7 +52,7 @@ struct cuda_error : public std::runtime_error { } public: - cudaError_t error_code() { return _cudaError; } + cudaError_t error_code() const { return _cudaError; } protected: cudaError_t _cudaError; @@ -128,9 +128,9 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l // Calls cudaDeviceSynchronize to make sure that there is no other asynchronize error occurs // between two calls. if (error == last && last == cudaDeviceSynchronize()) { - throw fatal_cuda_error{"Sticky " + msg, error}; + throw fatal_cuda_error{"Fatal " + msg, error}; } else { - throw cudart_error{msg, error}; + throw cuda_error{msg, error}; } } } // namespace detail diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp index a3326a5f8f9..7bd704a288d 100644 --- a/cpp/include/cudf_test/cudf_gtest.hpp +++ b/cpp/include/cudf_test/cudf_gtest.hpp @@ -117,14 +117,11 @@ struct TypeList> { #define CUDF_EXPECT_THROW_MESSAGE(x, msg) \ EXPECT_THROW_MESSAGE(x, cudf::logic_error, "cuDF failure at:", msg) -#define CUDART_EXPECT_THROW_MESSAGE(x, msg) \ - EXPECT_THROW_MESSAGE(x, cudf::cudart_error, "CUDA error encountered at:", msg) +#define CUDA_EXPECT_THROW_MESSAGE(x, msg) \ + EXPECT_THROW_MESSAGE(x, cudf::cuda_error, "CUDA error encountered at:", msg) -#define STICKY_CUDA_EXPECT_THROW_MESSAGE(x, msg) \ - EXPECT_THROW_MESSAGE(x, cudf::sticky_cuda_error, "Sticky CUDA error encountered at:", msg) - -#define STICKY_CUDA_EXPECT_THROW_MESSAGE_1(x, msg) \ - EXPECT_THROW_MESSAGE(x, cudf::cuda_error, "Sticky CUDA error encountered at:", msg) +#define FATAL_CUDA_EXPECT_THROW_MESSAGE(x, msg) \ + EXPECT_THROW_MESSAGE(x, cudf::fatal_cuda_error, "Fatal CUDA error encountered at:", msg) /** * @brief test macro to be expected as no exception. diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 315c5234a40..bde8ccc6de7 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -36,16 +36,16 @@ TEST(ExpectsTest, TryCatch) TEST(CudaTryTest, Error) { - CUDART_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorLaunchFailure), - "cudaErrorLaunchFailure unspecified launch failure"); + CUDA_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorLaunchFailure), + "cudaErrorLaunchFailure unspecified launch failure"); } TEST(CudaTryTest, Success) { EXPECT_NO_THROW(CUDF_CUDA_TRY(cudaSuccess)); } TEST(CudaTryTest, TryCatch) { - CUDART_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorMemoryAllocation), - "cudaErrorMemoryAllocation out of memory"); + CUDA_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorMemoryAllocation), + "cudaErrorMemoryAllocation out of memory"); } TEST(StreamCheck, success) { EXPECT_NO_THROW(CUDF_CHECK_CUDA(0)); } @@ -68,7 +68,7 @@ TEST(StreamCheck, FailedKernel) #ifdef NDEBUG stream.synchronize(); #endif - EXPECT_THROW(CUDF_CHECK_CUDA(stream.value()), cudf::cudart_error); + EXPECT_THROW(CUDF_CHECK_CUDA(stream.value()), cudf::cuda_error); } TEST(StreamCheck, CatchFailedKernel) @@ -79,9 +79,9 @@ TEST(StreamCheck, CatchFailedKernel) #ifndef NDEBUG stream.synchronize(); #endif - CUDART_EXPECT_THROW_MESSAGE(CUDF_CHECK_CUDA(stream.value()), - "cudaErrorInvalidConfiguration " - "invalid configuration argument"); + CUDA_EXPECT_THROW_MESSAGE(CUDF_CHECK_CUDA(stream.value()), + "cudaErrorInvalidConfiguration " + "invalid configuration argument"); } #ifndef NDEBUG From 413923997e6d4a7d1c2709ca17ec3db0d3014617 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Wed, 13 Apr 2022 17:12:36 +0800 Subject: [PATCH 06/11] with JNI --- cpp/include/cudf/utilities/error.hpp | 4 - .../java/ai/rapids/cudf/CudaException.java | 161 +++++++++++++++++- java/src/main/native/include/jni_utils.hpp | 73 ++++---- java/src/main/native/src/CudaJni.cpp | 4 +- java/src/main/native/src/RmmJni.cpp | 6 +- .../test/java/ai/rapids/cudf/CudaTest.java | 17 +- 6 files changed, 209 insertions(+), 56 deletions(-) diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp index 8249f20f66b..07874ae6ece 100644 --- a/cpp/include/cudf/utilities/error.hpp +++ b/cpp/include/cudf/utilities/error.hpp @@ -58,10 +58,6 @@ struct cuda_error : public std::runtime_error { cudaError_t _cudaError; }; -struct cudart_error : public cuda_error { - using cuda_error::cuda_error; -}; - struct fatal_cuda_error : public cuda_error { using cuda_error::cuda_error; }; diff --git a/java/src/main/java/ai/rapids/cudf/CudaException.java b/java/src/main/java/ai/rapids/cudf/CudaException.java index 2d862b47ef8..2cad2c09979 100755 --- a/java/src/main/java/ai/rapids/cudf/CudaException.java +++ b/java/src/main/java/ai/rapids/cudf/CudaException.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,9 +30,168 @@ public class CudaException extends RuntimeException { CudaException(String message) { super(message); + isFatal = message.startsWith("Fatal"); + cudaError = extractCudaError(message); } CudaException(String message, Throwable cause) { super(message, cause); + isFatal = message.startsWith("Fatal"); + cudaError = extractCudaError(message); + } + + /** + * Returns whether this CudaError is fatal or not. + * + * Fatal errors leave the process in an inconsistent state and any further CUDA work will return + * the same error. To continue using CUDA, the process must be terminated and relaunched. + */ + public boolean isFatal() { + return isFatal; + } + + public final CudaError cudaError; + + private final boolean isFatal; + + private static CudaError extractCudaError(String msg) { + int startIdx = msg.indexOf('['); + int endIdx = msg.indexOf(']'); + return CudaError.valueOf(msg.substring(startIdx + 1, endIdx)); + } + + /** + * The Java mirror of cudaError, which facilities the tracking of CUDA errors in JVM. + */ + public enum CudaError { + cudaErrorInvalidValue(1), + cudaErrorMemoryAllocation(2), + cudaErrorInitializationError(3), + cudaErrorCudartUnloading(4), + cudaErrorProfilerDisabled(5), + cudaErrorProfilerNotInitialized(6), + cudaErrorProfilerAlreadyStarted(7), + cudaErrorProfilerAlreadyStopped(8), + cudaErrorInvalidConfiguration(9), + cudaErrorInvalidPitchValue(12), + cudaErrorInvalidSymbol(13), + cudaErrorInvalidHostPointer(16), + cudaErrorInvalidDevicePointer(17), + cudaErrorInvalidTexture(18), + cudaErrorInvalidTextureBinding(19), + cudaErrorInvalidChannelDescriptor(20), + cudaErrorInvalidMemcpyDirection(21), + cudaErrorAddressOfConstant(22), + cudaErrorTextureFetchFailed(23), + cudaErrorTextureNotBound(24), + cudaErrorSynchronizationError(25), + cudaErrorInvalidFilterSetting(26), + cudaErrorInvalidNormSetting(27), + cudaErrorMixedDeviceExecution(28), + cudaErrorNotYetImplemented(31), + cudaErrorMemoryValueTooLarge(32), + cudaErrorStubLibrary(34), + cudaErrorInsufficientDriver(35), + cudaErrorCallRequiresNewerDriver(36), + cudaErrorInvalidSurface(37), + cudaErrorDuplicateVariableName(43), + cudaErrorDuplicateTextureName(44), + cudaErrorDuplicateSurfaceName(45), + cudaErrorDevicesUnavailable(46), + cudaErrorIncompatibleDriverContext(49), + cudaErrorMissingConfiguration(52), + cudaErrorPriorLaunchFailure(53), + cudaErrorLaunchMaxDepthExceeded(65), + cudaErrorLaunchFileScopedTex(66), + cudaErrorLaunchFileScopedSurf(67), + cudaErrorSyncDepthExceeded(68), + cudaErrorLaunchPendingCountExceeded(69), + cudaErrorInvalidDeviceFunction(98), + cudaErrorNoDevice(100), + cudaErrorInvalidDevice(101), + cudaErrorDeviceNotLicensed(102), + cudaErrorSoftwareValidityNotEstablished(103), + cudaErrorStartupFailure(127), + cudaErrorInvalidKernelImage(200), + cudaErrorDeviceUninitialized(201), + cudaErrorMapBufferObjectFailed(205), + cudaErrorUnmapBufferObjectFailed(206), + cudaErrorArrayIsMapped(207), + cudaErrorAlreadyMapped(208), + cudaErrorNoKernelImageForDevice(209), + cudaErrorAlreadyAcquired(210), + cudaErrorNotMapped(211), + cudaErrorNotMappedAsArray(212), + cudaErrorNotMappedAsPointer(213), + cudaErrorECCUncorrectable(214), + cudaErrorUnsupportedLimit(215), + cudaErrorDeviceAlreadyInUse(216), + cudaErrorPeerAccessUnsupported(217), + cudaErrorInvalidPtx(218), + cudaErrorInvalidGraphicsContext(219), + cudaErrorNvlinkUncorrectable(220), + cudaErrorJitCompilerNotFound(221), + cudaErrorUnsupportedPtxVersion(222), + cudaErrorJitCompilationDisabled(223), + cudaErrorUnsupportedExecAffinity(224), + cudaErrorInvalidSource(300), + cudaErrorFileNotFound(301), + cudaErrorSharedObjectSymbolNotFound(302), + cudaErrorSharedObjectInitFailed(303), + cudaErrorOperatingSystem(304), + cudaErrorInvalidResourceHandle(400), + cudaErrorIllegalState(401), + cudaErrorSymbolNotFound(500), + cudaErrorNotReady(600), + cudaErrorIllegalAddress(700), + cudaErrorLaunchOutOfResources(701), + cudaErrorLaunchTimeout(702), + cudaErrorLaunchIncompatibleTexturing(703), + cudaErrorPeerAccessAlreadyEnabled(704), + cudaErrorPeerAccessNotEnabled(705), + cudaErrorSetOnActiveProcess(708), + cudaErrorContextIsDestroyed(709), + cudaErrorAssert(710), + cudaErrorTooManyPeers(711), + cudaErrorHostMemoryAlreadyRegistered(712), + cudaErrorHostMemoryNotRegistered(713), + cudaErrorHardwareStackError(714), + cudaErrorIllegalInstruction(715), + cudaErrorMisalignedAddress(716), + cudaErrorInvalidAddressSpace(717), + cudaErrorInvalidPc(718), + cudaErrorLaunchFailure(719), + cudaErrorCooperativeLaunchTooLarge(720), + cudaErrorNotPermitted(800), + cudaErrorNotSupported(801), + cudaErrorSystemNotReady(802), + cudaErrorSystemDriverMismatch(803), + cudaErrorCompatNotSupportedOnDevice(804), + cudaErrorMpsConnectionFailed(805), + cudaErrorMpsRpcFailure(806), + cudaErrorMpsServerNotReady(807), + cudaErrorMpsMaxClientsReached(808), + cudaErrorMpsMaxConnectionsReached(809), + cudaErrorStreamCaptureUnsupported(900), + cudaErrorStreamCaptureInvalidated(901), + cudaErrorStreamCaptureMerge(902), + cudaErrorStreamCaptureUnmatched(903), + cudaErrorStreamCaptureUnjoined(904), + cudaErrorStreamCaptureIsolation(905), + cudaErrorStreamCaptureImplicit(906), + cudaErrorCapturedEvent(907), + cudaErrorStreamCaptureWrongThread(908), + cudaErrorTimeout(909), + cudaErrorGraphExecUpdateFailure(910), + cudaErrorExternalDevice(911), + cudaErrorUnknown(999), + cudaErrorApiFailureBase(10000); + + final int code; + + CudaError(int errorCode) { + this.code = errorCode; + } + } } diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index a45716a89b3..ebaa1e90b5e 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -733,41 +733,6 @@ class native_jstringArray { } }; -/** - * @brief create a cuda exception from a given cudaError_t - */ -inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowable cause = NULL) { - jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS); - if (ex_class == NULL) { - return NULL; - } - jmethodID ctor_id = - env->GetMethodID(ex_class, "", "(Ljava/lang/String;Ljava/lang/Throwable;)V"); - if (ctor_id == NULL) { - return NULL; - } - - jstring msg = env->NewStringUTF(cudaGetErrorString(status)); - if (msg == NULL) { - return NULL; - } - - jobject ret = env->NewObject(ex_class, ctor_id, msg, cause); - return (jthrowable)ret; -} - -inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { - if (cudaSuccess != cuda_status) { - // Clear the last error so it does not propagate. - cudaGetLastError(); - jthrowable jt = cuda_exception(env, cuda_status); - if (jt != NULL) { - env->Throw(jt); - throw jni_exception("CUDA ERROR"); - } - } -} - } // namespace jni } // namespace cudf @@ -790,19 +755,35 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { JNI_THROW_NEW(env, class_name, message, ret_val) \ } +// Throw a new exception only if one is not pending then always return with the specified value +#define JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, fatal, ret_val) \ + do { \ + if (env->ExceptionOccurred()) { \ + return ret_val; \ + } \ + jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS); \ + const char *e_name = cudaGetErrorName(e.error_code()); \ + std::string what = std::string(fatal ? "Fatal CUDA ERROR [" : "CUDA ERROR [") + e_name + \ + "]: " + (e.what() == nullptr ? "" : e.what()); \ + env->ThrowNew(ex_class, what.c_str()); \ + return ret_val; \ + } while (0) + #define JNI_CUDA_TRY(env, ret_val, call) \ - { \ + do { \ cudaError_t internal_cuda_status = (call); \ if (cudaSuccess != internal_cuda_status) { \ - /* Clear the last error so it does not propagate.*/ \ - cudaGetLastError(); \ - jthrowable jt = cudf::jni::cuda_exception(env, internal_cuda_status); \ - if (jt != NULL) { \ - env->Throw(jt); \ - } \ + cudf::detail::throw_cuda_error(internal_cuda_status, __FILE__, __LINE__); \ return ret_val; \ } \ - } + } while (0) + +#define JNI_CUDA_CHECK(env, cuda_status) \ + do { \ + if (cudaSuccess != cuda_status) { \ + cudf::detail::throw_cuda_error(cuda_status, __FILE__, __LINE__); \ + } \ + } while (0) #define JNI_NULL_CHECK(env, obj, error_msg, ret_val) \ { \ @@ -831,6 +812,12 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what()); \ JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ } \ + catch (const cudf::cuda_error &e) { \ + JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, false, ret_val); \ + } \ + catch (const cudf::fatal_cuda_error &e) { \ + JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, true, ret_val); \ + } \ catch (const std::exception &e) { \ /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \ JNI_CHECK_THROW_NEW(env, class_name, e.what(), ret_val); \ diff --git a/java/src/main/native/src/CudaJni.cpp b/java/src/main/native/src/CudaJni.cpp index 9862c3bface..caa48b00e77 100644 --- a/java/src/main/native/src/CudaJni.cpp +++ b/java/src/main/native/src/CudaJni.cpp @@ -44,7 +44,7 @@ void auto_set_device(JNIEnv *env) { if (Cudf_device != cudaInvalidDeviceId) { if (Thread_device != Cudf_device) { cudaError_t cuda_status = cudaSetDevice(Cudf_device); - jni_cuda_check(env, cuda_status); + JNI_CUDA_CHECK(env, cuda_status); Thread_device = Cudf_device; } } @@ -53,7 +53,7 @@ void auto_set_device(JNIEnv *env) { /** Fills all the bytes in the buffer 'buf' with 'value'. */ void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value) { cudaError_t cuda_status = cudaMemsetAsync((void *)buf.data(), value, buf.size()); - jni_cuda_check(env, cuda_status); + JNI_CUDA_CHECK(env, cuda_status); } } // namespace jni diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index ce3e6ffb285..41f1c23ac11 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -327,11 +327,9 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j jstring jpath, jlong pool_size) { try { // make sure the CUDA device is setup in the context - cudaError_t cuda_status = cudaFree(0); - cudf::jni::jni_cuda_check(env, cuda_status); + JNI_CUDA_CHECK(env, cudaFree(0)); int device_id; - cuda_status = cudaGetDevice(&device_id); - cudf::jni::jni_cuda_check(env, cuda_status); + JNI_CUDA_CHECK(env, cudaGetDevice(&device_id)); bool use_pool_alloc = allocation_mode & 1; bool use_managed_mem = allocation_mode & 2; diff --git a/java/src/test/java/ai/rapids/cudf/CudaTest.java b/java/src/test/java/ai/rapids/cudf/CudaTest.java index 8905c2edd56..f5af2b02fc5 100644 --- a/java/src/test/java/ai/rapids/cudf/CudaTest.java +++ b/java/src/test/java/ai/rapids/cudf/CudaTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.*; public class CudaTest { @@ -32,4 +32,17 @@ public void testGetCudaRuntimeInfo() { assertEquals(Cuda.getNativeComputeMode(), Cuda.getComputeMode().nativeId); } + @Test + public void testCudaException() { + assertThrows(CudaException.class, () -> { + try { + Cuda.memset(Long.MAX_VALUE, (byte) 0, 1024); + } catch (CudaException ex) { + assertEquals(CudaException.CudaError.cudaErrorInvalidValue, ex.cudaError); + assertFalse(ex.isFatal()); + throw ex; + } + } + ); + } } From 3209291813e6c0698a6cbf942c87defe17138462 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Wed, 13 Apr 2022 17:21:18 +0800 Subject: [PATCH 07/11] fix --- java/src/main/native/src/CudaJni.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/java/src/main/native/src/CudaJni.cpp b/java/src/main/native/src/CudaJni.cpp index caa48b00e77..3af58605864 100644 --- a/java/src/main/native/src/CudaJni.cpp +++ b/java/src/main/native/src/CudaJni.cpp @@ -41,19 +41,23 @@ void set_cudf_device(int device) { * is using the same device. */ void auto_set_device(JNIEnv *env) { - if (Cudf_device != cudaInvalidDeviceId) { - if (Thread_device != Cudf_device) { - cudaError_t cuda_status = cudaSetDevice(Cudf_device); - JNI_CUDA_CHECK(env, cuda_status); - Thread_device = Cudf_device; + try { + if (Cudf_device != cudaInvalidDeviceId) { + if (Thread_device != Cudf_device) { + JNI_CUDA_CHECK(env, cudaSetDevice(Cudf_device)); + Thread_device = Cudf_device; + } } } + CATCH_STD(env, ); } /** Fills all the bytes in the buffer 'buf' with 'value'. */ void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value) { - cudaError_t cuda_status = cudaMemsetAsync((void *)buf.data(), value, buf.size()); - JNI_CUDA_CHECK(env, cuda_status); + try { + JNI_CUDA_CHECK(env, cudaMemsetAsync((void *)buf.data(), value, buf.size())); + } + CATCH_STD(env, ); } } // namespace jni From 5a45016c1225e5e1a414043f9101caa1c8da5ab4 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Wed, 13 Apr 2022 17:34:23 +0800 Subject: [PATCH 08/11] revert JNI --- .../java/ai/rapids/cudf/CudaException.java | 161 +----------------- java/src/main/native/include/jni_utils.hpp | 73 ++++---- java/src/main/native/src/CudaJni.cpp | 18 +- java/src/main/native/src/RmmJni.cpp | 6 +- 4 files changed, 55 insertions(+), 203 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/CudaException.java b/java/src/main/java/ai/rapids/cudf/CudaException.java index 2cad2c09979..2d862b47ef8 100755 --- a/java/src/main/java/ai/rapids/cudf/CudaException.java +++ b/java/src/main/java/ai/rapids/cudf/CudaException.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,168 +30,9 @@ public class CudaException extends RuntimeException { CudaException(String message) { super(message); - isFatal = message.startsWith("Fatal"); - cudaError = extractCudaError(message); } CudaException(String message, Throwable cause) { super(message, cause); - isFatal = message.startsWith("Fatal"); - cudaError = extractCudaError(message); - } - - /** - * Returns whether this CudaError is fatal or not. - * - * Fatal errors leave the process in an inconsistent state and any further CUDA work will return - * the same error. To continue using CUDA, the process must be terminated and relaunched. - */ - public boolean isFatal() { - return isFatal; - } - - public final CudaError cudaError; - - private final boolean isFatal; - - private static CudaError extractCudaError(String msg) { - int startIdx = msg.indexOf('['); - int endIdx = msg.indexOf(']'); - return CudaError.valueOf(msg.substring(startIdx + 1, endIdx)); - } - - /** - * The Java mirror of cudaError, which facilities the tracking of CUDA errors in JVM. - */ - public enum CudaError { - cudaErrorInvalidValue(1), - cudaErrorMemoryAllocation(2), - cudaErrorInitializationError(3), - cudaErrorCudartUnloading(4), - cudaErrorProfilerDisabled(5), - cudaErrorProfilerNotInitialized(6), - cudaErrorProfilerAlreadyStarted(7), - cudaErrorProfilerAlreadyStopped(8), - cudaErrorInvalidConfiguration(9), - cudaErrorInvalidPitchValue(12), - cudaErrorInvalidSymbol(13), - cudaErrorInvalidHostPointer(16), - cudaErrorInvalidDevicePointer(17), - cudaErrorInvalidTexture(18), - cudaErrorInvalidTextureBinding(19), - cudaErrorInvalidChannelDescriptor(20), - cudaErrorInvalidMemcpyDirection(21), - cudaErrorAddressOfConstant(22), - cudaErrorTextureFetchFailed(23), - cudaErrorTextureNotBound(24), - cudaErrorSynchronizationError(25), - cudaErrorInvalidFilterSetting(26), - cudaErrorInvalidNormSetting(27), - cudaErrorMixedDeviceExecution(28), - cudaErrorNotYetImplemented(31), - cudaErrorMemoryValueTooLarge(32), - cudaErrorStubLibrary(34), - cudaErrorInsufficientDriver(35), - cudaErrorCallRequiresNewerDriver(36), - cudaErrorInvalidSurface(37), - cudaErrorDuplicateVariableName(43), - cudaErrorDuplicateTextureName(44), - cudaErrorDuplicateSurfaceName(45), - cudaErrorDevicesUnavailable(46), - cudaErrorIncompatibleDriverContext(49), - cudaErrorMissingConfiguration(52), - cudaErrorPriorLaunchFailure(53), - cudaErrorLaunchMaxDepthExceeded(65), - cudaErrorLaunchFileScopedTex(66), - cudaErrorLaunchFileScopedSurf(67), - cudaErrorSyncDepthExceeded(68), - cudaErrorLaunchPendingCountExceeded(69), - cudaErrorInvalidDeviceFunction(98), - cudaErrorNoDevice(100), - cudaErrorInvalidDevice(101), - cudaErrorDeviceNotLicensed(102), - cudaErrorSoftwareValidityNotEstablished(103), - cudaErrorStartupFailure(127), - cudaErrorInvalidKernelImage(200), - cudaErrorDeviceUninitialized(201), - cudaErrorMapBufferObjectFailed(205), - cudaErrorUnmapBufferObjectFailed(206), - cudaErrorArrayIsMapped(207), - cudaErrorAlreadyMapped(208), - cudaErrorNoKernelImageForDevice(209), - cudaErrorAlreadyAcquired(210), - cudaErrorNotMapped(211), - cudaErrorNotMappedAsArray(212), - cudaErrorNotMappedAsPointer(213), - cudaErrorECCUncorrectable(214), - cudaErrorUnsupportedLimit(215), - cudaErrorDeviceAlreadyInUse(216), - cudaErrorPeerAccessUnsupported(217), - cudaErrorInvalidPtx(218), - cudaErrorInvalidGraphicsContext(219), - cudaErrorNvlinkUncorrectable(220), - cudaErrorJitCompilerNotFound(221), - cudaErrorUnsupportedPtxVersion(222), - cudaErrorJitCompilationDisabled(223), - cudaErrorUnsupportedExecAffinity(224), - cudaErrorInvalidSource(300), - cudaErrorFileNotFound(301), - cudaErrorSharedObjectSymbolNotFound(302), - cudaErrorSharedObjectInitFailed(303), - cudaErrorOperatingSystem(304), - cudaErrorInvalidResourceHandle(400), - cudaErrorIllegalState(401), - cudaErrorSymbolNotFound(500), - cudaErrorNotReady(600), - cudaErrorIllegalAddress(700), - cudaErrorLaunchOutOfResources(701), - cudaErrorLaunchTimeout(702), - cudaErrorLaunchIncompatibleTexturing(703), - cudaErrorPeerAccessAlreadyEnabled(704), - cudaErrorPeerAccessNotEnabled(705), - cudaErrorSetOnActiveProcess(708), - cudaErrorContextIsDestroyed(709), - cudaErrorAssert(710), - cudaErrorTooManyPeers(711), - cudaErrorHostMemoryAlreadyRegistered(712), - cudaErrorHostMemoryNotRegistered(713), - cudaErrorHardwareStackError(714), - cudaErrorIllegalInstruction(715), - cudaErrorMisalignedAddress(716), - cudaErrorInvalidAddressSpace(717), - cudaErrorInvalidPc(718), - cudaErrorLaunchFailure(719), - cudaErrorCooperativeLaunchTooLarge(720), - cudaErrorNotPermitted(800), - cudaErrorNotSupported(801), - cudaErrorSystemNotReady(802), - cudaErrorSystemDriverMismatch(803), - cudaErrorCompatNotSupportedOnDevice(804), - cudaErrorMpsConnectionFailed(805), - cudaErrorMpsRpcFailure(806), - cudaErrorMpsServerNotReady(807), - cudaErrorMpsMaxClientsReached(808), - cudaErrorMpsMaxConnectionsReached(809), - cudaErrorStreamCaptureUnsupported(900), - cudaErrorStreamCaptureInvalidated(901), - cudaErrorStreamCaptureMerge(902), - cudaErrorStreamCaptureUnmatched(903), - cudaErrorStreamCaptureUnjoined(904), - cudaErrorStreamCaptureIsolation(905), - cudaErrorStreamCaptureImplicit(906), - cudaErrorCapturedEvent(907), - cudaErrorStreamCaptureWrongThread(908), - cudaErrorTimeout(909), - cudaErrorGraphExecUpdateFailure(910), - cudaErrorExternalDevice(911), - cudaErrorUnknown(999), - cudaErrorApiFailureBase(10000); - - final int code; - - CudaError(int errorCode) { - this.code = errorCode; - } - } } diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index ebaa1e90b5e..a45716a89b3 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -733,6 +733,41 @@ class native_jstringArray { } }; +/** + * @brief create a cuda exception from a given cudaError_t + */ +inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowable cause = NULL) { + jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS); + if (ex_class == NULL) { + return NULL; + } + jmethodID ctor_id = + env->GetMethodID(ex_class, "", "(Ljava/lang/String;Ljava/lang/Throwable;)V"); + if (ctor_id == NULL) { + return NULL; + } + + jstring msg = env->NewStringUTF(cudaGetErrorString(status)); + if (msg == NULL) { + return NULL; + } + + jobject ret = env->NewObject(ex_class, ctor_id, msg, cause); + return (jthrowable)ret; +} + +inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { + if (cudaSuccess != cuda_status) { + // Clear the last error so it does not propagate. + cudaGetLastError(); + jthrowable jt = cuda_exception(env, cuda_status); + if (jt != NULL) { + env->Throw(jt); + throw jni_exception("CUDA ERROR"); + } + } +} + } // namespace jni } // namespace cudf @@ -755,35 +790,19 @@ class native_jstringArray { JNI_THROW_NEW(env, class_name, message, ret_val) \ } -// Throw a new exception only if one is not pending then always return with the specified value -#define JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, fatal, ret_val) \ - do { \ - if (env->ExceptionOccurred()) { \ - return ret_val; \ - } \ - jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS); \ - const char *e_name = cudaGetErrorName(e.error_code()); \ - std::string what = std::string(fatal ? "Fatal CUDA ERROR [" : "CUDA ERROR [") + e_name + \ - "]: " + (e.what() == nullptr ? "" : e.what()); \ - env->ThrowNew(ex_class, what.c_str()); \ - return ret_val; \ - } while (0) - #define JNI_CUDA_TRY(env, ret_val, call) \ - do { \ + { \ cudaError_t internal_cuda_status = (call); \ if (cudaSuccess != internal_cuda_status) { \ - cudf::detail::throw_cuda_error(internal_cuda_status, __FILE__, __LINE__); \ + /* Clear the last error so it does not propagate.*/ \ + cudaGetLastError(); \ + jthrowable jt = cudf::jni::cuda_exception(env, internal_cuda_status); \ + if (jt != NULL) { \ + env->Throw(jt); \ + } \ return ret_val; \ } \ - } while (0) - -#define JNI_CUDA_CHECK(env, cuda_status) \ - do { \ - if (cudaSuccess != cuda_status) { \ - cudf::detail::throw_cuda_error(cuda_status, __FILE__, __LINE__); \ - } \ - } while (0) + } #define JNI_NULL_CHECK(env, obj, error_msg, ret_val) \ { \ @@ -812,12 +831,6 @@ class native_jstringArray { std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what()); \ JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ } \ - catch (const cudf::cuda_error &e) { \ - JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, false, ret_val); \ - } \ - catch (const cudf::fatal_cuda_error &e) { \ - JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, true, ret_val); \ - } \ catch (const std::exception &e) { \ /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \ JNI_CHECK_THROW_NEW(env, class_name, e.what(), ret_val); \ diff --git a/java/src/main/native/src/CudaJni.cpp b/java/src/main/native/src/CudaJni.cpp index 3af58605864..9862c3bface 100644 --- a/java/src/main/native/src/CudaJni.cpp +++ b/java/src/main/native/src/CudaJni.cpp @@ -41,23 +41,19 @@ void set_cudf_device(int device) { * is using the same device. */ void auto_set_device(JNIEnv *env) { - try { - if (Cudf_device != cudaInvalidDeviceId) { - if (Thread_device != Cudf_device) { - JNI_CUDA_CHECK(env, cudaSetDevice(Cudf_device)); - Thread_device = Cudf_device; - } + if (Cudf_device != cudaInvalidDeviceId) { + if (Thread_device != Cudf_device) { + cudaError_t cuda_status = cudaSetDevice(Cudf_device); + jni_cuda_check(env, cuda_status); + Thread_device = Cudf_device; } } - CATCH_STD(env, ); } /** Fills all the bytes in the buffer 'buf' with 'value'. */ void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value) { - try { - JNI_CUDA_CHECK(env, cudaMemsetAsync((void *)buf.data(), value, buf.size())); - } - CATCH_STD(env, ); + cudaError_t cuda_status = cudaMemsetAsync((void *)buf.data(), value, buf.size()); + jni_cuda_check(env, cuda_status); } } // namespace jni diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 41f1c23ac11..ce3e6ffb285 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -327,9 +327,11 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j jstring jpath, jlong pool_size) { try { // make sure the CUDA device is setup in the context - JNI_CUDA_CHECK(env, cudaFree(0)); + cudaError_t cuda_status = cudaFree(0); + cudf::jni::jni_cuda_check(env, cuda_status); int device_id; - JNI_CUDA_CHECK(env, cudaGetDevice(&device_id)); + cuda_status = cudaGetDevice(&device_id); + cudf::jni::jni_cuda_check(env, cuda_status); bool use_pool_alloc = allocation_mode & 1; bool use_managed_mem = allocation_mode & 2; From 56d00d71ab8fa6e166a34da3026e6043c68c4c15 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Wed, 13 Apr 2022 17:38:05 +0800 Subject: [PATCH 09/11] revert JNI --- java/src/test/java/ai/rapids/cudf/CudaTest.java | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/CudaTest.java b/java/src/test/java/ai/rapids/cudf/CudaTest.java index f5af2b02fc5..8905c2edd56 100644 --- a/java/src/test/java/ai/rapids/cudf/CudaTest.java +++ b/java/src/test/java/ai/rapids/cudf/CudaTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; public class CudaTest { @@ -32,17 +32,4 @@ public void testGetCudaRuntimeInfo() { assertEquals(Cuda.getNativeComputeMode(), Cuda.getComputeMode().nativeId); } - @Test - public void testCudaException() { - assertThrows(CudaException.class, () -> { - try { - Cuda.memset(Long.MAX_VALUE, (byte) 0, 1024); - } catch (CudaException ex) { - assertEquals(CudaException.CudaError.cudaErrorInvalidValue, ex.cudaError); - assertFalse(ex.isFatal()); - throw ex; - } - } - ); - } } From 2e5844cdf634f6d9f742f7669c33bba6f2a885c7 Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Thu, 14 Apr 2022 09:55:06 +0800 Subject: [PATCH 10/11] Update cpp/include/cudf/utilities/error.hpp Co-authored-by: Jake Hemstad --- cpp/include/cudf/utilities/error.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp index 07874ae6ece..9cff693e375 100644 --- a/cpp/include/cudf/utilities/error.hpp +++ b/cpp/include/cudf/utilities/error.hpp @@ -121,7 +121,7 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l auto const msg = std::string{"CUDA error encountered at: " + std::string{file} + ":" + std::to_string(line) + ": " + std::to_string(error) + " " + cudaGetErrorName(error) + " " + cudaGetErrorString(error)}; - // Calls cudaDeviceSynchronize to make sure that there is no other asynchronize error occurs + // Call cudaDeviceSynchronize to ensure `last` did not result from an asynchronous error. // between two calls. if (error == last && last == cudaDeviceSynchronize()) { throw fatal_cuda_error{"Fatal " + msg, error}; From 53156c986eb84140571c9e9b5a8331545d865a0d Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Thu, 14 Apr 2022 10:01:17 +0800 Subject: [PATCH 11/11] fix --- cpp/include/cudf/utilities/error.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp index 9cff693e375..8f6190bbaf7 100644 --- a/cpp/include/cudf/utilities/error.hpp +++ b/cpp/include/cudf/utilities/error.hpp @@ -121,7 +121,7 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l auto const msg = std::string{"CUDA error encountered at: " + std::string{file} + ":" + std::to_string(line) + ": " + std::to_string(error) + " " + cudaGetErrorName(error) + " " + cudaGetErrorString(error)}; - // Call cudaDeviceSynchronize to ensure `last` did not result from an asynchronous error. + // Call cudaDeviceSynchronize to ensure `last` did not result from an asynchronous error. // between two calls. if (error == last && last == cudaDeviceSynchronize()) { throw fatal_cuda_error{"Fatal " + msg, error};