From 7d2f1b24bc46fbbabfca1de78746177b5dd0dbdc Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Thu, 31 Mar 2022 16:12:25 +0800 Subject: [PATCH 01/26] init Signed-off-by: sperlingxx --- .../java/ai/rapids/cudf/CudaException.java | 146 +++++++++++++++++- java/src/main/native/include/jni_utils.hpp | 37 +++-- java/src/main/native/src/CudaJni.cpp | 6 +- java/src/main/native/src/RmmJni.cpp | 4 +- 4 files changed, 174 insertions(+), 19 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/CudaException.java b/java/src/main/java/ai/rapids/cudf/CudaException.java index 2d862b47ef8..a515c05cf19 100755 --- a/java/src/main/java/ai/rapids/cudf/CudaException.java +++ b/java/src/main/java/ai/rapids/cudf/CudaException.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,9 +30,153 @@ public class CudaException extends RuntimeException { CudaException(String message) { super(message); + this.cudaError = extractCudaError(message); } CudaException(String message, Throwable cause) { super(message, cause); + this.cudaError = extractCudaError(message); + } + + public final CudaError cudaError; + + public enum CudaError { + cudaErrorInvalidValue(1), + cudaErrorMemoryAllocation(2), + cudaErrorInitializationError(3), + cudaErrorCudartUnloading(4), + cudaErrorProfilerDisabled(5), + cudaErrorProfilerNotInitialized(6), + cudaErrorProfilerAlreadyStarted(7), + cudaErrorProfilerAlreadyStopped(8), + cudaErrorInvalidConfiguration(9), + cudaErrorInvalidPitchValue(12), + cudaErrorInvalidSymbol(13), + cudaErrorInvalidHostPointer(16), + cudaErrorInvalidDevicePointer(17), + cudaErrorInvalidTexture(18), + cudaErrorInvalidTextureBinding(19), + cudaErrorInvalidChannelDescriptor(20), + cudaErrorInvalidMemcpyDirection(21), + cudaErrorAddressOfConstant(22), + cudaErrorTextureFetchFailed(23), + cudaErrorTextureNotBound(24), + cudaErrorSynchronizationError(25), + cudaErrorInvalidFilterSetting(26), + cudaErrorInvalidNormSetting(27), + cudaErrorMixedDeviceExecution(28), + cudaErrorNotYetImplemented(31), + cudaErrorMemoryValueTooLarge(32), + cudaErrorStubLibrary(34), + cudaErrorInsufficientDriver(35), + cudaErrorCallRequiresNewerDriver(36), + cudaErrorInvalidSurface(37), + cudaErrorDuplicateVariableName(43), + cudaErrorDuplicateTextureName(44), + cudaErrorDuplicateSurfaceName(45), + cudaErrorDevicesUnavailable(46), + cudaErrorIncompatibleDriverContext(49), + cudaErrorMissingConfiguration(52), + cudaErrorPriorLaunchFailure(53), + cudaErrorLaunchMaxDepthExceeded(65), + cudaErrorLaunchFileScopedTex(66), + cudaErrorLaunchFileScopedSurf(67), + cudaErrorSyncDepthExceeded(68), + cudaErrorLaunchPendingCountExceeded(69), + cudaErrorInvalidDeviceFunction(98), + cudaErrorNoDevice(100), + cudaErrorInvalidDevice(101), + cudaErrorDeviceNotLicensed(102), + cudaErrorSoftwareValidityNotEstablished(103), + cudaErrorStartupFailure(127), + cudaErrorInvalidKernelImage(200), + cudaErrorDeviceUninitialized(201), + cudaErrorMapBufferObjectFailed(205), + cudaErrorUnmapBufferObjectFailed(206), + cudaErrorArrayIsMapped(207), + cudaErrorAlreadyMapped(208), + cudaErrorNoKernelImageForDevice(209), + cudaErrorAlreadyAcquired(210), + cudaErrorNotMapped(211), + cudaErrorNotMappedAsArray(212), + cudaErrorNotMappedAsPointer(213), + cudaErrorECCUncorrectable(214), + cudaErrorUnsupportedLimit(215), + cudaErrorDeviceAlreadyInUse(216), + cudaErrorPeerAccessUnsupported(217), + cudaErrorInvalidPtx(218), + cudaErrorInvalidGraphicsContext(219), + cudaErrorNvlinkUncorrectable(220), + cudaErrorJitCompilerNotFound(221), + cudaErrorUnsupportedPtxVersion(222), + cudaErrorJitCompilationDisabled(223), + cudaErrorUnsupportedExecAffinity(224), + cudaErrorInvalidSource(300), + cudaErrorFileNotFound(301), + cudaErrorSharedObjectSymbolNotFound(302), + cudaErrorSharedObjectInitFailed(303), + cudaErrorOperatingSystem(304), + cudaErrorInvalidResourceHandle(400), + cudaErrorIllegalState(401), + cudaErrorSymbolNotFound(500), + cudaErrorNotReady(600), + cudaErrorIllegalAddress(700), + cudaErrorLaunchOutOfResources(701), + cudaErrorLaunchTimeout(702), + cudaErrorLaunchIncompatibleTexturing(703), + cudaErrorPeerAccessAlreadyEnabled(704), + cudaErrorPeerAccessNotEnabled(705), + cudaErrorSetOnActiveProcess(708), + cudaErrorContextIsDestroyed(709), + cudaErrorAssert(710), + cudaErrorTooManyPeers(711), + cudaErrorHostMemoryAlreadyRegistered(712), + cudaErrorHostMemoryNotRegistered(713), + cudaErrorHardwareStackError(714), + cudaErrorIllegalInstruction(715), + cudaErrorMisalignedAddress(716), + cudaErrorInvalidAddressSpace(717), + cudaErrorInvalidPc(718), + cudaErrorLaunchFailure(719), + cudaErrorCooperativeLaunchTooLarge(720), + cudaErrorNotPermitted(800), + cudaErrorNotSupported(801), + cudaErrorSystemNotReady(802), + cudaErrorSystemDriverMismatch(803), + cudaErrorCompatNotSupportedOnDevice(804), + cudaErrorMpsConnectionFailed(805), + cudaErrorMpsRpcFailure(806), + cudaErrorMpsServerNotReady(807), + cudaErrorMpsMaxClientsReached(808), + cudaErrorMpsMaxConnectionsReached(809), + cudaErrorStreamCaptureUnsupported(900), + cudaErrorStreamCaptureInvalidated(901), + cudaErrorStreamCaptureMerge(902), + cudaErrorStreamCaptureUnmatched(903), + cudaErrorStreamCaptureUnjoined(904), + cudaErrorStreamCaptureIsolation(905), + cudaErrorStreamCaptureImplicit(906), + cudaErrorCapturedEvent(907), + cudaErrorStreamCaptureWrongThread(908), + cudaErrorTimeout(909), + cudaErrorGraphExecUpdateFailure(910), + cudaErrorExternalDevice(911), + cudaErrorUnknown(999), + cudaErrorApiFailureBase(10000); + + final int code; + + CudaError(int errorCode) { + this.code = errorCode; + } + } + + private static CudaError extractCudaError(String message) { + for (String segment : message.split(" ")) { + if (segment.startsWith("cudaError")) { + return CudaError.valueOf(segment); + } + } + throw new CudfException("invalid CUDA error message: " + message); } } diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index a45716a89b3..a83f12fdbed 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -736,7 +736,9 @@ class native_jstringArray { /** * @brief create a cuda exception from a given cudaError_t */ -inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowable cause = NULL) { +inline jthrowable cuda_exception(JNIEnv *const env, + const char* file, unsigned int line, + cudaError_t status, jthrowable cause=NULL) { jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS); if (ex_class == NULL) { return NULL; @@ -747,26 +749,30 @@ inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowab return NULL; } - jstring msg = env->NewStringUTF(cudaGetErrorString(status)); + std::string n_msg {"CUDA error encountered at: " + std::string{file} + ":" + + std::to_string(line) + ": " + std::to_string(error) + " " + + cudaGetErrorName(error) + " " + cudaGetErrorString(error)}; + jstring j_msg = env->NewStringUTF(cudaGetErrorString(n_msg)); if (msg == NULL) { return NULL; } - jobject ret = env->NewObject(ex_class, ctor_id, msg, cause); + jobject ret = env->NewObject(ex_class, ctor_id, j_msg, cause); return (jthrowable)ret; } -inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { - if (cudaSuccess != cuda_status) { - // Clear the last error so it does not propagate. - cudaGetLastError(); - jthrowable jt = cuda_exception(env, cuda_status); - if (jt != NULL) { - env->Throw(jt); - throw jni_exception("CUDA ERROR"); +#define JNI_CUDA_CHECK(env, cuda_status) + { + if (cudaSuccess != cuda_status) { + /* Clear the last error so it does not propagate.*/ \ + cudaGetLastError(); + jthrowable jt = cudf::jni::cuda_exception(env, __FILE__, __LINE__, cuda_status); + if (jt != NULL) { + env->Throw(jt); + throw jni_exception("CUDA ERROR"); + } } } -} } // namespace jni } // namespace cudf @@ -796,7 +802,7 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { if (cudaSuccess != internal_cuda_status) { \ /* Clear the last error so it does not propagate.*/ \ cudaGetLastError(); \ - jthrowable jt = cudf::jni::cuda_exception(env, internal_cuda_status); \ + jthrowable jt = cudf::jni::cuda_exception(env, __FILE__, __LINE__, internal_cuda_status); \ if (jt != NULL) { \ env->Throw(jt); \ } \ @@ -831,6 +837,11 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what()); \ JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ } \ + catch (const cudf::cuda_error &e) { \ + auto what = \ + std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what()); \ + JNI_CHECK_THROW_NEW(env, cudf::jni::CUDA_ERROR_CLASS, what.c_str(), ret_val); \ + } \ catch (const std::exception &e) { \ /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \ JNI_CHECK_THROW_NEW(env, class_name, e.what(), ret_val); \ diff --git a/java/src/main/native/src/CudaJni.cpp b/java/src/main/native/src/CudaJni.cpp index e548b4ce65c..03d0cca0c1a 100644 --- a/java/src/main/native/src/CudaJni.cpp +++ b/java/src/main/native/src/CudaJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,7 +44,7 @@ void auto_set_device(JNIEnv *env) { if (Cudf_device != cudaInvalidDeviceId) { if (Thread_device != Cudf_device) { cudaError_t cuda_status = cudaSetDevice(Cudf_device); - jni_cuda_check(env, cuda_status); + JNI_CUDA_CHECK(env, cuda_status); Thread_device = Cudf_device; } } @@ -53,7 +53,7 @@ void auto_set_device(JNIEnv *env) { /** Fills all the bytes in the buffer 'buf' with 'value'. */ void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value) { cudaError_t cuda_status = cudaMemsetAsync((void *)buf.data(), value, buf.size()); - jni_cuda_check(env, cuda_status); + JNI_CUDA_CHECK(env, cuda_status); } } // namespace jni diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index ce3e6ffb285..5d43a2b58f9 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -328,10 +328,10 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j try { // make sure the CUDA device is setup in the context cudaError_t cuda_status = cudaFree(0); - cudf::jni::jni_cuda_check(env, cuda_status); + JNI_CUDA_CHECK(env, cuda_status); int device_id; cuda_status = cudaGetDevice(&device_id); - cudf::jni::jni_cuda_check(env, cuda_status); + JNI_CUDA_CHECK(env, cuda_status); bool use_pool_alloc = allocation_mode & 1; bool use_managed_mem = allocation_mode & 2; From 2755b448ff3cd7bc9c2518b4be4c7f2ab8b21652 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Thu, 31 Mar 2022 16:20:29 +0800 Subject: [PATCH 02/26] fix --- java/src/main/native/include/jni_utils.hpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index a83f12fdbed..15d2d32aa95 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -749,14 +749,17 @@ inline jthrowable cuda_exception(JNIEnv *const env, return NULL; } - std::string n_msg {"CUDA error encountered at: " + std::string{file} + ":" + - std::to_string(line) + ": " + std::to_string(error) + " " + - cudaGetErrorName(error) + " " + cudaGetErrorString(error)}; - jstring j_msg = env->NewStringUTF(cudaGetErrorString(n_msg)); - if (msg == NULL) { + std::string err_name = cudaGetErrorName(status); + std::string err_string = cudaGetErrorString(status); + if (err_name == NULL) { return NULL; } + std::string n_msg = "CUDA error encountered at: " + std::string{file} + ":" + + std::to_string(line) + ": " + std::to_string(error) + " " + + err_name + " " + err_string; + jstring j_msg = env->NewStringUTF(n_msg); + jobject ret = env->NewObject(ex_class, ctor_id, j_msg, cause); return (jthrowable)ret; } From d0d7f30366305199a457f4aefa1cf57e79114539 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Thu, 31 Mar 2022 17:19:02 +0800 Subject: [PATCH 03/26] fix --- java/src/main/native/include/jni_utils.hpp | 45 +++++++++++----------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 15d2d32aa95..d8fe1255790 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -736,9 +736,8 @@ class native_jstringArray { /** * @brief create a cuda exception from a given cudaError_t */ -inline jthrowable cuda_exception(JNIEnv *const env, - const char* file, unsigned int line, - cudaError_t status, jthrowable cause=NULL) { +inline jthrowable cuda_exception(JNIEnv *const env, const char *file, unsigned int line, + cudaError_t status, jthrowable cause = NULL) { jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS); if (ex_class == NULL) { return NULL; @@ -749,34 +748,21 @@ inline jthrowable cuda_exception(JNIEnv *const env, return NULL; } - std::string err_name = cudaGetErrorName(status); - std::string err_string = cudaGetErrorString(status); - if (err_name == NULL) { + const char *err_name = cudaGetErrorName(status); + if (err_name == nullptr) { return NULL; } + const char *err_string = cudaGetErrorString(status); std::string n_msg = "CUDA error encountered at: " + std::string{file} + ":" + - std::to_string(line) + ": " + std::to_string(error) + " " + - err_name + " " + err_string; - jstring j_msg = env->NewStringUTF(n_msg); + std::to_string(line) + ": " + std::to_string(status) + " " + err_name + " " + + err_string; + jstring j_msg = env->NewStringUTF(n_msg.c_str()); jobject ret = env->NewObject(ex_class, ctor_id, j_msg, cause); return (jthrowable)ret; } -#define JNI_CUDA_CHECK(env, cuda_status) - { - if (cudaSuccess != cuda_status) { - /* Clear the last error so it does not propagate.*/ \ - cudaGetLastError(); - jthrowable jt = cudf::jni::cuda_exception(env, __FILE__, __LINE__, cuda_status); - if (jt != NULL) { - env->Throw(jt); - throw jni_exception("CUDA ERROR"); - } - } - } - } // namespace jni } // namespace cudf @@ -805,7 +791,7 @@ inline jthrowable cuda_exception(JNIEnv *const env, if (cudaSuccess != internal_cuda_status) { \ /* Clear the last error so it does not propagate.*/ \ cudaGetLastError(); \ - jthrowable jt = cudf::jni::cuda_exception(env, __FILE__, __LINE__, internal_cuda_status); \ + jthrowable jt = cudf::jni::cuda_exception(env, __FILE__, __LINE__, internal_cuda_status); \ if (jt != NULL) { \ env->Throw(jt); \ } \ @@ -813,6 +799,19 @@ inline jthrowable cuda_exception(JNIEnv *const env, } \ } +#define JNI_CUDA_CHECK(env, cuda_status) \ + { \ + if (cudaSuccess != cuda_status) { \ + /* Clear the last error so it does not propagate.*/ \ + cudaGetLastError(); \ + jthrowable jt = cudf::jni::cuda_exception(env, __FILE__, __LINE__, cuda_status); \ + if (jt != NULL) { \ + env->Throw(jt); \ + throw cudf::jni::jni_exception("CUDA ERROR"); \ + } \ + } \ + } + #define JNI_NULL_CHECK(env, obj, error_msg, ret_val) \ { \ if ((obj) == 0) { \ From 8cc58a47824e77cf5fcec4ea23f0fb0f7fa63ed6 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Thu, 31 Mar 2022 17:36:51 +0800 Subject: [PATCH 04/26] update --- java/src/main/native/include/jni_utils.hpp | 5 ++--- java/src/test/java/ai/rapids/cudf/CudaTest.java | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index d8fe1255790..7db745088b5 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -840,9 +840,8 @@ inline jthrowable cuda_exception(JNIEnv *const env, const char *file, unsigned i JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ } \ catch (const cudf::cuda_error &e) { \ - auto what = \ - std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what()); \ - JNI_CHECK_THROW_NEW(env, cudf::jni::CUDA_ERROR_CLASS, what.c_str(), ret_val); \ + /* For CUDA errors, the specific error code will be extracted from error message. */ \ \ + JNI_CHECK_THROW_NEW(env, cudf::jni::CUDA_ERROR_CLASS, e.what(), ret_val); \ } \ catch (const std::exception &e) { \ /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \ diff --git a/java/src/test/java/ai/rapids/cudf/CudaTest.java b/java/src/test/java/ai/rapids/cudf/CudaTest.java index 8905c2edd56..437ae60948f 100644 --- a/java/src/test/java/ai/rapids/cudf/CudaTest.java +++ b/java/src/test/java/ai/rapids/cudf/CudaTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; public class CudaTest { @@ -32,4 +33,18 @@ public void testGetCudaRuntimeInfo() { assertEquals(Cuda.getNativeComputeMode(), Cuda.getComputeMode().nativeId); } + @Test + public void testCudaException() { + assertThrows( + CudaException.class, + () -> { + try { + Cuda.asyncMemset(1234567890L, (byte) 0, 0); + } catch (CudaException ex) { + assertEquals(CudaException.CudaError.cudaErrorIllegalAddress, ex.cudaError); + throw ex; + } + } + ); + } } From eced2c386832d79c7a6be94e24e0a7b5bc39da6b Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Thu, 31 Mar 2022 17:39:55 +0800 Subject: [PATCH 05/26] fix --- java/src/main/native/include/jni_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 7db745088b5..294432253be 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -840,7 +840,7 @@ inline jthrowable cuda_exception(JNIEnv *const env, const char *file, unsigned i JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ } \ catch (const cudf::cuda_error &e) { \ - /* For CUDA errors, the specific error code will be extracted from error message. */ \ \ + /* For CUDA errors, the specific error code will be extracted from error message. */ \ JNI_CHECK_THROW_NEW(env, cudf::jni::CUDA_ERROR_CLASS, e.what(), ret_val); \ } \ catch (const std::exception &e) { \ From fa1ae8adf072e6d70ea9f25fa3b8c5ec7009c396 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Thu, 31 Mar 2022 19:15:28 +0800 Subject: [PATCH 06/26] update --- .../java/ai/rapids/cudf/CudaException.java | 29 +++++++++++++++++++ java/src/main/native/include/jni_utils.hpp | 2 ++ .../test/java/ai/rapids/cudf/CudaTest.java | 12 ++++---- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/CudaException.java b/java/src/main/java/ai/rapids/cudf/CudaException.java index a515c05cf19..23722600ec9 100755 --- a/java/src/main/java/ai/rapids/cudf/CudaException.java +++ b/java/src/main/java/ai/rapids/cudf/CudaException.java @@ -15,6 +15,9 @@ */ package ai.rapids.cudf; +import java.util.HashSet; +import java.util.Set; + /** * Exception from the cuda language/library. Be aware that because of how cuda does asynchronous * processing exceptions from cuda can be thrown by method calls that did not cause the exception @@ -40,6 +43,9 @@ public class CudaException extends RuntimeException { public final CudaError cudaError; + /** + * The Java mirror of cudaError, which facilities the tracking of CUDA errors in JVM. + */ public enum CudaError { cudaErrorInvalidValue(1), cudaErrorMemoryAllocation(2), @@ -166,9 +172,32 @@ public enum CudaError { final int code; + private static final Set stickyErrors = new HashSet(){{ + add(CudaError.cudaErrorIllegalAddress); + add(CudaError.cudaErrorLaunchTimeout); + add(CudaError.cudaErrorHardwareStackError); + add(CudaError.cudaErrorIllegalInstruction); + add(CudaError.cudaErrorMisalignedAddress); + add(CudaError.cudaErrorInvalidAddressSpace); + add(CudaError.cudaErrorInvalidPc); + add(CudaError.cudaErrorLaunchFailure); + add(CudaError.cudaErrorExternalDevice); + add(CudaError.cudaErrorUnknown); + }}; + CudaError(int errorCode) { this.code = errorCode; } + + /** + * Returns whether this CudaError is sticky or not. + * + * Sticky errors leave the process in an inconsistent state and any further CUDA work will return + * the same error. To continue using CUDA, the process must be terminated and relaunched. + */ + public boolean isSticky() { + return stickyErrors.contains(this); + } } private static CudaError extractCudaError(String message) { diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 294432253be..f599dd02e7d 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -754,6 +754,8 @@ inline jthrowable cuda_exception(JNIEnv *const env, const char *file, unsigned i } const char *err_string = cudaGetErrorString(status); + // Build the error message in the format of cudf::cuda_error, so that cudf::jni::CUDA_ERROR_CLASS + // can parse both of them. std::string n_msg = "CUDA error encountered at: " + std::string{file} + ":" + std::to_string(line) + ": " + std::to_string(status) + " " + err_name + " " + err_string; diff --git a/java/src/test/java/ai/rapids/cudf/CudaTest.java b/java/src/test/java/ai/rapids/cudf/CudaTest.java index 437ae60948f..c93f6b4ee0c 100644 --- a/java/src/test/java/ai/rapids/cudf/CudaTest.java +++ b/java/src/test/java/ai/rapids/cudf/CudaTest.java @@ -18,8 +18,7 @@ import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.*; public class CudaTest { @@ -35,13 +34,12 @@ public void testGetCudaRuntimeInfo() { @Test public void testCudaException() { - assertThrows( - CudaException.class, - () -> { + assertThrows(CudaException.class, () -> { try { - Cuda.asyncMemset(1234567890L, (byte) 0, 0); + Cuda.memset(Long.MAX_VALUE, (byte) 0, 1024); } catch (CudaException ex) { - assertEquals(CudaException.CudaError.cudaErrorIllegalAddress, ex.cudaError); + assertEquals(CudaException.CudaError.cudaErrorInvalidValue, ex.cudaError); + assertFalse(ex.cudaError.isSticky()); throw ex; } } From 877a165dea96510808a4e643543bceb3b0107611 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Fri, 1 Apr 2022 12:00:11 +0800 Subject: [PATCH 07/26] refine --- .../java/ai/rapids/cudf/CudaException.java | 26 +++++++++---------- java/src/main/native/include/jni_utils.hpp | 4 +-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/CudaException.java b/java/src/main/java/ai/rapids/cudf/CudaException.java index 23722600ec9..2b0308130eb 100755 --- a/java/src/main/java/ai/rapids/cudf/CudaException.java +++ b/java/src/main/java/ai/rapids/cudf/CudaException.java @@ -15,7 +15,7 @@ */ package ai.rapids.cudf; -import java.util.HashSet; +import java.util.EnumSet; import java.util.Set; /** @@ -172,18 +172,18 @@ public enum CudaError { final int code; - private static final Set stickyErrors = new HashSet(){{ - add(CudaError.cudaErrorIllegalAddress); - add(CudaError.cudaErrorLaunchTimeout); - add(CudaError.cudaErrorHardwareStackError); - add(CudaError.cudaErrorIllegalInstruction); - add(CudaError.cudaErrorMisalignedAddress); - add(CudaError.cudaErrorInvalidAddressSpace); - add(CudaError.cudaErrorInvalidPc); - add(CudaError.cudaErrorLaunchFailure); - add(CudaError.cudaErrorExternalDevice); - add(CudaError.cudaErrorUnknown); - }}; + private static final Set stickyErrors = EnumSet.of( + CudaError.cudaErrorIllegalAddress, + CudaError.cudaErrorLaunchTimeout, + CudaError.cudaErrorHardwareStackError, + CudaError.cudaErrorIllegalInstruction, + CudaError.cudaErrorMisalignedAddress, + CudaError.cudaErrorInvalidAddressSpace, + CudaError.cudaErrorInvalidPc, + CudaError.cudaErrorLaunchFailure, + CudaError.cudaErrorExternalDevice, + CudaError.cudaErrorUnknown + ); CudaError(int errorCode) { this.code = errorCode; diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index f599dd02e7d..89cf87d3eb0 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -802,7 +802,7 @@ inline jthrowable cuda_exception(JNIEnv *const env, const char *file, unsigned i } #define JNI_CUDA_CHECK(env, cuda_status) \ - { \ + do { \ if (cudaSuccess != cuda_status) { \ /* Clear the last error so it does not propagate.*/ \ cudaGetLastError(); \ @@ -812,7 +812,7 @@ inline jthrowable cuda_exception(JNIEnv *const env, const char *file, unsigned i throw cudf::jni::jni_exception("CUDA ERROR"); \ } \ } \ - } + } while(0) #define JNI_NULL_CHECK(env, obj, error_msg, ret_val) \ { \ From 3fb086fcdc68d6c9a2929cd6dc734161f19e8131 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Fri, 1 Apr 2022 12:17:43 +0800 Subject: [PATCH 08/26] fix --- java/src/main/native/include/jni_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 89cf87d3eb0..71b08604dbd 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -812,7 +812,7 @@ inline jthrowable cuda_exception(JNIEnv *const env, const char *file, unsigned i throw cudf::jni::jni_exception("CUDA ERROR"); \ } \ } \ - } while(0) + } while (0) #define JNI_NULL_CHECK(env, obj, error_msg, ret_val) \ { \ From 71238e10cc2c0977205b1281f49ae11d142c5686 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Tue, 12 Apr 2022 16:19:08 +0800 Subject: [PATCH 09/26] update Signed-off-by: sperlingxx --- .../java/ai/rapids/cudf/CudaException.java | 58 +++++++------------ java/src/main/native/include/jni_utils.hpp | 23 +++++++- .../test/java/ai/rapids/cudf/CudaTest.java | 2 +- 3 files changed, 43 insertions(+), 40 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/CudaException.java b/java/src/main/java/ai/rapids/cudf/CudaException.java index 2b0308130eb..2cad2c09979 100755 --- a/java/src/main/java/ai/rapids/cudf/CudaException.java +++ b/java/src/main/java/ai/rapids/cudf/CudaException.java @@ -15,9 +15,6 @@ */ package ai.rapids.cudf; -import java.util.EnumSet; -import java.util.Set; - /** * Exception from the cuda language/library. Be aware that because of how cuda does asynchronous * processing exceptions from cuda can be thrown by method calls that did not cause the exception @@ -33,16 +30,36 @@ public class CudaException extends RuntimeException { CudaException(String message) { super(message); - this.cudaError = extractCudaError(message); + isFatal = message.startsWith("Fatal"); + cudaError = extractCudaError(message); } CudaException(String message, Throwable cause) { super(message, cause); - this.cudaError = extractCudaError(message); + isFatal = message.startsWith("Fatal"); + cudaError = extractCudaError(message); + } + + /** + * Returns whether this CudaError is fatal or not. + * + * Fatal errors leave the process in an inconsistent state and any further CUDA work will return + * the same error. To continue using CUDA, the process must be terminated and relaunched. + */ + public boolean isFatal() { + return isFatal; } public final CudaError cudaError; + private final boolean isFatal; + + private static CudaError extractCudaError(String msg) { + int startIdx = msg.indexOf('['); + int endIdx = msg.indexOf(']'); + return CudaError.valueOf(msg.substring(startIdx + 1, endIdx)); + } + /** * The Java mirror of cudaError, which facilities the tracking of CUDA errors in JVM. */ @@ -172,40 +189,9 @@ public enum CudaError { final int code; - private static final Set stickyErrors = EnumSet.of( - CudaError.cudaErrorIllegalAddress, - CudaError.cudaErrorLaunchTimeout, - CudaError.cudaErrorHardwareStackError, - CudaError.cudaErrorIllegalInstruction, - CudaError.cudaErrorMisalignedAddress, - CudaError.cudaErrorInvalidAddressSpace, - CudaError.cudaErrorInvalidPc, - CudaError.cudaErrorLaunchFailure, - CudaError.cudaErrorExternalDevice, - CudaError.cudaErrorUnknown - ); - CudaError(int errorCode) { this.code = errorCode; } - /** - * Returns whether this CudaError is sticky or not. - * - * Sticky errors leave the process in an inconsistent state and any further CUDA work will return - * the same error. To continue using CUDA, the process must be terminated and relaunched. - */ - public boolean isSticky() { - return stickyErrors.contains(this); - } - } - - private static CudaError extractCudaError(String message) { - for (String segment : message.split(" ")) { - if (segment.startsWith("cudaError")) { - return CudaError.valueOf(segment); - } - } - throw new CudfException("invalid CUDA error message: " + message); } } diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 89cf87d3eb0..f3d76e2cfd1 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -787,6 +787,21 @@ inline jthrowable cuda_exception(JNIEnv *const env, const char *file, unsigned i JNI_THROW_NEW(env, class_name, message, ret_val) \ } +// Throw a new exception only if one is not pending then always return with the specified value +#define JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, fatal, ret_val) \ + { \ + if (env->ExceptionOccurred()) { \ + return ret_val; \ + } \ + jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS); \ + const char* n_err_name = cudaGetErrorName(e.error_code()); \ + auto what = \ + (fatal ? "Fatal CUDA ERROR [" : "CUDA ERROR [") + \ + n_err_name + "]: " + (e.what() == nullptr ? "" : e.what()); \ + env->ThrowNew(ex_class, message, what.c_str()); \ + return ret_val; \ + } + #define JNI_CUDA_TRY(env, ret_val, call) \ { \ cudaError_t internal_cuda_status = (call); \ @@ -841,9 +856,11 @@ inline jthrowable cuda_exception(JNIEnv *const env, const char *file, unsigned i std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what()); \ JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ } \ - catch (const cudf::cuda_error &e) { \ - /* For CUDA errors, the specific error code will be extracted from error message. */ \ - JNI_CHECK_THROW_NEW(env, cudf::jni::CUDA_ERROR_CLASS, e.what(), ret_val); \ + catch (const cudf::fatal_cuda_error &e) { \ + JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, true, ret_val); \ + } \ + catch (const cudf::cudart_error &e) { \ + JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, false, ret_val); \ } \ catch (const std::exception &e) { \ /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \ diff --git a/java/src/test/java/ai/rapids/cudf/CudaTest.java b/java/src/test/java/ai/rapids/cudf/CudaTest.java index c93f6b4ee0c..f5af2b02fc5 100644 --- a/java/src/test/java/ai/rapids/cudf/CudaTest.java +++ b/java/src/test/java/ai/rapids/cudf/CudaTest.java @@ -39,7 +39,7 @@ public void testCudaException() { Cuda.memset(Long.MAX_VALUE, (byte) 0, 1024); } catch (CudaException ex) { assertEquals(CudaException.CudaError.cudaErrorInvalidValue, ex.cudaError); - assertFalse(ex.cudaError.isSticky()); + assertFalse(ex.isFatal()); throw ex; } } From c242002d2501e7aa0f0dbccfe1f2a96a30c2ae22 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Wed, 13 Apr 2022 17:43:22 +0800 Subject: [PATCH 10/26] update --- java/src/main/native/include/jni_utils.hpp | 70 ++++------------------ java/src/main/native/src/CudaJni.cpp | 46 +++++++++++--- java/src/main/native/src/RmmJni.cpp | 6 +- 3 files changed, 54 insertions(+), 68 deletions(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 662b787ce51..ebaa1e90b5e 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -733,38 +733,6 @@ class native_jstringArray { } }; -/** - * @brief create a cuda exception from a given cudaError_t - */ -inline jthrowable cuda_exception(JNIEnv *const env, const char *file, unsigned int line, - cudaError_t status, jthrowable cause = NULL) { - jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS); - if (ex_class == NULL) { - return NULL; - } - jmethodID ctor_id = - env->GetMethodID(ex_class, "", "(Ljava/lang/String;Ljava/lang/Throwable;)V"); - if (ctor_id == NULL) { - return NULL; - } - - const char *err_name = cudaGetErrorName(status); - if (err_name == nullptr) { - return NULL; - } - const char *err_string = cudaGetErrorString(status); - - // Build the error message in the format of cudf::cuda_error, so that cudf::jni::CUDA_ERROR_CLASS - // can parse both of them. - std::string n_msg = "CUDA error encountered at: " + std::string{file} + ":" + - std::to_string(line) + ": " + std::to_string(status) + " " + err_name + " " + - err_string; - jstring j_msg = env->NewStringUTF(n_msg.c_str()); - - jobject ret = env->NewObject(ex_class, ctor_id, j_msg, cause); - return (jthrowable)ret; -} - } // namespace jni } // namespace cudf @@ -789,43 +757,31 @@ inline jthrowable cuda_exception(JNIEnv *const env, const char *file, unsigned i // Throw a new exception only if one is not pending then always return with the specified value #define JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, fatal, ret_val) \ - { \ + do { \ if (env->ExceptionOccurred()) { \ return ret_val; \ } \ jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS); \ - const char* n_err_name = cudaGetErrorName(e.error_code()); \ - auto what = \ - (fatal ? "Fatal CUDA ERROR [" : "CUDA ERROR [") + \ - n_err_name + "]: " + (e.what() == nullptr ? "" : e.what()); \ - env->ThrowNew(ex_class, message, what.c_str()); \ + const char *e_name = cudaGetErrorName(e.error_code()); \ + std::string what = std::string(fatal ? "Fatal CUDA ERROR [" : "CUDA ERROR [") + e_name + \ + "]: " + (e.what() == nullptr ? "" : e.what()); \ + env->ThrowNew(ex_class, what.c_str()); \ return ret_val; \ - } + } while (0) #define JNI_CUDA_TRY(env, ret_val, call) \ - { \ + do { \ cudaError_t internal_cuda_status = (call); \ if (cudaSuccess != internal_cuda_status) { \ - /* Clear the last error so it does not propagate.*/ \ - cudaGetLastError(); \ - jthrowable jt = cudf::jni::cuda_exception(env, __FILE__, __LINE__, internal_cuda_status); \ - if (jt != NULL) { \ - env->Throw(jt); \ - } \ + cudf::detail::throw_cuda_error(internal_cuda_status, __FILE__, __LINE__); \ return ret_val; \ } \ - } + } while (0) #define JNI_CUDA_CHECK(env, cuda_status) \ do { \ if (cudaSuccess != cuda_status) { \ - /* Clear the last error so it does not propagate.*/ \ - cudaGetLastError(); \ - jthrowable jt = cudf::jni::cuda_exception(env, __FILE__, __LINE__, cuda_status); \ - if (jt != NULL) { \ - env->Throw(jt); \ - throw cudf::jni::jni_exception("CUDA ERROR"); \ - } \ + cudf::detail::throw_cuda_error(cuda_status, __FILE__, __LINE__); \ } \ } while (0) @@ -856,12 +812,12 @@ inline jthrowable cuda_exception(JNIEnv *const env, const char *file, unsigned i std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what()); \ JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ } \ + catch (const cudf::cuda_error &e) { \ + JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, false, ret_val); \ + } \ catch (const cudf::fatal_cuda_error &e) { \ JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, true, ret_val); \ } \ - catch (const cudf::cudart_error &e) { \ - JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, false, ret_val); \ - } \ catch (const std::exception &e) { \ /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \ JNI_CHECK_THROW_NEW(env, class_name, e.what(), ret_val); \ diff --git a/java/src/main/native/src/CudaJni.cpp b/java/src/main/native/src/CudaJni.cpp index 03d0cca0c1a..3af58605864 100644 --- a/java/src/main/native/src/CudaJni.cpp +++ b/java/src/main/native/src/CudaJni.cpp @@ -41,19 +41,23 @@ void set_cudf_device(int device) { * is using the same device. */ void auto_set_device(JNIEnv *env) { - if (Cudf_device != cudaInvalidDeviceId) { - if (Thread_device != Cudf_device) { - cudaError_t cuda_status = cudaSetDevice(Cudf_device); - JNI_CUDA_CHECK(env, cuda_status); - Thread_device = Cudf_device; + try { + if (Cudf_device != cudaInvalidDeviceId) { + if (Thread_device != Cudf_device) { + JNI_CUDA_CHECK(env, cudaSetDevice(Cudf_device)); + Thread_device = Cudf_device; + } } } + CATCH_STD(env, ); } /** Fills all the bytes in the buffer 'buf' with 'value'. */ void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value) { - cudaError_t cuda_status = cudaMemsetAsync((void *)buf.data(), value, buf.size()); - JNI_CUDA_CHECK(env, cuda_status); + try { + JNI_CUDA_CHECK(env, cudaMemsetAsync((void *)buf.data(), value, buf.size())); + } + CATCH_STD(env, ); } } // namespace jni @@ -195,6 +199,34 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getNativeComputeMode(JNIEnv *env CATCH_STD(env, -2); } +JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMajor(JNIEnv *env, jclass) { + try { + cudf::jni::auto_set_device(env); + int device; + JNI_CUDA_TRY(env, -2, ::cudaGetDevice(&device)); + int attribute_value; + JNI_CUDA_TRY( + env, -2, + ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMajor, device)); + return attribute_value; + } + CATCH_STD(env, -2); +} + +JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMinor(JNIEnv *env, jclass) { + try { + cudf::jni::auto_set_device(env); + int device; + JNI_CUDA_TRY(env, -2, ::cudaGetDevice(&device)); + int attribute_value; + JNI_CUDA_TRY( + env, -2, + ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMinor, device)); + return attribute_value; + } + CATCH_STD(env, -2); +} + JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freeZero(JNIEnv *env, jclass) { try { cudf::jni::auto_set_device(env); diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 5d43a2b58f9..41f1c23ac11 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -327,11 +327,9 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j jstring jpath, jlong pool_size) { try { // make sure the CUDA device is setup in the context - cudaError_t cuda_status = cudaFree(0); - JNI_CUDA_CHECK(env, cuda_status); + JNI_CUDA_CHECK(env, cudaFree(0)); int device_id; - cuda_status = cudaGetDevice(&device_id); - JNI_CUDA_CHECK(env, cuda_status); + JNI_CUDA_CHECK(env, cudaGetDevice(&device_id)); bool use_pool_alloc = allocation_mode & 1; bool use_managed_mem = allocation_mode & 2; From 2e625af9aff5ffc28896c6a7a56c7c43dee1d4fc Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Thu, 14 Apr 2022 16:59:50 +0800 Subject: [PATCH 11/26] update --- .../java/ai/rapids/cudf/CudaException.java | 14 --------- .../ai/rapids/cudf/FatalCudaException.java | 31 +++++++++++++++++++ java/src/main/native/include/jni_utils.hpp | 18 +++++------ 3 files changed, 39 insertions(+), 24 deletions(-) create mode 100644 java/src/main/java/ai/rapids/cudf/FatalCudaException.java diff --git a/java/src/main/java/ai/rapids/cudf/CudaException.java b/java/src/main/java/ai/rapids/cudf/CudaException.java index 2cad2c09979..b1c21941f07 100755 --- a/java/src/main/java/ai/rapids/cudf/CudaException.java +++ b/java/src/main/java/ai/rapids/cudf/CudaException.java @@ -30,30 +30,16 @@ public class CudaException extends RuntimeException { CudaException(String message) { super(message); - isFatal = message.startsWith("Fatal"); cudaError = extractCudaError(message); } CudaException(String message, Throwable cause) { super(message, cause); - isFatal = message.startsWith("Fatal"); cudaError = extractCudaError(message); } - /** - * Returns whether this CudaError is fatal or not. - * - * Fatal errors leave the process in an inconsistent state and any further CUDA work will return - * the same error. To continue using CUDA, the process must be terminated and relaunched. - */ - public boolean isFatal() { - return isFatal; - } - public final CudaError cudaError; - private final boolean isFatal; - private static CudaError extractCudaError(String msg) { int startIdx = msg.indexOf('['); int endIdx = msg.indexOf(']'); diff --git a/java/src/main/java/ai/rapids/cudf/FatalCudaException.java b/java/src/main/java/ai/rapids/cudf/FatalCudaException.java new file mode 100644 index 00000000000..c3edce377fa --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/FatalCudaException.java @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ai.rapids.cudf; + +/** + * FatalCudaException is a kind of CudaException which leaves the process in an inconsistent state + * and any further CUDA work will return the same error. + * To continue using CUDA, the process must be terminated and relaunched. + */ +public class FatalCudaException extends CudaException { + FatalCudaException(String message) { + super(message); + } + + FatalCudaException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index ebaa1e90b5e..479c244843f 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -30,6 +30,7 @@ namespace jni { constexpr jint MINIMUM_JNI_VERSION = JNI_VERSION_1_6; constexpr char const *CUDA_ERROR_CLASS = "ai/rapids/cudf/CudaException"; +constexpr char const *FATAL_CUDA_ERROR_CLASS = "ai/rapids/cudf/FatalCudaException"; constexpr char const *CUDF_ERROR_CLASS = "ai/rapids/cudf/CudfException"; constexpr char const *INDEX_OOB_CLASS = "java/lang/ArrayIndexOutOfBoundsException"; constexpr char const *ILLEGAL_ARG_CLASS = "java/lang/IllegalArgumentException"; @@ -756,17 +757,14 @@ class native_jstringArray { } // Throw a new exception only if one is not pending then always return with the specified value -#define JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, fatal, ret_val) \ +#define JNI_CHECK_THROW_NEW_CUDA_ERROR(env, class_name, message, ret_val) \ do { \ if (env->ExceptionOccurred()) { \ return ret_val; \ } \ - jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS); \ const char *e_name = cudaGetErrorName(e.error_code()); \ - std::string what = std::string(fatal ? "Fatal CUDA ERROR [" : "CUDA ERROR [") + e_name + \ - "]: " + (e.what() == nullptr ? "" : e.what()); \ - env->ThrowNew(ex_class, what.c_str()); \ - return ret_val; \ + std::string full_msg = "CUDA ERROR [") + e_name + "]: " + (message == nullptr ? "" : message); \ + JNI_THROW_NEW(env, class_name, full_msg, ret_val) \ } while (0) #define JNI_CUDA_TRY(env, ret_val, call) \ @@ -812,11 +810,11 @@ class native_jstringArray { std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what()); \ JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ } \ - catch (const cudf::cuda_error &e) { \ - JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, false, ret_val); \ - } \ catch (const cudf::fatal_cuda_error &e) { \ - JNI_CHECK_THROW_NEW_CUDA_ERROR(env, e, true, ret_val); \ + JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::FATAL_CUDA_ERROR_CLASS, e.what(), ret_val); \ + } \ + catch (const cudf::cuda_error &e) { \ + JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e.what(), ret_val); \ } \ catch (const std::exception &e) { \ /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \ From f62961fad9dd56aba2593bb66a57404a88c8557e Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Thu, 14 Apr 2022 17:52:08 +0800 Subject: [PATCH 12/26] update --- java/src/main/native/include/jni_utils.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 479c244843f..25ec9525284 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -757,13 +757,14 @@ class native_jstringArray { } // Throw a new exception only if one is not pending then always return with the specified value -#define JNI_CHECK_THROW_NEW_CUDA_ERROR(env, class_name, message, ret_val) \ +#define JNI_CHECK_THROW_NEW_CUDA_ERROR(env, class_name, e, ret_val) \ do { \ if (env->ExceptionOccurred()) { \ return ret_val; \ } \ const char *e_name = cudaGetErrorName(e.error_code()); \ - std::string full_msg = "CUDA ERROR [") + e_name + "]: " + (message == nullptr ? "" : message); \ + std::string full_msg = \ + "CUDA ERROR [" + e_name + "]: " + (e.what() == nullptr ? "" : e.what()); \ JNI_THROW_NEW(env, class_name, full_msg, ret_val) \ } while (0) @@ -811,10 +812,10 @@ class native_jstringArray { JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ } \ catch (const cudf::fatal_cuda_error &e) { \ - JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::FATAL_CUDA_ERROR_CLASS, e.what(), ret_val); \ + JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::FATAL_CUDA_ERROR_CLASS, e, ret_val); \ } \ catch (const cudf::cuda_error &e) { \ - JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e.what(), ret_val); \ + JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val); \ } \ catch (const std::exception &e) { \ /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \ From fc5672dcea87e4d3ebc01ce39956f421b6e3fe31 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Thu, 14 Apr 2022 18:44:51 +0800 Subject: [PATCH 13/26] fix --- java/src/main/native/include/jni_utils.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 25ec9525284..7d38afcd2f9 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -764,8 +764,8 @@ class native_jstringArray { } \ const char *e_name = cudaGetErrorName(e.error_code()); \ std::string full_msg = \ - "CUDA ERROR [" + e_name + "]: " + (e.what() == nullptr ? "" : e.what()); \ - JNI_THROW_NEW(env, class_name, full_msg, ret_val) \ + std::string("CUDA ERROR [") + e_name + "]: " + (e.what() == nullptr ? "" : e.what()); \ + JNI_THROW_NEW(env, class_name, full_msg.c_str(), ret_val) \ } while (0) #define JNI_CUDA_TRY(env, ret_val, call) \ From 1594453263485a002739c111649af56c9a2af735 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Thu, 14 Apr 2022 19:40:48 +0800 Subject: [PATCH 14/26] fix --- java/src/test/java/ai/rapids/cudf/CudaTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/test/java/ai/rapids/cudf/CudaTest.java b/java/src/test/java/ai/rapids/cudf/CudaTest.java index f5af2b02fc5..524b7c05b64 100644 --- a/java/src/test/java/ai/rapids/cudf/CudaTest.java +++ b/java/src/test/java/ai/rapids/cudf/CudaTest.java @@ -37,9 +37,9 @@ public void testCudaException() { assertThrows(CudaException.class, () -> { try { Cuda.memset(Long.MAX_VALUE, (byte) 0, 1024); + } catch (FatalCudaException ignored) { } catch (CudaException ex) { assertEquals(CudaException.CudaError.cudaErrorInvalidValue, ex.cudaError); - assertFalse(ex.isFatal()); throw ex; } } From c11f8bbd59745a035c77bf149c2431aa9aaf2031 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Thu, 14 Apr 2022 19:54:00 +0800 Subject: [PATCH 15/26] fix --- .../{FatalCudaException.java => CudaFatalException.java} | 8 ++++---- java/src/main/native/include/jni_utils.hpp | 2 +- java/src/test/java/ai/rapids/cudf/CudaTest.java | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) rename java/src/main/java/ai/rapids/cudf/{FatalCudaException.java => CudaFatalException.java} (80%) diff --git a/java/src/main/java/ai/rapids/cudf/FatalCudaException.java b/java/src/main/java/ai/rapids/cudf/CudaFatalException.java similarity index 80% rename from java/src/main/java/ai/rapids/cudf/FatalCudaException.java rename to java/src/main/java/ai/rapids/cudf/CudaFatalException.java index c3edce377fa..0bb000d94d9 100644 --- a/java/src/main/java/ai/rapids/cudf/FatalCudaException.java +++ b/java/src/main/java/ai/rapids/cudf/CudaFatalException.java @@ -16,16 +16,16 @@ package ai.rapids.cudf; /** - * FatalCudaException is a kind of CudaException which leaves the process in an inconsistent state + * CudaFatalException is a kind of CudaException which leaves the process in an inconsistent state * and any further CUDA work will return the same error. * To continue using CUDA, the process must be terminated and relaunched. */ -public class FatalCudaException extends CudaException { - FatalCudaException(String message) { +public class CudaFatalException extends CudaException { + CudaFatalException(String message) { super(message); } - FatalCudaException(String message, Throwable cause) { + CudaFatalException(String message, Throwable cause) { super(message, cause); } } diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 7d38afcd2f9..0861898f7fb 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -30,7 +30,7 @@ namespace jni { constexpr jint MINIMUM_JNI_VERSION = JNI_VERSION_1_6; constexpr char const *CUDA_ERROR_CLASS = "ai/rapids/cudf/CudaException"; -constexpr char const *FATAL_CUDA_ERROR_CLASS = "ai/rapids/cudf/FatalCudaException"; +constexpr char const *FATAL_CUDA_ERROR_CLASS = "ai/rapids/cudf/CudaFatalException"; constexpr char const *CUDF_ERROR_CLASS = "ai/rapids/cudf/CudfException"; constexpr char const *INDEX_OOB_CLASS = "java/lang/ArrayIndexOutOfBoundsException"; constexpr char const *ILLEGAL_ARG_CLASS = "java/lang/IllegalArgumentException"; diff --git a/java/src/test/java/ai/rapids/cudf/CudaTest.java b/java/src/test/java/ai/rapids/cudf/CudaTest.java index 524b7c05b64..1a86dbb374d 100644 --- a/java/src/test/java/ai/rapids/cudf/CudaTest.java +++ b/java/src/test/java/ai/rapids/cudf/CudaTest.java @@ -37,7 +37,7 @@ public void testCudaException() { assertThrows(CudaException.class, () -> { try { Cuda.memset(Long.MAX_VALUE, (byte) 0, 1024); - } catch (FatalCudaException ignored) { + } catch (CudaFatalException ignored) { } catch (CudaException ex) { assertEquals(CudaException.CudaError.cudaErrorInvalidValue, ex.cudaError); throw ex; From 5f991a18946bccbba05c37be4ee00d3c93244366 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Fri, 15 Apr 2022 11:43:56 +0800 Subject: [PATCH 16/26] update --- .../java/ai/rapids/cudf/CudaException.java | 14 ++++--------- .../ai/rapids/cudf/CudaFatalException.java | 8 +++---- java/src/main/native/include/jni_utils.hpp | 21 +++++++++++++------ 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/CudaException.java b/java/src/main/java/ai/rapids/cudf/CudaException.java index b1c21941f07..da5847fb6ad 100755 --- a/java/src/main/java/ai/rapids/cudf/CudaException.java +++ b/java/src/main/java/ai/rapids/cudf/CudaException.java @@ -28,24 +28,18 @@ * don't switch between threads for different parts of processing that can be retried as a chunk. */ public class CudaException extends RuntimeException { - CudaException(String message) { + CudaException(String message, String cudaErrorName) { super(message); - cudaError = extractCudaError(message); + cudaError = CudaError.valueOf(cudaErrorName); } - CudaException(String message, Throwable cause) { + CudaException(String message, String cudaErrorName, Throwable cause) { super(message, cause); - cudaError = extractCudaError(message); + cudaError = CudaError.valueOf(cudaErrorName); } public final CudaError cudaError; - private static CudaError extractCudaError(String msg) { - int startIdx = msg.indexOf('['); - int endIdx = msg.indexOf(']'); - return CudaError.valueOf(msg.substring(startIdx + 1, endIdx)); - } - /** * The Java mirror of cudaError, which facilities the tracking of CUDA errors in JVM. */ diff --git a/java/src/main/java/ai/rapids/cudf/CudaFatalException.java b/java/src/main/java/ai/rapids/cudf/CudaFatalException.java index 0bb000d94d9..2bf4af7dadb 100644 --- a/java/src/main/java/ai/rapids/cudf/CudaFatalException.java +++ b/java/src/main/java/ai/rapids/cudf/CudaFatalException.java @@ -21,11 +21,11 @@ * To continue using CUDA, the process must be terminated and relaunched. */ public class CudaFatalException extends CudaException { - CudaFatalException(String message) { - super(message); + CudaFatalException(String message, String cudaErrorName) { + super(message, cudaErrorName); } - CudaFatalException(String message, Throwable cause) { - super(message, cause); + CudaFatalException(String message, String cudaErrorName, Throwable cause) { + super(message, cudaErrorName, cause); } } diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 0861898f7fb..ada0fc39cd4 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -30,7 +30,7 @@ namespace jni { constexpr jint MINIMUM_JNI_VERSION = JNI_VERSION_1_6; constexpr char const *CUDA_ERROR_CLASS = "ai/rapids/cudf/CudaException"; -constexpr char const *FATAL_CUDA_ERROR_CLASS = "ai/rapids/cudf/CudaFatalException"; +constexpr char const *CUDA_FATAL_ERROR_CLASS = "ai/rapids/cudf/CudaFatalException"; constexpr char const *CUDF_ERROR_CLASS = "ai/rapids/cudf/CudfException"; constexpr char const *INDEX_OOB_CLASS = "java/lang/ArrayIndexOutOfBoundsException"; constexpr char const *ILLEGAL_ARG_CLASS = "java/lang/IllegalArgumentException"; @@ -762,10 +762,19 @@ class native_jstringArray { if (env->ExceptionOccurred()) { \ return ret_val; \ } \ - const char *e_name = cudaGetErrorName(e.error_code()); \ - std::string full_msg = \ - std::string("CUDA ERROR [") + e_name + "]: " + (e.what() == nullptr ? "" : e.what()); \ - JNI_THROW_NEW(env, class_name, full_msg.c_str(), ret_val) \ + std::string n_msg = std::string("CUDA ERROR: ") + (e.what() == nullptr ? "" : e.what()); \ + jstring j_msg = env->NewStringUTF(n_msg.c_str()); \ + const char *n_name = cudaGetErrorName(e.error_code()); \ + jstring j_name = env->NewStringUTF(n_name); \ + jclass ex_class = env->FindClass(class_name); \ + jmethodID ctor_id = \ + env->GetMethodID(ex_class, "", "(Ljava/lang/String;Ljava/lang/String;)V"); \ + if (ctor_id == NULL) { \ + return ret_val; \ + } \ + jobject cuda_error = env->NewObject(ex_class, ctor_id, j_msg, j_name); \ + env->Throw((jthrowable)cuda_error); \ + return ret_val; \ } while (0) #define JNI_CUDA_TRY(env, ret_val, call) \ @@ -812,7 +821,7 @@ class native_jstringArray { JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ } \ catch (const cudf::fatal_cuda_error &e) { \ - JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::FATAL_CUDA_ERROR_CLASS, e, ret_val); \ + JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, ret_val); \ } \ catch (const cudf::cuda_error &e) { \ JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val); \ From 65633bcd9ee28948e8c7783cc6719981f1230c26 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Mon, 18 Apr 2022 13:49:53 +0800 Subject: [PATCH 17/26] refine --- .../java/ai/rapids/cudf/CudaException.java | 144 +++++++++++++++++- .../ai/rapids/cudf/CudaFatalException.java | 8 +- java/src/main/native/include/jni_utils.hpp | 31 ++-- java/src/main/native/src/CudaJni.cpp | 4 +- java/src/main/native/src/RmmJni.cpp | 6 +- 5 files changed, 173 insertions(+), 20 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/CudaException.java b/java/src/main/java/ai/rapids/cudf/CudaException.java index da5847fb6ad..64310933049 100755 --- a/java/src/main/java/ai/rapids/cudf/CudaException.java +++ b/java/src/main/java/ai/rapids/cudf/CudaException.java @@ -15,6 +15,10 @@ */ package ai.rapids.cudf; +import java.util.EnumMap; +import java.util.HashMap; +import java.util.Map; + /** * Exception from the cuda language/library. Be aware that because of how cuda does asynchronous * processing exceptions from cuda can be thrown by method calls that did not cause the exception @@ -28,14 +32,14 @@ * don't switch between threads for different parts of processing that can be retried as a chunk. */ public class CudaException extends RuntimeException { - CudaException(String message, String cudaErrorName) { + CudaException(String message, int errorCode) { super(message); - cudaError = CudaError.valueOf(cudaErrorName); + cudaError = CudaError.parseErrorCode(errorCode); } - CudaException(String message, String cudaErrorName, Throwable cause) { + CudaException(String message, int errorCode, Throwable cause) { super(message, cause); - cudaError = CudaError.valueOf(cudaErrorName); + cudaError = CudaError.parseErrorCode(errorCode); } public final CudaError cudaError; @@ -169,9 +173,141 @@ public enum CudaError { final int code; + private static Map codeToError = new HashMap(){{ + put(cudaErrorInvalidValue.code, cudaErrorInvalidValue); + put(cudaErrorMemoryAllocation.code, cudaErrorMemoryAllocation); + put(cudaErrorInitializationError.code, cudaErrorInitializationError); + put(cudaErrorCudartUnloading.code, cudaErrorCudartUnloading); + put(cudaErrorProfilerDisabled.code, cudaErrorProfilerDisabled); + put(cudaErrorProfilerNotInitialized.code, cudaErrorProfilerNotInitialized); + put(cudaErrorProfilerAlreadyStarted.code, cudaErrorProfilerAlreadyStarted); + put(cudaErrorProfilerAlreadyStopped.code, cudaErrorProfilerAlreadyStopped); + put(cudaErrorInvalidConfiguration.code, cudaErrorInvalidConfiguration); + put(cudaErrorInvalidPitchValue.code, cudaErrorInvalidPitchValue); + put(cudaErrorInvalidSymbol.code, cudaErrorInvalidSymbol); + put(cudaErrorInvalidHostPointer.code, cudaErrorInvalidHostPointer); + put(cudaErrorInvalidDevicePointer.code, cudaErrorInvalidDevicePointer); + put(cudaErrorInvalidTexture.code, cudaErrorInvalidTexture); + put(cudaErrorInvalidTextureBinding.code, cudaErrorInvalidTextureBinding); + put(cudaErrorInvalidChannelDescriptor.code, cudaErrorInvalidChannelDescriptor); + put(cudaErrorInvalidMemcpyDirection.code, cudaErrorInvalidMemcpyDirection); + put(cudaErrorAddressOfConstant.code, cudaErrorAddressOfConstant); + put(cudaErrorTextureFetchFailed.code, cudaErrorTextureFetchFailed); + put(cudaErrorTextureNotBound.code, cudaErrorTextureNotBound); + put(cudaErrorSynchronizationError.code, cudaErrorSynchronizationError); + put(cudaErrorInvalidFilterSetting.code, cudaErrorInvalidFilterSetting); + put(cudaErrorInvalidNormSetting.code, cudaErrorInvalidNormSetting); + put(cudaErrorMixedDeviceExecution.code, cudaErrorMixedDeviceExecution); + put(cudaErrorNotYetImplemented.code, cudaErrorNotYetImplemented); + put(cudaErrorMemoryValueTooLarge.code, cudaErrorMemoryValueTooLarge); + put(cudaErrorStubLibrary.code, cudaErrorStubLibrary); + put(cudaErrorInsufficientDriver.code, cudaErrorInsufficientDriver); + put(cudaErrorCallRequiresNewerDriver.code, cudaErrorCallRequiresNewerDriver); + put(cudaErrorInvalidSurface.code, cudaErrorInvalidSurface); + put(cudaErrorDuplicateVariableName.code, cudaErrorDuplicateVariableName); + put(cudaErrorDuplicateTextureName.code, cudaErrorDuplicateTextureName); + put(cudaErrorDuplicateSurfaceName.code, cudaErrorDuplicateSurfaceName); + put(cudaErrorDevicesUnavailable.code, cudaErrorDevicesUnavailable); + put(cudaErrorIncompatibleDriverContext.code, cudaErrorIncompatibleDriverContext); + put(cudaErrorMissingConfiguration.code, cudaErrorMissingConfiguration); + put(cudaErrorPriorLaunchFailure.code, cudaErrorPriorLaunchFailure); + put(cudaErrorLaunchMaxDepthExceeded.code, cudaErrorLaunchMaxDepthExceeded); + put(cudaErrorLaunchFileScopedTex.code, cudaErrorLaunchFileScopedTex); + put(cudaErrorLaunchFileScopedSurf.code, cudaErrorLaunchFileScopedSurf); + put(cudaErrorSyncDepthExceeded.code, cudaErrorSyncDepthExceeded); + put(cudaErrorLaunchPendingCountExceeded.code, cudaErrorLaunchPendingCountExceeded); + put(cudaErrorInvalidDeviceFunction.code, cudaErrorInvalidDeviceFunction); + put(cudaErrorNoDevice.code, cudaErrorNoDevice); + put(cudaErrorInvalidDevice.code, cudaErrorInvalidDevice); + put(cudaErrorDeviceNotLicensed.code, cudaErrorDeviceNotLicensed); + put(cudaErrorSoftwareValidityNotEstablished.code, cudaErrorSoftwareValidityNotEstablished); + put(cudaErrorStartupFailure.code, cudaErrorStartupFailure); + put(cudaErrorInvalidKernelImage.code, cudaErrorInvalidKernelImage); + put(cudaErrorDeviceUninitialized.code, cudaErrorDeviceUninitialized); + put(cudaErrorMapBufferObjectFailed.code, cudaErrorMapBufferObjectFailed); + put(cudaErrorUnmapBufferObjectFailed.code, cudaErrorUnmapBufferObjectFailed); + put(cudaErrorArrayIsMapped.code, cudaErrorArrayIsMapped); + put(cudaErrorAlreadyMapped.code, cudaErrorAlreadyMapped); + put(cudaErrorNoKernelImageForDevice.code, cudaErrorNoKernelImageForDevice); + put(cudaErrorAlreadyAcquired.code, cudaErrorAlreadyAcquired); + put(cudaErrorNotMapped.code, cudaErrorNotMapped); + put(cudaErrorNotMappedAsArray.code, cudaErrorNotMappedAsArray); + put(cudaErrorNotMappedAsPointer.code, cudaErrorNotMappedAsPointer); + put(cudaErrorECCUncorrectable.code, cudaErrorECCUncorrectable); + put(cudaErrorUnsupportedLimit.code, cudaErrorUnsupportedLimit); + put(cudaErrorDeviceAlreadyInUse.code, cudaErrorDeviceAlreadyInUse); + put(cudaErrorPeerAccessUnsupported.code, cudaErrorPeerAccessUnsupported); + put(cudaErrorInvalidPtx.code, cudaErrorInvalidPtx); + put(cudaErrorInvalidGraphicsContext.code, cudaErrorInvalidGraphicsContext); + put(cudaErrorNvlinkUncorrectable.code, cudaErrorNvlinkUncorrectable); + put(cudaErrorJitCompilerNotFound.code, cudaErrorJitCompilerNotFound); + put(cudaErrorUnsupportedPtxVersion.code, cudaErrorUnsupportedPtxVersion); + put(cudaErrorJitCompilationDisabled.code, cudaErrorJitCompilationDisabled); + put(cudaErrorUnsupportedExecAffinity.code, cudaErrorUnsupportedExecAffinity); + put(cudaErrorInvalidSource.code, cudaErrorInvalidSource); + put(cudaErrorFileNotFound.code, cudaErrorFileNotFound); + put(cudaErrorSharedObjectSymbolNotFound.code, cudaErrorSharedObjectSymbolNotFound); + put(cudaErrorSharedObjectInitFailed.code, cudaErrorSharedObjectInitFailed); + put(cudaErrorOperatingSystem.code, cudaErrorOperatingSystem); + put(cudaErrorInvalidResourceHandle.code, cudaErrorInvalidResourceHandle); + put(cudaErrorIllegalState.code, cudaErrorIllegalState); + put(cudaErrorSymbolNotFound.code, cudaErrorSymbolNotFound); + put(cudaErrorNotReady.code, cudaErrorNotReady); + put(cudaErrorIllegalAddress.code, cudaErrorIllegalAddress); + put(cudaErrorLaunchOutOfResources.code, cudaErrorLaunchOutOfResources); + put(cudaErrorLaunchTimeout.code, cudaErrorLaunchTimeout); + put(cudaErrorLaunchIncompatibleTexturing.code, cudaErrorLaunchIncompatibleTexturing); + put(cudaErrorPeerAccessAlreadyEnabled.code, cudaErrorPeerAccessAlreadyEnabled); + put(cudaErrorPeerAccessNotEnabled.code, cudaErrorPeerAccessNotEnabled); + put(cudaErrorSetOnActiveProcess.code, cudaErrorSetOnActiveProcess); + put(cudaErrorContextIsDestroyed.code, cudaErrorContextIsDestroyed); + put(cudaErrorAssert.code, cudaErrorAssert); + put(cudaErrorTooManyPeers.code, cudaErrorTooManyPeers); + put(cudaErrorHostMemoryAlreadyRegistered.code, cudaErrorHostMemoryAlreadyRegistered); + put(cudaErrorHostMemoryNotRegistered.code, cudaErrorHostMemoryNotRegistered); + put(cudaErrorHardwareStackError.code, cudaErrorHardwareStackError); + put(cudaErrorIllegalInstruction.code, cudaErrorIllegalInstruction); + put(cudaErrorMisalignedAddress.code, cudaErrorMisalignedAddress); + put(cudaErrorInvalidAddressSpace.code, cudaErrorInvalidAddressSpace); + put(cudaErrorInvalidPc.code, cudaErrorInvalidPc); + put(cudaErrorLaunchFailure.code, cudaErrorLaunchFailure); + put(cudaErrorCooperativeLaunchTooLarge.code, cudaErrorCooperativeLaunchTooLarge); + put(cudaErrorNotPermitted.code, cudaErrorNotPermitted); + put(cudaErrorNotSupported.code, cudaErrorNotSupported); + put(cudaErrorSystemNotReady.code, cudaErrorSystemNotReady); + put(cudaErrorSystemDriverMismatch.code, cudaErrorSystemDriverMismatch); + put(cudaErrorCompatNotSupportedOnDevice.code, cudaErrorCompatNotSupportedOnDevice); + put(cudaErrorMpsConnectionFailed.code, cudaErrorMpsConnectionFailed); + put(cudaErrorMpsRpcFailure.code, cudaErrorMpsRpcFailure); + put(cudaErrorMpsServerNotReady.code, cudaErrorMpsServerNotReady); + put(cudaErrorMpsMaxClientsReached.code, cudaErrorMpsMaxClientsReached); + put(cudaErrorMpsMaxConnectionsReached.code, cudaErrorMpsMaxConnectionsReached); + put(cudaErrorStreamCaptureUnsupported.code, cudaErrorStreamCaptureUnsupported); + put(cudaErrorStreamCaptureInvalidated.code, cudaErrorStreamCaptureInvalidated); + put(cudaErrorStreamCaptureMerge.code, cudaErrorStreamCaptureMerge); + put(cudaErrorStreamCaptureUnmatched.code, cudaErrorStreamCaptureUnmatched); + put(cudaErrorStreamCaptureUnjoined.code, cudaErrorStreamCaptureUnjoined); + put(cudaErrorStreamCaptureIsolation.code, cudaErrorStreamCaptureIsolation); + put(cudaErrorStreamCaptureImplicit.code, cudaErrorStreamCaptureImplicit); + put(cudaErrorCapturedEvent.code, cudaErrorCapturedEvent); + put(cudaErrorStreamCaptureWrongThread.code, cudaErrorStreamCaptureWrongThread); + put(cudaErrorTimeout.code, cudaErrorTimeout); + put(cudaErrorGraphExecUpdateFailure.code, cudaErrorGraphExecUpdateFailure); + put(cudaErrorExternalDevice.code, cudaErrorExternalDevice); + put(cudaErrorUnknown.code, cudaErrorUnknown); + put(cudaErrorApiFailureBase.code, cudaErrorApiFailureBase); + }}; + CudaError(int errorCode) { this.code = errorCode; } + public static CudaError parseErrorCode(int errorCode) { + if (!codeToError.containsKey(errorCode)) { + throw new CudfException("Unknown Cuda error code: " + errorCode); + } + return codeToError.get(errorCode); + } + } } diff --git a/java/src/main/java/ai/rapids/cudf/CudaFatalException.java b/java/src/main/java/ai/rapids/cudf/CudaFatalException.java index 2bf4af7dadb..cf36726aa80 100644 --- a/java/src/main/java/ai/rapids/cudf/CudaFatalException.java +++ b/java/src/main/java/ai/rapids/cudf/CudaFatalException.java @@ -21,11 +21,11 @@ * To continue using CUDA, the process must be terminated and relaunched. */ public class CudaFatalException extends CudaException { - CudaFatalException(String message, String cudaErrorName) { - super(message, cudaErrorName); + CudaFatalException(String message, int errorCode) { + super(message, errorCode); } - CudaFatalException(String message, String cudaErrorName, Throwable cause) { - super(message, cudaErrorName, cause); + CudaFatalException(String message, int errorCode, Throwable cause) { + super(message, errorCode, cause); } } diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index ada0fc39cd4..41bc4e5e124 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -734,6 +734,10 @@ class native_jstringArray { } }; +inline void throw_jni_exception(const char *msg) { + throw jni_exception(msg); +} + } // namespace jni } // namespace cudf @@ -757,23 +761,24 @@ class native_jstringArray { } // Throw a new exception only if one is not pending then always return with the specified value -#define JNI_CHECK_THROW_NEW_CUDA_ERROR(env, class_name, e, ret_val) \ +#define JNI_CHECK_THROW_NEW_CUDA_ERROR(env, class_name, e, is_halt, ret_val) \ do { \ if (env->ExceptionOccurred()) { \ return ret_val; \ } \ - std::string n_msg = std::string("CUDA ERROR: ") + (e.what() == nullptr ? "" : e.what()); \ + std::string n_msg = e.what() == nullptr ? "" : e.what(); \ jstring j_msg = env->NewStringUTF(n_msg.c_str()); \ - const char *n_name = cudaGetErrorName(e.error_code()); \ - jstring j_name = env->NewStringUTF(n_name); \ + jint e_code = static_cast(e.error_code()); \ jclass ex_class = env->FindClass(class_name); \ - jmethodID ctor_id = \ - env->GetMethodID(ex_class, "", "(Ljava/lang/String;Ljava/lang/String;)V"); \ + jmethodID ctor_id = env->GetMethodID(ex_class, "", "(Ljava/lang/String;I)V"); \ if (ctor_id == NULL) { \ return ret_val; \ } \ - jobject cuda_error = env->NewObject(ex_class, ctor_id, j_msg, j_name); \ + jobject cuda_error = env->NewObject(ex_class, ctor_id, j_msg, e_code); \ env->Throw((jthrowable)cuda_error); \ + if (is_halt) { \ + cudf::jni::throw_jni_exception("CUDA ERROR"); \ + } \ return ret_val; \ } while (0) @@ -814,6 +819,14 @@ class native_jstringArray { } \ } +#define CATCH_CUDA_ERROR_AND_THROW(env, ret_val) \ + catch (const cudf::fatal_cuda_error &e) { \ + JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, true, ret_val); \ + } \ + catch (const cudf::cuda_error &e) { \ + JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, true, ret_val); \ + } + #define CATCH_STD_CLASS(env, class_name, ret_val) \ catch (const rmm::out_of_memory &e) { \ auto what = \ @@ -821,10 +834,10 @@ class native_jstringArray { JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ } \ catch (const cudf::fatal_cuda_error &e) { \ - JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, ret_val); \ + JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, false, ret_val); \ } \ catch (const cudf::cuda_error &e) { \ - JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val); \ + JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, false, ret_val); \ } \ catch (const std::exception &e) { \ /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \ diff --git a/java/src/main/native/src/CudaJni.cpp b/java/src/main/native/src/CudaJni.cpp index 3af58605864..3fe6da95759 100644 --- a/java/src/main/native/src/CudaJni.cpp +++ b/java/src/main/native/src/CudaJni.cpp @@ -49,7 +49,7 @@ void auto_set_device(JNIEnv *env) { } } } - CATCH_STD(env, ); + CATCH_CUDA_ERROR_AND_THROW(env, ); } /** Fills all the bytes in the buffer 'buf' with 'value'. */ @@ -57,7 +57,7 @@ void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value) { try { JNI_CUDA_CHECK(env, cudaMemsetAsync((void *)buf.data(), value, buf.size())); } - CATCH_STD(env, ); + CATCH_CUDA_ERROR_AND_THROW(env, ); } } // namespace jni diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 41f1c23ac11..f422ebc07c3 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -325,12 +325,16 @@ extern "C" { JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, jclass clazz, jint allocation_mode, jint log_to, jstring jpath, jlong pool_size) { + int device_id; + try { // make sure the CUDA device is setup in the context JNI_CUDA_CHECK(env, cudaFree(0)); - int device_id; JNI_CUDA_CHECK(env, cudaGetDevice(&device_id)); + } + CATCH_CUDA_ERROR_AND_THROW(env, ); + try { bool use_pool_alloc = allocation_mode & 1; bool use_managed_mem = allocation_mode & 2; bool use_arena_alloc = allocation_mode & 4; From 5bc9e6e6005334c7e3035a3d4d9b76f520826ea0 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Mon, 18 Apr 2022 14:37:48 +0800 Subject: [PATCH 18/26] update --- .../java/ai/rapids/cudf/CudaException.java | 4 +- java/src/main/native/include/jni_utils.hpp | 4 +- java/src/main/native/src/CudaJni.cpp | 70 +++++++++---------- java/src/main/native/src/RmmJni.cpp | 4 +- 4 files changed, 40 insertions(+), 42 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/CudaException.java b/java/src/main/java/ai/rapids/cudf/CudaException.java index 64310933049..ff7ca308f3c 100755 --- a/java/src/main/java/ai/rapids/cudf/CudaException.java +++ b/java/src/main/java/ai/rapids/cudf/CudaException.java @@ -15,7 +15,6 @@ */ package ai.rapids.cudf; -import java.util.EnumMap; import java.util.HashMap; import java.util.Map; @@ -48,6 +47,7 @@ public class CudaException extends RuntimeException { * The Java mirror of cudaError, which facilities the tracking of CUDA errors in JVM. */ public enum CudaError { + UnknownNativeError(-1), // native CUDA error type which Java doesn't have a representation cudaErrorInvalidValue(1), cudaErrorMemoryAllocation(2), cudaErrorInitializationError(3), @@ -304,7 +304,7 @@ public enum CudaError { public static CudaError parseErrorCode(int errorCode) { if (!codeToError.containsKey(errorCode)) { - throw new CudfException("Unknown Cuda error code: " + errorCode); + return UnknownNativeError; } return codeToError.get(errorCode); } diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 41bc4e5e124..e6322993cac 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -782,7 +782,7 @@ inline void throw_jni_exception(const char *msg) { return ret_val; \ } while (0) -#define JNI_CUDA_TRY(env, ret_val, call) \ +#define JNI_CUDA_TRY(ret_val, call) \ do { \ cudaError_t internal_cuda_status = (call); \ if (cudaSuccess != internal_cuda_status) { \ @@ -791,7 +791,7 @@ inline void throw_jni_exception(const char *msg) { } \ } while (0) -#define JNI_CUDA_CHECK(env, cuda_status) \ +#define JNI_CUDA_CHECK(cuda_status) \ do { \ if (cudaSuccess != cuda_status) { \ cudf::detail::throw_cuda_error(cuda_status, __FILE__, __LINE__); \ diff --git a/java/src/main/native/src/CudaJni.cpp b/java/src/main/native/src/CudaJni.cpp index 3fe6da95759..f17f0beed22 100644 --- a/java/src/main/native/src/CudaJni.cpp +++ b/java/src/main/native/src/CudaJni.cpp @@ -44,7 +44,7 @@ void auto_set_device(JNIEnv *env) { try { if (Cudf_device != cudaInvalidDeviceId) { if (Thread_device != Cudf_device) { - JNI_CUDA_CHECK(env, cudaSetDevice(Cudf_device)); + JNI_CUDA_CHECK(cudaSetDevice(Cudf_device)); Thread_device = Cudf_device; } } @@ -55,7 +55,7 @@ void auto_set_device(JNIEnv *env) { /** Fills all the bytes in the buffer 'buf' with 'value'. */ void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value) { try { - JNI_CUDA_CHECK(env, cudaMemsetAsync((void *)buf.data(), value, buf.size())); + JNI_CUDA_CHECK(cudaMemsetAsync((void *)buf.data(), value, buf.size())); } CATCH_CUDA_ERROR_AND_THROW(env, ); } @@ -70,7 +70,7 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Cuda_memGetInfo(JNIEnv *env, jclas cudf::jni::auto_set_device(env); size_t free, total; - JNI_CUDA_TRY(env, NULL, cudaMemGetInfo(&free, &total)); + JNI_CUDA_TRY(NULL, cudaMemGetInfo(&free, &total)); jclass info_class = env->FindClass("Lai/rapids/cudf/CudaMemInfo;"); if (info_class == NULL) { @@ -94,7 +94,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_hostAllocPinned(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); void *ret = nullptr; - JNI_CUDA_TRY(env, 0, cudaMallocHost(&ret, size)); + JNI_CUDA_TRY(0, cudaMallocHost(&ret, size)); return reinterpret_cast(ret); } CATCH_STD(env, 0); @@ -103,7 +103,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_hostAllocPinned(JNIEnv *env, jc JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freePinned(JNIEnv *env, jclass, jlong ptr) { try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(env, , cudaFreeHost(reinterpret_cast(ptr))); + JNI_CUDA_TRY(, cudaFreeHost(reinterpret_cast(ptr))); } CATCH_STD(env, ); } @@ -113,8 +113,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memset(JNIEnv *env, jclass, jlon JNI_NULL_CHECK(env, dst, "dst memory pointer is null", ); try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(env, , cudaMemsetAsync((void *)dst, value, count)); - JNI_CUDA_TRY(env, , cudaStreamSynchronize(0)); + JNI_CUDA_TRY(, cudaMemsetAsync((void *)dst, value, count)); + JNI_CUDA_TRY(, cudaStreamSynchronize(0)); } CATCH_STD(env, ); } @@ -124,7 +124,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemset(JNIEnv *env, jclass, JNI_NULL_CHECK(env, dst, "dst memory pointer is null", ); try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(env, , cudaMemsetAsync((void *)dst, value, count)); + JNI_CUDA_TRY(, cudaMemsetAsync((void *)dst, value, count)); } CATCH_STD(env, ); } @@ -133,7 +133,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDevice(JNIEnv *env, jclass) { try { cudf::jni::auto_set_device(env); jint dev; - JNI_CUDA_TRY(env, -2, cudaGetDevice(&dev)); + JNI_CUDA_TRY(-2, cudaGetDevice(&dev)); return dev; } CATCH_STD(env, -2); @@ -143,7 +143,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDeviceCount(JNIEnv *env, jcla try { cudf::jni::auto_set_device(env); jint count; - JNI_CUDA_TRY(env, -2, cudaGetDeviceCount(&count)); + JNI_CUDA_TRY(-2, cudaGetDeviceCount(&count)); return count; } CATCH_STD(env, -2); @@ -155,7 +155,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_setDevice(JNIEnv *env, jclass, j cudf::jni::throw_java_exception(env, cudf::jni::CUDF_ERROR_CLASS, "Cannot change device after RMM init"); } - JNI_CUDA_TRY(env, , cudaSetDevice(dev)); + JNI_CUDA_TRY(, cudaSetDevice(dev)); } CATCH_STD(env, ); } @@ -171,7 +171,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDriverVersion(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); jint driver_version; - JNI_CUDA_TRY(env, -2, cudaDriverGetVersion(&driver_version)); + JNI_CUDA_TRY(-2, cudaDriverGetVersion(&driver_version)); return driver_version; } CATCH_STD(env, -2); @@ -181,7 +181,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getRuntimeVersion(JNIEnv *env, j try { cudf::jni::auto_set_device(env); jint runtime_version; - JNI_CUDA_TRY(env, -2, cudaRuntimeGetVersion(&runtime_version)); + JNI_CUDA_TRY(-2, cudaRuntimeGetVersion(&runtime_version)); return runtime_version; } CATCH_STD(env, -2); @@ -191,9 +191,9 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getNativeComputeMode(JNIEnv *env try { cudf::jni::auto_set_device(env); int device; - JNI_CUDA_TRY(env, -2, cudaGetDevice(&device)); + JNI_CUDA_TRY(-2, cudaGetDevice(&device)); cudaDeviceProp device_prop; - JNI_CUDA_TRY(env, -2, cudaGetDeviceProperties(&device_prop, device)); + JNI_CUDA_TRY(-2, cudaGetDeviceProperties(&device_prop, device)); return device_prop.computeMode; } CATCH_STD(env, -2); @@ -203,11 +203,10 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMajor(JNIEnv try { cudf::jni::auto_set_device(env); int device; - JNI_CUDA_TRY(env, -2, ::cudaGetDevice(&device)); + JNI_CUDA_TRY(-2, ::cudaGetDevice(&device)); int attribute_value; - JNI_CUDA_TRY( - env, -2, - ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMajor, device)); + JNI_CUDA_TRY(-2, ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMajor, + device)); return attribute_value; } CATCH_STD(env, -2); @@ -217,11 +216,10 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMinor(JNIEnv try { cudf::jni::auto_set_device(env); int device; - JNI_CUDA_TRY(env, -2, ::cudaGetDevice(&device)); + JNI_CUDA_TRY(-2, ::cudaGetDevice(&device)); int attribute_value; - JNI_CUDA_TRY( - env, -2, - ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMinor, device)); + JNI_CUDA_TRY(-2, ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMinor, + device)); return attribute_value; } CATCH_STD(env, -2); @@ -230,7 +228,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMinor(JNIEnv JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freeZero(JNIEnv *env, jclass) { try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(env, , cudaFree(0)); + JNI_CUDA_TRY(, cudaFree(0)); } CATCH_STD(env, ); } @@ -241,7 +239,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createStream(JNIEnv *env, jclas cudf::jni::auto_set_device(env); cudaStream_t stream = nullptr; auto flags = isNonBlocking ? cudaStreamNonBlocking : cudaStreamDefault; - JNI_CUDA_TRY(env, 0, cudaStreamCreateWithFlags(&stream, flags)); + JNI_CUDA_TRY(0, cudaStreamCreateWithFlags(&stream, flags)); return reinterpret_cast(stream); } CATCH_STD(env, 0); @@ -251,7 +249,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyStream(JNIEnv *env, jclas try { cudf::jni::auto_set_device(env); auto stream = reinterpret_cast(jstream); - JNI_CUDA_TRY(env, , cudaStreamDestroy(stream)); + JNI_CUDA_TRY(, cudaStreamDestroy(stream)); } CATCH_STD(env, ); } @@ -262,7 +260,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamWaitEvent(JNIEnv *env, jcl cudf::jni::auto_set_device(env); auto stream = reinterpret_cast(jstream); auto event = reinterpret_cast(jevent); - JNI_CUDA_TRY(env, , cudaStreamWaitEvent(stream, event, 0)); + JNI_CUDA_TRY(, cudaStreamWaitEvent(stream, event, 0)); } CATCH_STD(env, ); } @@ -272,7 +270,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamSynchronize(JNIEnv *env, j try { cudf::jni::auto_set_device(env); auto stream = reinterpret_cast(jstream); - JNI_CUDA_TRY(env, , cudaStreamSynchronize(stream)); + JNI_CUDA_TRY(, cudaStreamSynchronize(stream)); } CATCH_STD(env, ); } @@ -290,7 +288,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createEvent(JNIEnv *env, jclass if (blockingSync) { flags = flags | cudaEventBlockingSync; } - JNI_CUDA_TRY(env, 0, cudaEventCreateWithFlags(&event, flags)); + JNI_CUDA_TRY(0, cudaEventCreateWithFlags(&event, flags)); return reinterpret_cast(event); } CATCH_STD(env, 0); @@ -300,7 +298,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyEvent(JNIEnv *env, jclass try { cudf::jni::auto_set_device(env); auto event = reinterpret_cast(jevent); - JNI_CUDA_TRY(env, , cudaEventDestroy(event)); + JNI_CUDA_TRY(, cudaEventDestroy(event)); } CATCH_STD(env, ); } @@ -315,7 +313,7 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_eventQuery(JNIEnv *env, jcla } else if (result == cudaErrorNotReady) { return false; } // else - JNI_CUDA_TRY(env, false, result); + JNI_CUDA_TRY(false, result); } CATCH_STD(env, false); return false; @@ -327,7 +325,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventRecord(JNIEnv *env, jclass, cudf::jni::auto_set_device(env); auto event = reinterpret_cast(jevent); auto stream = reinterpret_cast(jstream); - JNI_CUDA_TRY(env, , cudaEventRecord(event, stream)); + JNI_CUDA_TRY(, cudaEventRecord(event, stream)); } CATCH_STD(env, ); } @@ -337,7 +335,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventSynchronize(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); auto event = reinterpret_cast(jevent); - JNI_CUDA_TRY(env, , cudaEventSynchronize(event)); + JNI_CUDA_TRY(, cudaEventSynchronize(event)); } CATCH_STD(env, ); } @@ -356,8 +354,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memcpyOnStream(JNIEnv *env, jcla auto src = reinterpret_cast(jsrc); auto kind = static_cast(jkind); auto stream = reinterpret_cast(jstream); - JNI_CUDA_TRY(env, , cudaMemcpyAsync(dst, src, count, kind, stream)); - JNI_CUDA_TRY(env, , cudaStreamSynchronize(stream)); + JNI_CUDA_TRY(, cudaMemcpyAsync(dst, src, count, kind, stream)); + JNI_CUDA_TRY(, cudaStreamSynchronize(stream)); } CATCH_STD(env, ); } @@ -376,7 +374,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemcpyOnStream(JNIEnv *env, auto src = reinterpret_cast(jsrc); auto kind = static_cast(jkind); auto stream = reinterpret_cast(jstream); - JNI_CUDA_TRY(env, , cudaMemcpyAsync(dst, src, count, kind, stream)); + JNI_CUDA_TRY(, cudaMemcpyAsync(dst, src, count, kind, stream)); } CATCH_STD(env, ); } diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index f422ebc07c3..58123789af2 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -329,8 +329,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j try { // make sure the CUDA device is setup in the context - JNI_CUDA_CHECK(env, cudaFree(0)); - JNI_CUDA_CHECK(env, cudaGetDevice(&device_id)); + JNI_CUDA_CHECK(cudaFree(0)); + JNI_CUDA_CHECK(cudaGetDevice(&device_id)); } CATCH_CUDA_ERROR_AND_THROW(env, ); From 0b548290728bf566746ae75cd2b382b45c97aabe Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Tue, 19 Apr 2022 11:11:18 +0800 Subject: [PATCH 19/26] update --- java/src/main/native/include/jni_utils.hpp | 34 ++++------- java/src/main/native/src/CudaJni.cpp | 69 +++++++++++----------- java/src/main/native/src/RmmJni.cpp | 5 +- 3 files changed, 51 insertions(+), 57 deletions(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index e6322993cac..003a58bb6fe 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -761,8 +761,8 @@ inline void throw_jni_exception(const char *msg) { } // Throw a new exception only if one is not pending then always return with the specified value -#define JNI_CHECK_THROW_NEW_CUDA_ERROR(env, class_name, e, is_halt, ret_val) \ - do { \ +#define JNI_CHECK_CUDA_ERROR(env, class_name, e, ret_val) \ + { \ if (env->ExceptionOccurred()) { \ return ret_val; \ } \ @@ -776,26 +776,18 @@ inline void throw_jni_exception(const char *msg) { } \ jobject cuda_error = env->NewObject(ex_class, ctor_id, j_msg, e_code); \ env->Throw((jthrowable)cuda_error); \ - if (is_halt) { \ - cudf::jni::throw_jni_exception("CUDA ERROR"); \ - } \ - return ret_val; \ - } while (0) + } -#define JNI_CUDA_TRY(ret_val, call) \ +#define JNI_CHECK_CUDA_ERROR_RETURN(env, class_name, e, ret_val) \ do { \ - cudaError_t internal_cuda_status = (call); \ - if (cudaSuccess != internal_cuda_status) { \ - cudf::detail::throw_cuda_error(internal_cuda_status, __FILE__, __LINE__); \ - return ret_val; \ - } \ + JNI_CHECK_CUDA_ERROR(env, class_name, e, ret_val); \ + return ret_val; \ } while (0) -#define JNI_CUDA_CHECK(cuda_status) \ +#define JNI_CHECK_CUDA_ERROR_THROW(env, class_name, e, ret_val) \ do { \ - if (cudaSuccess != cuda_status) { \ - cudf::detail::throw_cuda_error(cuda_status, __FILE__, __LINE__); \ - } \ + JNI_CHECK_CUDA_ERROR(env, class_name, e, ret_val); \ + cudf::jni::throw_jni_exception("CUDA ERROR"); \ } while (0) #define JNI_NULL_CHECK(env, obj, error_msg, ret_val) \ @@ -821,10 +813,10 @@ inline void throw_jni_exception(const char *msg) { #define CATCH_CUDA_ERROR_AND_THROW(env, ret_val) \ catch (const cudf::fatal_cuda_error &e) { \ - JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, true, ret_val); \ + JNI_CHECK_CUDA_ERROR_THROW(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, ret_val); \ } \ catch (const cudf::cuda_error &e) { \ - JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, true, ret_val); \ + JNI_CHECK_CUDA_ERROR_THROW(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val); \ } #define CATCH_STD_CLASS(env, class_name, ret_val) \ @@ -834,10 +826,10 @@ inline void throw_jni_exception(const char *msg) { JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ } \ catch (const cudf::fatal_cuda_error &e) { \ - JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, false, ret_val); \ + JNI_CHECK_CUDA_ERROR_RETURN(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, ret_val); \ } \ catch (const cudf::cuda_error &e) { \ - JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, false, ret_val); \ + JNI_CHECK_CUDA_ERROR_RETURN(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val); \ } \ catch (const std::exception &e) { \ /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \ diff --git a/java/src/main/native/src/CudaJni.cpp b/java/src/main/native/src/CudaJni.cpp index f17f0beed22..9227c036f3c 100644 --- a/java/src/main/native/src/CudaJni.cpp +++ b/java/src/main/native/src/CudaJni.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include "jni_utils.hpp" @@ -44,7 +45,7 @@ void auto_set_device(JNIEnv *env) { try { if (Cudf_device != cudaInvalidDeviceId) { if (Thread_device != Cudf_device) { - JNI_CUDA_CHECK(cudaSetDevice(Cudf_device)); + CUDF_CUDA_TRY(cudaSetDevice(Cudf_device)); Thread_device = Cudf_device; } } @@ -55,7 +56,7 @@ void auto_set_device(JNIEnv *env) { /** Fills all the bytes in the buffer 'buf' with 'value'. */ void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value) { try { - JNI_CUDA_CHECK(cudaMemsetAsync((void *)buf.data(), value, buf.size())); + CUDF_CUDA_TRY(cudaMemsetAsync((void *)buf.data(), value, buf.size())); } CATCH_CUDA_ERROR_AND_THROW(env, ); } @@ -70,7 +71,7 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Cuda_memGetInfo(JNIEnv *env, jclas cudf::jni::auto_set_device(env); size_t free, total; - JNI_CUDA_TRY(NULL, cudaMemGetInfo(&free, &total)); + CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total)); jclass info_class = env->FindClass("Lai/rapids/cudf/CudaMemInfo;"); if (info_class == NULL) { @@ -94,7 +95,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_hostAllocPinned(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); void *ret = nullptr; - JNI_CUDA_TRY(0, cudaMallocHost(&ret, size)); + CUDF_CUDA_TRY(cudaMallocHost(&ret, size)); return reinterpret_cast(ret); } CATCH_STD(env, 0); @@ -103,7 +104,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_hostAllocPinned(JNIEnv *env, jc JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freePinned(JNIEnv *env, jclass, jlong ptr) { try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(, cudaFreeHost(reinterpret_cast(ptr))); + CUDF_CUDA_TRY(cudaFreeHost(reinterpret_cast(ptr))); } CATCH_STD(env, ); } @@ -113,8 +114,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memset(JNIEnv *env, jclass, jlon JNI_NULL_CHECK(env, dst, "dst memory pointer is null", ); try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(, cudaMemsetAsync((void *)dst, value, count)); - JNI_CUDA_TRY(, cudaStreamSynchronize(0)); + CUDF_CUDA_TRY(cudaMemsetAsync((void *)dst, value, count)); + CUDF_CUDA_TRY(cudaStreamSynchronize(0)); } CATCH_STD(env, ); } @@ -124,7 +125,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemset(JNIEnv *env, jclass, JNI_NULL_CHECK(env, dst, "dst memory pointer is null", ); try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(, cudaMemsetAsync((void *)dst, value, count)); + CUDF_CUDA_TRY(cudaMemsetAsync((void *)dst, value, count)); } CATCH_STD(env, ); } @@ -133,7 +134,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDevice(JNIEnv *env, jclass) { try { cudf::jni::auto_set_device(env); jint dev; - JNI_CUDA_TRY(-2, cudaGetDevice(&dev)); + CUDF_CUDA_TRY(cudaGetDevice(&dev)); return dev; } CATCH_STD(env, -2); @@ -143,7 +144,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDeviceCount(JNIEnv *env, jcla try { cudf::jni::auto_set_device(env); jint count; - JNI_CUDA_TRY(-2, cudaGetDeviceCount(&count)); + CUDF_CUDA_TRY(cudaGetDeviceCount(&count)); return count; } CATCH_STD(env, -2); @@ -155,7 +156,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_setDevice(JNIEnv *env, jclass, j cudf::jni::throw_java_exception(env, cudf::jni::CUDF_ERROR_CLASS, "Cannot change device after RMM init"); } - JNI_CUDA_TRY(, cudaSetDevice(dev)); + CUDF_CUDA_TRY(cudaSetDevice(dev)); } CATCH_STD(env, ); } @@ -171,7 +172,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDriverVersion(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); jint driver_version; - JNI_CUDA_TRY(-2, cudaDriverGetVersion(&driver_version)); + CUDF_CUDA_TRY(cudaDriverGetVersion(&driver_version)); return driver_version; } CATCH_STD(env, -2); @@ -181,7 +182,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getRuntimeVersion(JNIEnv *env, j try { cudf::jni::auto_set_device(env); jint runtime_version; - JNI_CUDA_TRY(-2, cudaRuntimeGetVersion(&runtime_version)); + CUDF_CUDA_TRY(cudaRuntimeGetVersion(&runtime_version)); return runtime_version; } CATCH_STD(env, -2); @@ -191,9 +192,9 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getNativeComputeMode(JNIEnv *env try { cudf::jni::auto_set_device(env); int device; - JNI_CUDA_TRY(-2, cudaGetDevice(&device)); + CUDF_CUDA_TRY(cudaGetDevice(&device)); cudaDeviceProp device_prop; - JNI_CUDA_TRY(-2, cudaGetDeviceProperties(&device_prop, device)); + CUDF_CUDA_TRY(cudaGetDeviceProperties(&device_prop, device)); return device_prop.computeMode; } CATCH_STD(env, -2); @@ -203,10 +204,10 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMajor(JNIEnv try { cudf::jni::auto_set_device(env); int device; - JNI_CUDA_TRY(-2, ::cudaGetDevice(&device)); + CUDF_CUDA_TRY(::cudaGetDevice(&device)); int attribute_value; - JNI_CUDA_TRY(-2, ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMajor, - device)); + CUDF_CUDA_TRY( + ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMajor, device)); return attribute_value; } CATCH_STD(env, -2); @@ -216,10 +217,10 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMinor(JNIEnv try { cudf::jni::auto_set_device(env); int device; - JNI_CUDA_TRY(-2, ::cudaGetDevice(&device)); + CUDF_CUDA_TRY(::cudaGetDevice(&device)); int attribute_value; - JNI_CUDA_TRY(-2, ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMinor, - device)); + CUDF_CUDA_TRY( + ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMinor, device)); return attribute_value; } CATCH_STD(env, -2); @@ -228,7 +229,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMinor(JNIEnv JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freeZero(JNIEnv *env, jclass) { try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(, cudaFree(0)); + CUDF_CUDA_TRY(cudaFree(0)); } CATCH_STD(env, ); } @@ -239,7 +240,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createStream(JNIEnv *env, jclas cudf::jni::auto_set_device(env); cudaStream_t stream = nullptr; auto flags = isNonBlocking ? cudaStreamNonBlocking : cudaStreamDefault; - JNI_CUDA_TRY(0, cudaStreamCreateWithFlags(&stream, flags)); + CUDF_CUDA_TRY(cudaStreamCreateWithFlags(&stream, flags)); return reinterpret_cast(stream); } CATCH_STD(env, 0); @@ -249,7 +250,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyStream(JNIEnv *env, jclas try { cudf::jni::auto_set_device(env); auto stream = reinterpret_cast(jstream); - JNI_CUDA_TRY(, cudaStreamDestroy(stream)); + CUDF_CUDA_TRY(cudaStreamDestroy(stream)); } CATCH_STD(env, ); } @@ -260,7 +261,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamWaitEvent(JNIEnv *env, jcl cudf::jni::auto_set_device(env); auto stream = reinterpret_cast(jstream); auto event = reinterpret_cast(jevent); - JNI_CUDA_TRY(, cudaStreamWaitEvent(stream, event, 0)); + CUDF_CUDA_TRY(cudaStreamWaitEvent(stream, event, 0)); } CATCH_STD(env, ); } @@ -270,7 +271,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamSynchronize(JNIEnv *env, j try { cudf::jni::auto_set_device(env); auto stream = reinterpret_cast(jstream); - JNI_CUDA_TRY(, cudaStreamSynchronize(stream)); + CUDF_CUDA_TRY(cudaStreamSynchronize(stream)); } CATCH_STD(env, ); } @@ -288,7 +289,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createEvent(JNIEnv *env, jclass if (blockingSync) { flags = flags | cudaEventBlockingSync; } - JNI_CUDA_TRY(0, cudaEventCreateWithFlags(&event, flags)); + CUDF_CUDA_TRY(cudaEventCreateWithFlags(&event, flags)); return reinterpret_cast(event); } CATCH_STD(env, 0); @@ -298,7 +299,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyEvent(JNIEnv *env, jclass try { cudf::jni::auto_set_device(env); auto event = reinterpret_cast(jevent); - JNI_CUDA_TRY(, cudaEventDestroy(event)); + CUDF_CUDA_TRY(cudaEventDestroy(event)); } CATCH_STD(env, ); } @@ -313,7 +314,7 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_eventQuery(JNIEnv *env, jcla } else if (result == cudaErrorNotReady) { return false; } // else - JNI_CUDA_TRY(false, result); + CUDF_CUDA_TRY(result); } CATCH_STD(env, false); return false; @@ -325,7 +326,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventRecord(JNIEnv *env, jclass, cudf::jni::auto_set_device(env); auto event = reinterpret_cast(jevent); auto stream = reinterpret_cast(jstream); - JNI_CUDA_TRY(, cudaEventRecord(event, stream)); + CUDF_CUDA_TRY(cudaEventRecord(event, stream)); } CATCH_STD(env, ); } @@ -335,7 +336,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventSynchronize(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); auto event = reinterpret_cast(jevent); - JNI_CUDA_TRY(, cudaEventSynchronize(event)); + CUDF_CUDA_TRY(cudaEventSynchronize(event)); } CATCH_STD(env, ); } @@ -354,8 +355,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memcpyOnStream(JNIEnv *env, jcla auto src = reinterpret_cast(jsrc); auto kind = static_cast(jkind); auto stream = reinterpret_cast(jstream); - JNI_CUDA_TRY(, cudaMemcpyAsync(dst, src, count, kind, stream)); - JNI_CUDA_TRY(, cudaStreamSynchronize(stream)); + CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream)); + CUDF_CUDA_TRY(cudaStreamSynchronize(stream)); } CATCH_STD(env, ); } @@ -374,7 +375,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemcpyOnStream(JNIEnv *env, auto src = reinterpret_cast(jsrc); auto kind = static_cast(jkind); auto stream = reinterpret_cast(jstream); - JNI_CUDA_TRY(, cudaMemcpyAsync(dst, src, count, kind, stream)); + CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream)); } CATCH_STD(env, ); } diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 58123789af2..a5f3ff931b3 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -329,8 +330,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j try { // make sure the CUDA device is setup in the context - JNI_CUDA_CHECK(cudaFree(0)); - JNI_CUDA_CHECK(cudaGetDevice(&device_id)); + CUDF_CUDA_TRY(cudaFree(0)); + CUDF_CUDA_TRY(cudaGetDevice(&device_id)); } CATCH_CUDA_ERROR_AND_THROW(env, ); From 7c8a34ad8bf2973e4128cee5c93b0ad4909eff35 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Wed, 20 Apr 2022 15:46:14 +0800 Subject: [PATCH 20/26] update --- java/src/main/native/include/jni_utils.hpp | 70 ++++++++++++++-------- java/src/main/native/src/CudaJni.cpp | 18 +++--- java/src/main/native/src/RmmJni.cpp | 12 ++-- 3 files changed, 58 insertions(+), 42 deletions(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 003a58bb6fe..58c176d657b 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -734,8 +734,49 @@ class native_jstringArray { } }; -inline void throw_jni_exception(const char *msg) { - throw jni_exception(msg); +/** + * @brief create a cuda exception from a given cudaError_t + */ +inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowable cause = NULL) { + const char *ex_class_name; + + cudaGetLastError(); + auto const last = cudaGetLastError(); + if (status == last && last == cudaDeviceSynchronize()) { + ex_class_name = cudf::jni::CUDA_FATAL_ERROR_CLASS; + } else { + ex_class_name = cudf::jni::CUDA_ERROR_CLASS; + } + + jclass ex_class = env->FindClass(ex_class_name); + if (ex_class == NULL) { + return NULL; + } + jmethodID ctor_id = + env->GetMethodID(ex_class, "", "(Ljava/lang/String;ILjava/lang/Throwable;)V"); + if (ctor_id == NULL) { + return NULL; + } + + jstring msg = env->NewStringUTF(cudaGetErrorString(status)); + if (msg == NULL) { + return NULL; + } + + jint err_code = static_cast(status); + + jobject ret = env->NewObject(ex_class, ctor_id, msg, err_code, cause); + return (jthrowable)ret; +} + +inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { + if (cudaSuccess != cuda_status) { + jthrowable jt = cuda_exception(env, cuda_status); + if (jt != NULL) { + env->Throw(jt); + throw jni_exception("CUDA ERROR"); + } + } } } // namespace jni @@ -776,19 +817,8 @@ inline void throw_jni_exception(const char *msg) { } \ jobject cuda_error = env->NewObject(ex_class, ctor_id, j_msg, e_code); \ env->Throw((jthrowable)cuda_error); \ - } - -#define JNI_CHECK_CUDA_ERROR_RETURN(env, class_name, e, ret_val) \ - do { \ - JNI_CHECK_CUDA_ERROR(env, class_name, e, ret_val); \ return ret_val; \ - } while (0) - -#define JNI_CHECK_CUDA_ERROR_THROW(env, class_name, e, ret_val) \ - do { \ - JNI_CHECK_CUDA_ERROR(env, class_name, e, ret_val); \ - cudf::jni::throw_jni_exception("CUDA ERROR"); \ - } while (0) + } #define JNI_NULL_CHECK(env, obj, error_msg, ret_val) \ { \ @@ -811,14 +841,6 @@ inline void throw_jni_exception(const char *msg) { } \ } -#define CATCH_CUDA_ERROR_AND_THROW(env, ret_val) \ - catch (const cudf::fatal_cuda_error &e) { \ - JNI_CHECK_CUDA_ERROR_THROW(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, ret_val); \ - } \ - catch (const cudf::cuda_error &e) { \ - JNI_CHECK_CUDA_ERROR_THROW(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val); \ - } - #define CATCH_STD_CLASS(env, class_name, ret_val) \ catch (const rmm::out_of_memory &e) { \ auto what = \ @@ -826,10 +848,10 @@ inline void throw_jni_exception(const char *msg) { JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ } \ catch (const cudf::fatal_cuda_error &e) { \ - JNI_CHECK_CUDA_ERROR_RETURN(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, ret_val); \ + JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, ret_val); \ } \ catch (const cudf::cuda_error &e) { \ - JNI_CHECK_CUDA_ERROR_RETURN(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val); \ + JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val); \ } \ catch (const std::exception &e) { \ /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \ diff --git a/java/src/main/native/src/CudaJni.cpp b/java/src/main/native/src/CudaJni.cpp index 9227c036f3c..926521c55f9 100644 --- a/java/src/main/native/src/CudaJni.cpp +++ b/java/src/main/native/src/CudaJni.cpp @@ -42,23 +42,19 @@ void set_cudf_device(int device) { * is using the same device. */ void auto_set_device(JNIEnv *env) { - try { - if (Cudf_device != cudaInvalidDeviceId) { - if (Thread_device != Cudf_device) { - CUDF_CUDA_TRY(cudaSetDevice(Cudf_device)); - Thread_device = Cudf_device; - } + if (Cudf_device != cudaInvalidDeviceId) { + if (Thread_device != Cudf_device) { + cudaError_t cuda_status = cudaSetDevice(Cudf_device); + jni_cuda_check(env, cuda_status); + Thread_device = Cudf_device; } } - CATCH_CUDA_ERROR_AND_THROW(env, ); } /** Fills all the bytes in the buffer 'buf' with 'value'. */ void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value) { - try { - CUDF_CUDA_TRY(cudaMemsetAsync((void *)buf.data(), value, buf.size())); - } - CATCH_CUDA_ERROR_AND_THROW(env, ); + cudaError_t cuda_status = cudaMemsetAsync((void *)buf.data(), value, buf.size()); + jni_cuda_check(env, cuda_status); } } // namespace jni diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index a5f3ff931b3..5dfd39b60de 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -326,16 +326,14 @@ extern "C" { JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, jclass clazz, jint allocation_mode, jint log_to, jstring jpath, jlong pool_size) { - int device_id; - try { // make sure the CUDA device is setup in the context - CUDF_CUDA_TRY(cudaFree(0)); - CUDF_CUDA_TRY(cudaGetDevice(&device_id)); - } - CATCH_CUDA_ERROR_AND_THROW(env, ); + cudaError_t cuda_status = cudaFree(0); + cudf::jni::jni_cuda_check(env, cuda_status); + int device_id; + cuda_status = cudaGetDevice(&device_id); + cudf::jni::jni_cuda_check(env, cuda_status); - try { bool use_pool_alloc = allocation_mode & 1; bool use_managed_mem = allocation_mode & 2; bool use_arena_alloc = allocation_mode & 4; From 5cb197705ced7648ee56056d4bc4aec0ef34d231 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Wed, 20 Apr 2022 15:52:07 +0800 Subject: [PATCH 21/26] update --- java/src/main/native/include/jni_utils.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 58c176d657b..50d1f9064be 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -740,8 +740,12 @@ class native_jstringArray { inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowable cause = NULL) { const char *ex_class_name; + // Calls cudaGetLastError twice. It is nearly certain that a fatal error occurred if the second + // call doesn't return with cudaSuccess. cudaGetLastError(); auto const last = cudaGetLastError(); + // Call cudaDeviceSynchronize to ensure `last` did not result from an asynchronous error. + // between two calls. if (status == last && last == cudaDeviceSynchronize()) { ex_class_name = cudf::jni::CUDA_FATAL_ERROR_CLASS; } else { From 6ba91901d3639017238cd6f269f82989097aec68 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Thu, 21 Apr 2022 10:52:04 +0800 Subject: [PATCH 22/26] update --- java/src/main/native/include/jni_utils.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 50d1f9064be..e1333aa2319 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -779,6 +779,9 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { if (jt != NULL) { env->Throw(jt); throw jni_exception("CUDA ERROR"); + } else { + throw jni_exception(std::string("CUDA ERROR: code ") + + std::to_string(static_cast(cuda_status))); } } } @@ -815,11 +818,17 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { jstring j_msg = env->NewStringUTF(n_msg.c_str()); \ jint e_code = static_cast(e.error_code()); \ jclass ex_class = env->FindClass(class_name); \ + if (ex_class == NULL) { \ + return ret_val; \ + } \ jmethodID ctor_id = env->GetMethodID(ex_class, "", "(Ljava/lang/String;I)V"); \ if (ctor_id == NULL) { \ return ret_val; \ } \ jobject cuda_error = env->NewObject(ex_class, ctor_id, j_msg, e_code); \ + if (cuda_error == NULL) { \ + return ret_val; \ + } \ env->Throw((jthrowable)cuda_error); \ return ret_val; \ } From 16c5e70aac43674246a1f041d8d96bbfa63c1baa Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Thu, 21 Apr 2022 10:56:23 +0800 Subject: [PATCH 23/26] update --- java/src/main/native/src/RmmJni.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 5dfd39b60de..ce3e6ffb285 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include From 907c67e9ebea5a17335f808fa5f52cafb5c85b9d Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Fri, 22 Apr 2022 09:54:48 +0800 Subject: [PATCH 24/26] Update java/src/main/native/include/jni_utils.hpp Co-authored-by: Jason Lowe --- java/src/main/native/include/jni_utils.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index e1333aa2319..fcf1cbc4b0b 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -778,11 +778,9 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { jthrowable jt = cuda_exception(env, cuda_status); if (jt != NULL) { env->Throw(jt); - throw jni_exception("CUDA ERROR"); - } else { - throw jni_exception(std::string("CUDA ERROR: code ") + - std::to_string(static_cast(cuda_status))); } + throw jni_exception(std::string("CUDA ERROR: code ") + + std::to_string(static_cast(cuda_status))); } } From e313a2a6227b550b5f94dfa2ad87e9f33e7a8d03 Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Fri, 22 Apr 2022 09:55:00 +0800 Subject: [PATCH 25/26] Update java/src/main/native/include/jni_utils.hpp Co-authored-by: Jason Lowe --- java/src/main/native/include/jni_utils.hpp | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index fcf1cbc4b0b..96b28bdb5a7 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -816,18 +816,15 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { jstring j_msg = env->NewStringUTF(n_msg.c_str()); \ jint e_code = static_cast(e.error_code()); \ jclass ex_class = env->FindClass(class_name); \ - if (ex_class == NULL) { \ - return ret_val; \ - } \ - jmethodID ctor_id = env->GetMethodID(ex_class, "", "(Ljava/lang/String;I)V"); \ - if (ctor_id == NULL) { \ - return ret_val; \ - } \ - jobject cuda_error = env->NewObject(ex_class, ctor_id, j_msg, e_code); \ - if (cuda_error == NULL) { \ - return ret_val; \ - } \ - env->Throw((jthrowable)cuda_error); \ + if (ex_class != NULL) { \ + jmethodID ctor_id = env->GetMethodID(ex_class, "", "(Ljava/lang/String;I)V"); \ + if (ctor_id != NULL) { \ + jobject cuda_error = env->NewObject(ex_class, ctor_id, j_msg, e_code); \ + if (cuda_error != NULL) { \ + env->Throw((jthrowable)cuda_error); \ + } \ + } \ + } \ return ret_val; \ } From 50bfc2cd0a974cbe035d90158e7de2226ce469c3 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Fri, 22 Apr 2022 10:38:20 +0800 Subject: [PATCH 26/26] fix --- java/src/main/native/include/jni_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 96b28bdb5a7..eca424132a5 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -824,7 +824,7 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { env->Throw((jthrowable)cuda_error); \ } \ } \ - } \ + } \ return ret_val; \ }