Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JNI: throw CUDA errors more specifically #10551

Merged
merged 29 commits into from
Apr 24, 2022
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 142 additions & 3 deletions java/src/main/java/ai/rapids/cudf/CudaException.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -28,11 +28,150 @@
* don't switch between threads for different parts of processing that can be retried as a chunk.
*/
public class CudaException extends RuntimeException {
CudaException(String message) {
CudaException(String message, String cudaErrorName) {
super(message);
cudaError = CudaError.valueOf(cudaErrorName);
jlowe marked this conversation as resolved.
Show resolved Hide resolved
}

CudaException(String message, Throwable cause) {
CudaException(String message, String cudaErrorName, Throwable cause) {
super(message, cause);
cudaError = CudaError.valueOf(cudaErrorName);
}

public final CudaError cudaError;

/**
* The Java mirror of cudaError, which facilitates the tracking of CUDA errors in the JVM.
*/
public enum CudaError {
cudaErrorInvalidValue(1),
cudaErrorMemoryAllocation(2),
cudaErrorInitializationError(3),
cudaErrorCudartUnloading(4),
cudaErrorProfilerDisabled(5),
cudaErrorProfilerNotInitialized(6),
cudaErrorProfilerAlreadyStarted(7),
cudaErrorProfilerAlreadyStopped(8),
cudaErrorInvalidConfiguration(9),
cudaErrorInvalidPitchValue(12),
cudaErrorInvalidSymbol(13),
cudaErrorInvalidHostPointer(16),
cudaErrorInvalidDevicePointer(17),
cudaErrorInvalidTexture(18),
cudaErrorInvalidTextureBinding(19),
cudaErrorInvalidChannelDescriptor(20),
cudaErrorInvalidMemcpyDirection(21),
cudaErrorAddressOfConstant(22),
cudaErrorTextureFetchFailed(23),
cudaErrorTextureNotBound(24),
cudaErrorSynchronizationError(25),
cudaErrorInvalidFilterSetting(26),
cudaErrorInvalidNormSetting(27),
cudaErrorMixedDeviceExecution(28),
cudaErrorNotYetImplemented(31),
cudaErrorMemoryValueTooLarge(32),
cudaErrorStubLibrary(34),
cudaErrorInsufficientDriver(35),
cudaErrorCallRequiresNewerDriver(36),
cudaErrorInvalidSurface(37),
cudaErrorDuplicateVariableName(43),
cudaErrorDuplicateTextureName(44),
cudaErrorDuplicateSurfaceName(45),
cudaErrorDevicesUnavailable(46),
cudaErrorIncompatibleDriverContext(49),
cudaErrorMissingConfiguration(52),
cudaErrorPriorLaunchFailure(53),
cudaErrorLaunchMaxDepthExceeded(65),
cudaErrorLaunchFileScopedTex(66),
cudaErrorLaunchFileScopedSurf(67),
cudaErrorSyncDepthExceeded(68),
cudaErrorLaunchPendingCountExceeded(69),
cudaErrorInvalidDeviceFunction(98),
cudaErrorNoDevice(100),
cudaErrorInvalidDevice(101),
cudaErrorDeviceNotLicensed(102),
cudaErrorSoftwareValidityNotEstablished(103),
cudaErrorStartupFailure(127),
cudaErrorInvalidKernelImage(200),
cudaErrorDeviceUninitialized(201),
cudaErrorMapBufferObjectFailed(205),
cudaErrorUnmapBufferObjectFailed(206),
cudaErrorArrayIsMapped(207),
cudaErrorAlreadyMapped(208),
cudaErrorNoKernelImageForDevice(209),
cudaErrorAlreadyAcquired(210),
cudaErrorNotMapped(211),
cudaErrorNotMappedAsArray(212),
cudaErrorNotMappedAsPointer(213),
cudaErrorECCUncorrectable(214),
cudaErrorUnsupportedLimit(215),
cudaErrorDeviceAlreadyInUse(216),
cudaErrorPeerAccessUnsupported(217),
cudaErrorInvalidPtx(218),
cudaErrorInvalidGraphicsContext(219),
cudaErrorNvlinkUncorrectable(220),
cudaErrorJitCompilerNotFound(221),
cudaErrorUnsupportedPtxVersion(222),
cudaErrorJitCompilationDisabled(223),
cudaErrorUnsupportedExecAffinity(224),
cudaErrorInvalidSource(300),
cudaErrorFileNotFound(301),
cudaErrorSharedObjectSymbolNotFound(302),
cudaErrorSharedObjectInitFailed(303),
cudaErrorOperatingSystem(304),
cudaErrorInvalidResourceHandle(400),
cudaErrorIllegalState(401),
cudaErrorSymbolNotFound(500),
cudaErrorNotReady(600),
cudaErrorIllegalAddress(700),
cudaErrorLaunchOutOfResources(701),
cudaErrorLaunchTimeout(702),
cudaErrorLaunchIncompatibleTexturing(703),
cudaErrorPeerAccessAlreadyEnabled(704),
cudaErrorPeerAccessNotEnabled(705),
cudaErrorSetOnActiveProcess(708),
cudaErrorContextIsDestroyed(709),
cudaErrorAssert(710),
cudaErrorTooManyPeers(711),
cudaErrorHostMemoryAlreadyRegistered(712),
cudaErrorHostMemoryNotRegistered(713),
cudaErrorHardwareStackError(714),
cudaErrorIllegalInstruction(715),
cudaErrorMisalignedAddress(716),
cudaErrorInvalidAddressSpace(717),
cudaErrorInvalidPc(718),
cudaErrorLaunchFailure(719),
cudaErrorCooperativeLaunchTooLarge(720),
cudaErrorNotPermitted(800),
cudaErrorNotSupported(801),
cudaErrorSystemNotReady(802),
cudaErrorSystemDriverMismatch(803),
cudaErrorCompatNotSupportedOnDevice(804),
cudaErrorMpsConnectionFailed(805),
cudaErrorMpsRpcFailure(806),
cudaErrorMpsServerNotReady(807),
cudaErrorMpsMaxClientsReached(808),
cudaErrorMpsMaxConnectionsReached(809),
cudaErrorStreamCaptureUnsupported(900),
cudaErrorStreamCaptureInvalidated(901),
cudaErrorStreamCaptureMerge(902),
cudaErrorStreamCaptureUnmatched(903),
cudaErrorStreamCaptureUnjoined(904),
cudaErrorStreamCaptureIsolation(905),
cudaErrorStreamCaptureImplicit(906),
cudaErrorCapturedEvent(907),
cudaErrorStreamCaptureWrongThread(908),
cudaErrorTimeout(909),
cudaErrorGraphExecUpdateFailure(910),
cudaErrorExternalDevice(911),
cudaErrorUnknown(999),
cudaErrorApiFailureBase(10000);

// Integer value supplied for this constant in the declaration list above.
// NOTE(review): presumably equal to the native cudaError_t value of the same name —
// confirm against the CUDA runtime headers.
final int code;

// Stores the integer value given in the constant's declaration.
CudaError(int errorCode) {
this.code = errorCode;
}

}
}
31 changes: 31 additions & 0 deletions java/src/main/java/ai/rapids/cudf/CudaFatalException.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ai.rapids.cudf;

/**
* CudaFatalException is a kind of CudaException which leaves the process in an inconsistent state
* and any further CUDA work will return the same error.
* To continue using CUDA, the process must be terminated and relaunched.
*/
public class CudaFatalException extends CudaException {
/**
 * Creates a fatal CUDA exception without an underlying cause.
 *
 * @param message the detail message, forwarded unchanged to {@link CudaException}
 * @param cudaErrorName the CUDA error name, forwarded unchanged to {@link CudaException},
 *                      which looks it up with {@code CudaError.valueOf}
 */
CudaFatalException(String message, String cudaErrorName) {
super(message, cudaErrorName);
}

/**
 * Creates a fatal CUDA exception that wraps an underlying cause.
 *
 * @param message the detail message, forwarded unchanged to {@link CudaException}
 * @param cudaErrorName the CUDA error name, forwarded unchanged to {@link CudaException},
 *                      which looks it up with {@code CudaError.valueOf}
 * @param cause the throwable recorded as the cause of this exception
 */
CudaFatalException(String message, String cudaErrorName, Throwable cause) {
super(message, cudaErrorName, cause);
}
}
81 changes: 38 additions & 43 deletions java/src/main/native/include/jni_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ namespace jni {
constexpr jint MINIMUM_JNI_VERSION = JNI_VERSION_1_6;

constexpr char const *CUDA_ERROR_CLASS = "ai/rapids/cudf/CudaException";
constexpr char const *CUDA_FATAL_ERROR_CLASS = "ai/rapids/cudf/CudaFatalException";
constexpr char const *CUDF_ERROR_CLASS = "ai/rapids/cudf/CudfException";
constexpr char const *INDEX_OOB_CLASS = "java/lang/ArrayIndexOutOfBoundsException";
constexpr char const *ILLEGAL_ARG_CLASS = "java/lang/IllegalArgumentException";
Expand Down Expand Up @@ -733,41 +734,6 @@ class native_jstringArray {
}
};

/**
* @brief create a cuda exception from a given cudaError_t
*/
inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowable cause = NULL) {
jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS);
if (ex_class == NULL) {
return NULL;
}
jmethodID ctor_id =
env->GetMethodID(ex_class, "<init>", "(Ljava/lang/String;Ljava/lang/Throwable;)V");
if (ctor_id == NULL) {
return NULL;
}

jstring msg = env->NewStringUTF(cudaGetErrorString(status));
if (msg == NULL) {
return NULL;
}

jobject ret = env->NewObject(ex_class, ctor_id, msg, cause);
return (jthrowable)ret;
}

inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
if (cudaSuccess != cuda_status) {
// Clear the last error so it does not propagate.
cudaGetLastError();
jthrowable jt = cuda_exception(env, cuda_status);
if (jt != NULL) {
env->Throw(jt);
throw jni_exception("CUDA ERROR");
}
}
}

} // namespace jni
} // namespace cudf

Expand All @@ -790,19 +756,42 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
JNI_THROW_NEW(env, class_name, message, ret_val) \
}

// Throw a new CUDA exception only if no exception is already pending, then always return the specified value.
#define JNI_CHECK_THROW_NEW_CUDA_ERROR(env, class_name, e, ret_val) \
do { \
if (env->ExceptionOccurred()) { \
return ret_val; \
} \
std::string n_msg = std::string("CUDA ERROR: ") + (e.what() == nullptr ? "" : e.what()); \
jlowe marked this conversation as resolved.
Show resolved Hide resolved
jstring j_msg = env->NewStringUTF(n_msg.c_str()); \
const char *n_name = cudaGetErrorName(e.error_code()); \
jstring j_name = env->NewStringUTF(n_name); \
jclass ex_class = env->FindClass(class_name); \
jlowe marked this conversation as resolved.
Show resolved Hide resolved
jmethodID ctor_id = \
env->GetMethodID(ex_class, "<init>", "(Ljava/lang/String;Ljava/lang/String;)V"); \
if (ctor_id == NULL) { \
return ret_val; \
} \
jobject cuda_error = env->NewObject(ex_class, ctor_id, j_msg, j_name); \
env->Throw((jthrowable)cuda_error); \
jlowe marked this conversation as resolved.
Show resolved Hide resolved
return ret_val; \
} while (0)

#define JNI_CUDA_TRY(env, ret_val, call) \
{ \
do { \
cudaError_t internal_cuda_status = (call); \
if (cudaSuccess != internal_cuda_status) { \
/* Clear the last error so it does not propagate.*/ \
cudaGetLastError(); \
jthrowable jt = cudf::jni::cuda_exception(env, internal_cuda_status); \
if (jt != NULL) { \
env->Throw(jt); \
} \
cudf::detail::throw_cuda_error(internal_cuda_status, __FILE__, __LINE__); \
return ret_val; \
} \
}
} while (0)

#define JNI_CUDA_CHECK(env, cuda_status) \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Best practice would be to put this in a do{...} while(0)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed. Thanks for advice!

jlowe marked this conversation as resolved.
Show resolved Hide resolved
do { \
if (cudaSuccess != cuda_status) { \
cudf::detail::throw_cuda_error(cuda_status, __FILE__, __LINE__); \
} \
} while (0)

#define JNI_NULL_CHECK(env, obj, error_msg, ret_val) \
{ \
Expand Down Expand Up @@ -831,6 +820,12 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what()); \
JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \
} \
catch (const cudf::fatal_cuda_error &e) { \
JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, ret_val); \
} \
catch (const cudf::cuda_error &e) { \
JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val); \
} \
catch (const std::exception &e) { \
/* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \
JNI_CHECK_THROW_NEW(env, class_name, e.what(), ret_val); \
Expand Down
18 changes: 11 additions & 7 deletions java/src/main/native/src/CudaJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,23 @@ void set_cudf_device(int device) {
* is using the same device.
*/
void auto_set_device(JNIEnv *env) {
if (Cudf_device != cudaInvalidDeviceId) {
if (Thread_device != Cudf_device) {
cudaError_t cuda_status = cudaSetDevice(Cudf_device);
jni_cuda_check(env, cuda_status);
Thread_device = Cudf_device;
try {
if (Cudf_device != cudaInvalidDeviceId) {
if (Thread_device != Cudf_device) {
JNI_CUDA_CHECK(env, cudaSetDevice(Cudf_device));
Thread_device = Cudf_device;
}
}
}
CATCH_STD(env, );
jlowe marked this conversation as resolved.
Show resolved Hide resolved
}

/** Fills all the bytes in the buffer 'buf' with 'value'. */
void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value) {
cudaError_t cuda_status = cudaMemsetAsync((void *)buf.data(), value, buf.size());
jni_cuda_check(env, cuda_status);
try {
JNI_CUDA_CHECK(env, cudaMemsetAsync((void *)buf.data(), value, buf.size()));
}
CATCH_STD(env, );
jlowe marked this conversation as resolved.
Show resolved Hide resolved
}

} // namespace jni
Expand Down
6 changes: 2 additions & 4 deletions java/src/main/native/src/RmmJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -327,11 +327,9 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j
jstring jpath, jlong pool_size) {
try {
// make sure the CUDA device is setup in the context
cudaError_t cuda_status = cudaFree(0);
cudf::jni::jni_cuda_check(env, cuda_status);
JNI_CUDA_CHECK(env, cudaFree(0));
int device_id;
cuda_status = cudaGetDevice(&device_id);
cudf::jni::jni_cuda_check(env, cuda_status);
JNI_CUDA_CHECK(env, cudaGetDevice(&device_id));

bool use_pool_alloc = allocation_mode & 1;
bool use_managed_mem = allocation_mode & 2;
Expand Down
17 changes: 15 additions & 2 deletions java/src/test/java/ai/rapids/cudf/CudaTest.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,7 +18,7 @@

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.*;

public class CudaTest {

Expand All @@ -32,4 +32,17 @@ public void testGetCudaRuntimeInfo() {
assertEquals(Cuda.getNativeComputeMode(), Cuda.getComputeMode().nativeId);
}

/**
 * Checks that a failing {@code Cuda.memset} call surfaces as a {@link CudaException} whose
 * {@code cudaError} field is {@code cudaErrorInvalidValue}.
 */
@Test
public void testCudaException() {
assertThrows(CudaException.class, () -> {
try {
// NOTE(review): Long.MAX_VALUE is presumably used as a deliberately invalid address so
// that the call fails — confirm against the Cuda.memset signature.
Cuda.memset(Long.MAX_VALUE, (byte) 0, 1024);
} catch (CudaFatalException ignored) {
// CudaFatalException extends CudaException, so it has to be caught first.
// NOTE(review): swallowing it means nothing propagates out of the lambda, so assertThrows
// fails when the error is reported as fatal — presumably intentional; confirm.
} catch (CudaException ex) {
// Verify the error code carried by the exception, then rethrow so assertThrows sees it.
assertEquals(CudaException.CudaError.cudaErrorInvalidValue, ex.cudaError);
throw ex;
}
}
);
}
}