rapidsai · rapids-bot · Apr 24, 2022 · Mar 31, 2022 · Mar 31, 2022 · Mar 31, 2022
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,11 +28,150 @@
  * don't switch between threads for different parts of processing that can be retried as a chunk.
  */
 public class CudaException extends RuntimeException {
-  CudaException(String message) {
+  CudaException(String message, String cudaErrorName) {
     super(message);
+    cudaError = CudaError.valueOf(cudaErrorName);
   }
 
-  CudaException(String message, Throwable cause) {
+  CudaException(String message, String cudaErrorName, Throwable cause) {
     super(message, cause);
+    cudaError = CudaError.valueOf(cudaErrorName);
+  }
+
+  public final CudaError cudaError;
+
+  /**
+   * The Java mirror of cudaError, which facilities the tracking of CUDA errors in JVM.
+   */
+  public enum CudaError {
+    cudaErrorInvalidValue(1),
+    cudaErrorMemoryAllocation(2),
+    cudaErrorInitializationError(3),
+    cudaErrorCudartUnloading(4),
+    cudaErrorProfilerDisabled(5),
+    cudaErrorProfilerNotInitialized(6),
+    cudaErrorProfilerAlreadyStarted(7),
+    cudaErrorProfilerAlreadyStopped(8),
+    cudaErrorInvalidConfiguration(9),
+    cudaErrorInvalidPitchValue(12),
+    cudaErrorInvalidSymbol(13),
+    cudaErrorInvalidHostPointer(16),
+    cudaErrorInvalidDevicePointer(17),
+    cudaErrorInvalidTexture(18),
+    cudaErrorInvalidTextureBinding(19),
+    cudaErrorInvalidChannelDescriptor(20),
+    cudaErrorInvalidMemcpyDirection(21),
+    cudaErrorAddressOfConstant(22),
+    cudaErrorTextureFetchFailed(23),
+    cudaErrorTextureNotBound(24),
+    cudaErrorSynchronizationError(25),
+    cudaErrorInvalidFilterSetting(26),
+    cudaErrorInvalidNormSetting(27),
+    cudaErrorMixedDeviceExecution(28),
+    cudaErrorNotYetImplemented(31),
+    cudaErrorMemoryValueTooLarge(32),
+    cudaErrorStubLibrary(34),
+    cudaErrorInsufficientDriver(35),
+    cudaErrorCallRequiresNewerDriver(36),
+    cudaErrorInvalidSurface(37),
+    cudaErrorDuplicateVariableName(43),
+    cudaErrorDuplicateTextureName(44),
+    cudaErrorDuplicateSurfaceName(45),
+    cudaErrorDevicesUnavailable(46),
+    cudaErrorIncompatibleDriverContext(49),
+    cudaErrorMissingConfiguration(52),
+    cudaErrorPriorLaunchFailure(53),
+    cudaErrorLaunchMaxDepthExceeded(65),
+    cudaErrorLaunchFileScopedTex(66),
+    cudaErrorLaunchFileScopedSurf(67),
+    cudaErrorSyncDepthExceeded(68),
+    cudaErrorLaunchPendingCountExceeded(69),
+    cudaErrorInvalidDeviceFunction(98),
+    cudaErrorNoDevice(100),
+    cudaErrorInvalidDevice(101),
+    cudaErrorDeviceNotLicensed(102),
+    cudaErrorSoftwareValidityNotEstablished(103),
+    cudaErrorStartupFailure(127),
+    cudaErrorInvalidKernelImage(200),
+    cudaErrorDeviceUninitialized(201),
+    cudaErrorMapBufferObjectFailed(205),
+    cudaErrorUnmapBufferObjectFailed(206),
+    cudaErrorArrayIsMapped(207),
+    cudaErrorAlreadyMapped(208),
+    cudaErrorNoKernelImageForDevice(209),
+    cudaErrorAlreadyAcquired(210),
+    cudaErrorNotMapped(211),
+    cudaErrorNotMappedAsArray(212),
+    cudaErrorNotMappedAsPointer(213),
+    cudaErrorECCUncorrectable(214),
+    cudaErrorUnsupportedLimit(215),
+    cudaErrorDeviceAlreadyInUse(216),
+    cudaErrorPeerAccessUnsupported(217),
+    cudaErrorInvalidPtx(218),
+    cudaErrorInvalidGraphicsContext(219),
+    cudaErrorNvlinkUncorrectable(220),
+    cudaErrorJitCompilerNotFound(221),
+    cudaErrorUnsupportedPtxVersion(222),
+    cudaErrorJitCompilationDisabled(223),
+    cudaErrorUnsupportedExecAffinity(224),
+    cudaErrorInvalidSource(300),
+    cudaErrorFileNotFound(301),
+    cudaErrorSharedObjectSymbolNotFound(302),
+    cudaErrorSharedObjectInitFailed(303),
+    cudaErrorOperatingSystem(304),
+    cudaErrorInvalidResourceHandle(400),
+    cudaErrorIllegalState(401),
+    cudaErrorSymbolNotFound(500),
+    cudaErrorNotReady(600),
+    cudaErrorIllegalAddress(700),
+    cudaErrorLaunchOutOfResources(701),
+    cudaErrorLaunchTimeout(702),
+    cudaErrorLaunchIncompatibleTexturing(703),
+    cudaErrorPeerAccessAlreadyEnabled(704),
+    cudaErrorPeerAccessNotEnabled(705),
+    cudaErrorSetOnActiveProcess(708),
+    cudaErrorContextIsDestroyed(709),
+    cudaErrorAssert(710),
+    cudaErrorTooManyPeers(711),
+    cudaErrorHostMemoryAlreadyRegistered(712),
+    cudaErrorHostMemoryNotRegistered(713),
+    cudaErrorHardwareStackError(714),
+    cudaErrorIllegalInstruction(715),
+    cudaErrorMisalignedAddress(716),
+    cudaErrorInvalidAddressSpace(717),
+    cudaErrorInvalidPc(718),
+    cudaErrorLaunchFailure(719),
+    cudaErrorCooperativeLaunchTooLarge(720),
+    cudaErrorNotPermitted(800),
+    cudaErrorNotSupported(801),
+    cudaErrorSystemNotReady(802),
+    cudaErrorSystemDriverMismatch(803),
+    cudaErrorCompatNotSupportedOnDevice(804),
+    cudaErrorMpsConnectionFailed(805),
+    cudaErrorMpsRpcFailure(806),
+    cudaErrorMpsServerNotReady(807),
+    cudaErrorMpsMaxClientsReached(808),
+    cudaErrorMpsMaxConnectionsReached(809),
+    cudaErrorStreamCaptureUnsupported(900),
+    cudaErrorStreamCaptureInvalidated(901),
+    cudaErrorStreamCaptureMerge(902),
+    cudaErrorStreamCaptureUnmatched(903),
+    cudaErrorStreamCaptureUnjoined(904),
+    cudaErrorStreamCaptureIsolation(905),
+    cudaErrorStreamCaptureImplicit(906),
+    cudaErrorCapturedEvent(907),
+    cudaErrorStreamCaptureWrongThread(908),
+    cudaErrorTimeout(909),
+    cudaErrorGraphExecUpdateFailure(910),
+    cudaErrorExternalDevice(911),
+    cudaErrorUnknown(999),
+    cudaErrorApiFailureBase(10000);
+
+    final int code;
+
+    CudaError(int errorCode) {
+      this.code = errorCode;
+    }
+
   }
 }
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package ai.rapids.cudf;
+
+/**
+ * CudaFatalException is a kind of CudaException which leaves the process in an inconsistent state
+ * and any further CUDA work will return the same error.
+ * To continue using CUDA, the process must be terminated and relaunched.
+ */
+public class CudaFatalException extends CudaException {
+  CudaFatalException(String message, String cudaErrorName) {
+    super(message, cudaErrorName);
+  }
+
+  CudaFatalException(String message, String cudaErrorName, Throwable cause) {
+    super(message, cudaErrorName, cause);
+  }
+}
@@ -30,6 +30,7 @@ namespace jni {
 constexpr jint MINIMUM_JNI_VERSION = JNI_VERSION_1_6;
 
 constexpr char const *CUDA_ERROR_CLASS = "ai/rapids/cudf/CudaException";
+constexpr char const *CUDA_FATAL_ERROR_CLASS = "ai/rapids/cudf/CudaFatalException";
 constexpr char const *CUDF_ERROR_CLASS = "ai/rapids/cudf/CudfException";
 constexpr char const *INDEX_OOB_CLASS = "java/lang/ArrayIndexOutOfBoundsException";
 constexpr char const *ILLEGAL_ARG_CLASS = "java/lang/IllegalArgumentException";
@@ -733,41 +734,6 @@ class native_jstringArray {
   }
 };
 
-/**
- * @brief create a cuda exception from a given cudaError_t
- */
-inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowable cause = NULL) {
-  jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS);
-  if (ex_class == NULL) {
-    return NULL;
-  }
-  jmethodID ctor_id =
-      env->GetMethodID(ex_class, "<init>", "(Ljava/lang/String;Ljava/lang/Throwable;)V");
-  if (ctor_id == NULL) {
-    return NULL;
-  }
-
-  jstring msg = env->NewStringUTF(cudaGetErrorString(status));
-  if (msg == NULL) {
-    return NULL;
-  }
-
-  jobject ret = env->NewObject(ex_class, ctor_id, msg, cause);
-  return (jthrowable)ret;
-}
-
-inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
-  if (cudaSuccess != cuda_status) {
-    // Clear the last error so it does not propagate.
-    cudaGetLastError();
-    jthrowable jt = cuda_exception(env, cuda_status);
-    if (jt != NULL) {
-      env->Throw(jt);
-      throw jni_exception("CUDA ERROR");
-    }
-  }
-}
-
 } // namespace jni
 } // namespace cudf
 
@@ -790,19 +756,42 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
     JNI_THROW_NEW(env, class_name, message, ret_val)                                               \
   }
 
+// Throw a new exception only if one is not pending then always return with the specified value
+#define JNI_CHECK_THROW_NEW_CUDA_ERROR(env, class_name, e, ret_val)                                \
+  do {                                                                                             \
+    if (env->ExceptionOccurred()) {                                                                \
+      return ret_val;                                                                              \
+    }                                                                                              \
+    std::string n_msg = std::string("CUDA ERROR: ") + (e.what() == nullptr ? "" : e.what());       \
+    jstring j_msg = env->NewStringUTF(n_msg.c_str());                                              \
+    const char *n_name = cudaGetErrorName(e.error_code());                                         \
+    jstring j_name = env->NewStringUTF(n_name);                                                    \
+    jclass ex_class = env->FindClass(class_name);                                                  \
+    jmethodID ctor_id =                                                                            \
+        env->GetMethodID(ex_class, "<init>", "(Ljava/lang/String;Ljava/lang/String;)V");           \
+    if (ctor_id == NULL) {                                                                         \
+      return ret_val;                                                                              \
+    }                                                                                              \
+    jobject cuda_error = env->NewObject(ex_class, ctor_id, j_msg, j_name);                         \
+    env->Throw((jthrowable)cuda_error);                                                            \
+    return ret_val;                                                                                \
+  } while (0)
+
 #define JNI_CUDA_TRY(env, ret_val, call)                                                           \
-  {                                                                                                \
+  do {                                                                                             \
     cudaError_t internal_cuda_status = (call);                                                     \
     if (cudaSuccess != internal_cuda_status) {                                                     \
-      /* Clear the last error so it does not propagate.*/                                          \
-      cudaGetLastError();                                                                          \
-      jthrowable jt = cudf::jni::cuda_exception(env, internal_cuda_status);                        \
-      if (jt != NULL) {                                                                            \
-        env->Throw(jt);                                                                            \
-      }                                                                                            \
+      cudf::detail::throw_cuda_error(internal_cuda_status, __FILE__, __LINE__);                    \
       return ret_val;                                                                              \
     }                                                                                              \
-  }
+  } while (0)
+
+#define JNI_CUDA_CHECK(env, cuda_status)                                                           \
+  do {                                                                                             \
+    if (cudaSuccess != cuda_status) {                                                              \
+      cudf::detail::throw_cuda_error(cuda_status, __FILE__, __LINE__);                             \
+    }                                                                                              \
+  } while (0)
 
 #define JNI_NULL_CHECK(env, obj, error_msg, ret_val)                                               \
   {                                                                                                \
@@ -831,6 +820,12 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
         std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what()); \
     JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val);                         \
   }                                                                                                \
+  catch (const cudf::fatal_cuda_error &e) {                                                        \
+    JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, ret_val);            \
+  }                                                                                                \
+  catch (const cudf::cuda_error &e) {                                                              \
+    JNI_CHECK_THROW_NEW_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val);                  \
+  }                                                                                                \
   catch (const std::exception &e) {                                                                \
     /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \
     JNI_CHECK_THROW_NEW(env, class_name, e.what(), ret_val);                                       \

@@ -41,19 +41,23 @@ void set_cudf_device(int device) {
  * is using the same device.
  */
 void auto_set_device(JNIEnv *env) {
-  if (Cudf_device != cudaInvalidDeviceId) {
-    if (Thread_device != Cudf_device) {
-      cudaError_t cuda_status = cudaSetDevice(Cudf_device);
-      jni_cuda_check(env, cuda_status);
-      Thread_device = Cudf_device;
+  try {
+    if (Cudf_device != cudaInvalidDeviceId) {
+      if (Thread_device != Cudf_device) {
+        JNI_CUDA_CHECK(env, cudaSetDevice(Cudf_device));
+        Thread_device = Cudf_device;
+      }
     }
   }
+  CATCH_STD(env, );
 }
 
 /** Fills all the bytes in the buffer 'buf' with 'value'. */
 void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value) {
-  cudaError_t cuda_status = cudaMemsetAsync((void *)buf.data(), value, buf.size());
-  jni_cuda_check(env, cuda_status);
+  try {
+    JNI_CUDA_CHECK(env, cudaMemsetAsync((void *)buf.data(), value, buf.size()));
+  }
+  CATCH_STD(env, );
 }
 
 } // namespace jni

diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
@@ -327,11 +327,9 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j
                                                                   jstring jpath, jlong pool_size) {
   try {
     // make sure the CUDA device is setup in the context
-    cudaError_t cuda_status = cudaFree(0);
-    cudf::jni::jni_cuda_check(env, cuda_status);
+    JNI_CUDA_CHECK(env, cudaFree(0));
     int device_id;
-    cuda_status = cudaGetDevice(&device_id);
-    cudf::jni::jni_cuda_check(env, cuda_status);
+    JNI_CUDA_CHECK(env, cudaGetDevice(&device_id));
 
     bool use_pool_alloc = allocation_mode & 1;
     bool use_managed_mem = allocation_mode & 2;

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 import org.junit.jupiter.api.Test;
 
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.*;
 
 public class CudaTest {
 
@@ -32,4 +32,17 @@ public void testGetCudaRuntimeInfo() {
     assertEquals(Cuda.getNativeComputeMode(), Cuda.getComputeMode().nativeId);
   }
 
+  @Test
+  public void testCudaException() {
+    assertThrows(CudaException.class, () -> {
+          try {
+            Cuda.memset(Long.MAX_VALUE, (byte) 0, 1024);
+          } catch (CudaFatalException ignored) {
+          } catch (CudaException ex) {
+            assertEquals(CudaException.CudaError.cudaErrorInvalidValue, ex.cudaError);
+            throw ex;
+          }
+        }
+    );
+  }
 }