JNI: throw CUDA errors more specifically #10551

Merged on Apr 24, 2022 (29 commits). Changes shown from 6 commits.
175 changes: 174 additions & 1 deletion java/src/main/java/ai/rapids/cudf/CudaException.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -15,6 +15,9 @@
*/
package ai.rapids.cudf;

import java.util.HashSet;
import java.util.Set;

/**
* Exception from the cuda language/library. Be aware that because of how cuda does asynchronous
* processing, exceptions from cuda can be thrown by method calls that did not cause the exception
@@ -30,9 +33,179 @@
public class CudaException extends RuntimeException {
CudaException(String message) {
super(message);
this.cudaError = extractCudaError(message);
}

CudaException(String message, Throwable cause) {
super(message, cause);
this.cudaError = extractCudaError(message);
}

public final CudaError cudaError;

/**
* The Java mirror of cudaError, which facilitates tracking CUDA errors in the JVM.
*/
public enum CudaError {
cudaErrorInvalidValue(1),
cudaErrorMemoryAllocation(2),
cudaErrorInitializationError(3),
cudaErrorCudartUnloading(4),
cudaErrorProfilerDisabled(5),
cudaErrorProfilerNotInitialized(6),
cudaErrorProfilerAlreadyStarted(7),
cudaErrorProfilerAlreadyStopped(8),
cudaErrorInvalidConfiguration(9),
cudaErrorInvalidPitchValue(12),
cudaErrorInvalidSymbol(13),
cudaErrorInvalidHostPointer(16),
cudaErrorInvalidDevicePointer(17),
cudaErrorInvalidTexture(18),
cudaErrorInvalidTextureBinding(19),
cudaErrorInvalidChannelDescriptor(20),
cudaErrorInvalidMemcpyDirection(21),
cudaErrorAddressOfConstant(22),
cudaErrorTextureFetchFailed(23),
cudaErrorTextureNotBound(24),
cudaErrorSynchronizationError(25),
cudaErrorInvalidFilterSetting(26),
cudaErrorInvalidNormSetting(27),
cudaErrorMixedDeviceExecution(28),
cudaErrorNotYetImplemented(31),
cudaErrorMemoryValueTooLarge(32),
cudaErrorStubLibrary(34),
cudaErrorInsufficientDriver(35),
cudaErrorCallRequiresNewerDriver(36),
cudaErrorInvalidSurface(37),
cudaErrorDuplicateVariableName(43),
cudaErrorDuplicateTextureName(44),
cudaErrorDuplicateSurfaceName(45),
cudaErrorDevicesUnavailable(46),
cudaErrorIncompatibleDriverContext(49),
cudaErrorMissingConfiguration(52),
cudaErrorPriorLaunchFailure(53),
cudaErrorLaunchMaxDepthExceeded(65),
cudaErrorLaunchFileScopedTex(66),
cudaErrorLaunchFileScopedSurf(67),
cudaErrorSyncDepthExceeded(68),
cudaErrorLaunchPendingCountExceeded(69),
cudaErrorInvalidDeviceFunction(98),
cudaErrorNoDevice(100),
cudaErrorInvalidDevice(101),
cudaErrorDeviceNotLicensed(102),
cudaErrorSoftwareValidityNotEstablished(103),
cudaErrorStartupFailure(127),
cudaErrorInvalidKernelImage(200),
cudaErrorDeviceUninitialized(201),
cudaErrorMapBufferObjectFailed(205),
cudaErrorUnmapBufferObjectFailed(206),
cudaErrorArrayIsMapped(207),
cudaErrorAlreadyMapped(208),
cudaErrorNoKernelImageForDevice(209),
cudaErrorAlreadyAcquired(210),
cudaErrorNotMapped(211),
cudaErrorNotMappedAsArray(212),
cudaErrorNotMappedAsPointer(213),
cudaErrorECCUncorrectable(214),
cudaErrorUnsupportedLimit(215),
cudaErrorDeviceAlreadyInUse(216),
cudaErrorPeerAccessUnsupported(217),
cudaErrorInvalidPtx(218),
cudaErrorInvalidGraphicsContext(219),
cudaErrorNvlinkUncorrectable(220),
cudaErrorJitCompilerNotFound(221),
cudaErrorUnsupportedPtxVersion(222),
cudaErrorJitCompilationDisabled(223),
cudaErrorUnsupportedExecAffinity(224),
cudaErrorInvalidSource(300),
cudaErrorFileNotFound(301),
cudaErrorSharedObjectSymbolNotFound(302),
cudaErrorSharedObjectInitFailed(303),
cudaErrorOperatingSystem(304),
cudaErrorInvalidResourceHandle(400),
cudaErrorIllegalState(401),
cudaErrorSymbolNotFound(500),
cudaErrorNotReady(600),
cudaErrorIllegalAddress(700),
cudaErrorLaunchOutOfResources(701),
cudaErrorLaunchTimeout(702),
cudaErrorLaunchIncompatibleTexturing(703),
cudaErrorPeerAccessAlreadyEnabled(704),
cudaErrorPeerAccessNotEnabled(705),
cudaErrorSetOnActiveProcess(708),
cudaErrorContextIsDestroyed(709),
cudaErrorAssert(710),
cudaErrorTooManyPeers(711),
cudaErrorHostMemoryAlreadyRegistered(712),
cudaErrorHostMemoryNotRegistered(713),
cudaErrorHardwareStackError(714),
cudaErrorIllegalInstruction(715),
cudaErrorMisalignedAddress(716),
cudaErrorInvalidAddressSpace(717),
cudaErrorInvalidPc(718),
cudaErrorLaunchFailure(719),
cudaErrorCooperativeLaunchTooLarge(720),
cudaErrorNotPermitted(800),
cudaErrorNotSupported(801),
cudaErrorSystemNotReady(802),
cudaErrorSystemDriverMismatch(803),
cudaErrorCompatNotSupportedOnDevice(804),
cudaErrorMpsConnectionFailed(805),
cudaErrorMpsRpcFailure(806),
cudaErrorMpsServerNotReady(807),
cudaErrorMpsMaxClientsReached(808),
cudaErrorMpsMaxConnectionsReached(809),
cudaErrorStreamCaptureUnsupported(900),
cudaErrorStreamCaptureInvalidated(901),
cudaErrorStreamCaptureMerge(902),
cudaErrorStreamCaptureUnmatched(903),
cudaErrorStreamCaptureUnjoined(904),
cudaErrorStreamCaptureIsolation(905),
cudaErrorStreamCaptureImplicit(906),
cudaErrorCapturedEvent(907),
cudaErrorStreamCaptureWrongThread(908),
cudaErrorTimeout(909),
cudaErrorGraphExecUpdateFailure(910),
cudaErrorExternalDevice(911),
cudaErrorUnknown(999),
cudaErrorApiFailureBase(10000);

final int code;

private static final Set<CudaError> stickyErrors = new HashSet<CudaError>(){{
add(CudaError.cudaErrorIllegalAddress);
add(CudaError.cudaErrorLaunchTimeout);
add(CudaError.cudaErrorHardwareStackError);
add(CudaError.cudaErrorIllegalInstruction);
add(CudaError.cudaErrorMisalignedAddress);
add(CudaError.cudaErrorInvalidAddressSpace);
add(CudaError.cudaErrorInvalidPc);
add(CudaError.cudaErrorLaunchFailure);
add(CudaError.cudaErrorExternalDevice);
add(CudaError.cudaErrorUnknown);
}};
Contributor:
This isn't very robust. I described a more robust way to detect sticky errors here: #10200 (comment)

Soon I hope to have libcudf throw a separate exception type for sticky errors.
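
A minimal sketch of one possible runtime check (the approach described in #10200 may differ, and per the comment above the real check would ultimately live in libcudf): the defining property of a sticky error is that every subsequent CUDA runtime call reports it again, so clearing the pending error and issuing a trivial call distinguishes a corrupted context from a single failed call.

```cpp
// Sketch only: a runtime probe for sticky errors using the CUDA runtime API directly.
#include <cuda_runtime_api.h>

inline bool cuda_error_is_sticky() {
  // Clear the currently pending error; a sticky error cannot actually be cleared,
  // which is exactly what the probe relies on.
  cudaGetLastError();
  // Any further CUDA work re-reports a sticky error, so a trivial call suffices.
  cudaError_t status = cudaDeviceSynchronize();
  return status != cudaSuccess;
}
```

This is also why a hard-coded stickyErrors set is fragile: it has to track whatever each CUDA release treats as fatal.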

Member:
+1, it would be good to align this with what's going on in that other issue.

Contributor Author @sperlingxx (Apr 1, 2022):
I totally agree with @jrhemstad. So, shall we put the JNI-side work on hold for the time being, until libcudf's CUDA error handling is enhanced?

Member:
The RAPIDS Accelerator is already addressing this in the short term at NVIDIA/spark-rapids#5118. Therefore I'd rather we take the time here to leverage a proper interface in libcudf than rush this in and then need to change it when libcudf refines its exception handling soon afterwards.

Contributor Author:
Hi @jlowe, I reworked the PR. For now, it pushes the sticky-error detection down to libcudf.

Contributor Author:
However, I am stuck on how to trigger a fatal CUDA error through the unit test.


CudaError(int errorCode) {
this.code = errorCode;
}

/**
* Returns whether this CudaError is sticky or not.
*
* Sticky errors leave the process in an inconsistent state and any further CUDA work will return
* the same error. To continue using CUDA, the process must be terminated and relaunched.
*/
public boolean isSticky() {
return stickyErrors.contains(this);
}
}

private static CudaError extractCudaError(String message) {
for (String segment : message.split(" ")) {
if (segment.startsWith("cudaError")) {
return CudaError.valueOf(segment);
}
}
throw new CudfException("invalid CUDA error message: " + message);
}
}
48 changes: 31 additions & 17 deletions java/src/main/native/include/jni_utils.hpp
@@ -736,7 +736,8 @@ class native_jstringArray {
/**
* @brief create a cuda exception from a given cudaError_t
*/
-inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowable cause = NULL) {
+inline jthrowable cuda_exception(JNIEnv *const env, const char *file, unsigned int line,
+                                 cudaError_t status, jthrowable cause = NULL) {
jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS);
if (ex_class == NULL) {
return NULL;
@@ -747,25 +748,21 @@ inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowab
return NULL;
}

-  jstring msg = env->NewStringUTF(cudaGetErrorString(status));
-  if (msg == NULL) {
+  const char *err_name = cudaGetErrorName(status);
+  if (err_name == nullptr) {
     return NULL;
   }
+  const char *err_string = cudaGetErrorString(status);
 
-  jobject ret = env->NewObject(ex_class, ctor_id, msg, cause);
-  return (jthrowable)ret;
-}
+  // Build the error message in the format of cudf::cuda_error, so that cudf::jni::CUDA_ERROR_CLASS
+  // can parse both of them.
+  std::string n_msg = "CUDA error encountered at: " + std::string{file} + ":" +
+                      std::to_string(line) + ": " + std::to_string(status) + " " + err_name + " " +
+                      err_string;
+  jstring j_msg = env->NewStringUTF(n_msg.c_str());
 
-inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
-  if (cudaSuccess != cuda_status) {
-    // Clear the last error so it does not propagate.
-    cudaGetLastError();
-    jthrowable jt = cuda_exception(env, cuda_status);
-    if (jt != NULL) {
-      env->Throw(jt);
-      throw jni_exception("CUDA ERROR");
-    }
-  }
+  jobject ret = env->NewObject(ex_class, ctor_id, j_msg, cause);
+  return (jthrowable)ret;
 }
 
 } // namespace jni

Contributor @jrhemstad (Mar 31, 2022), commenting on the n_msg format above:
I wouldn't count on the contents of that exception message being stable.

Member:
I'd rather see cudf::cuda_error updated to allow extraction of the CUDA error ID rather than relying on parsing. In general we should be moving away from string-scraping for error identification, not adding more instances of it.

Contributor:
Filed #10553
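
To make that direction concrete, a hypothetical sketch (not part of this PR, and not necessarily the shape #10553 will take) of a cudf::cuda_error that carries the numeric error code, which would let the JNI layer build CudaException from the code directly instead of re-parsing what():

```cpp
// Hypothetical sketch only: cudf::cuda_error carrying the raw cudaError_t.
#include <cuda_runtime_api.h>
#include <stdexcept>
#include <string>

namespace cudf {
struct cuda_error : public std::runtime_error {
  cuda_error(std::string const &msg, cudaError_t code)
      : std::runtime_error(msg), _code(code) {}
  // JNI (and other callers) could read the code instead of scraping the message.
  cudaError_t error_code() const noexcept { return _code; }

 private:
  cudaError_t _code;
};
}  // namespace cudf
```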
@@ -796,14 +793,27 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
if (cudaSuccess != internal_cuda_status) { \
/* Clear the last error so it does not propagate.*/ \
cudaGetLastError(); \
- jthrowable jt = cudf::jni::cuda_exception(env, internal_cuda_status); \
+ jthrowable jt = cudf::jni::cuda_exception(env, __FILE__, __LINE__, internal_cuda_status); \
if (jt != NULL) { \
env->Throw(jt); \
} \
return ret_val; \
} \
}

#define JNI_CUDA_CHECK(env, cuda_status) \
{ \
if (cudaSuccess != cuda_status) { \
/* Clear the last error so it does not propagate.*/ \
cudaGetLastError(); \
jthrowable jt = cudf::jni::cuda_exception(env, __FILE__, __LINE__, cuda_status); \
if (jt != NULL) { \
env->Throw(jt); \
throw cudf::jni::jni_exception("CUDA ERROR"); \
} \
} \
}

Contributor:
Best practice would be to put this in a do { ... } while (0).

Contributor Author:
Changed. Thanks for the advice!
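
The later commits are not shown in this view, but the suggested form would look roughly like the sketch below; wrapping the body in do { ... } while (0) turns the macro into a single statement, so it nests safely inside if/else without dangling-else or stray-semicolon surprises.

```cpp
// Sketch of the suggested do { ... } while (0) form (the merged version may differ).
#define JNI_CUDA_CHECK(env, cuda_status)                                                 \
  do {                                                                                   \
    if (cudaSuccess != (cuda_status)) {                                                  \
      /* Clear the last error so it does not propagate. */                               \
      cudaGetLastError();                                                                \
      jthrowable jt = cudf::jni::cuda_exception(env, __FILE__, __LINE__, (cuda_status)); \
      if (jt != NULL) {                                                                  \
        env->Throw(jt);                                                                  \
        throw cudf::jni::jni_exception("CUDA ERROR");                                    \
      }                                                                                  \
    }                                                                                    \
  } while (0)
```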

#define JNI_NULL_CHECK(env, obj, error_msg, ret_val) \
{ \
if ((obj) == 0) { \
Expand Down Expand Up @@ -831,6 +841,10 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what()); \
JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \
} \
catch (const cudf::cuda_error &e) { \
/* For CUDA errors, the specific error code will be extracted from error message. */ \
JNI_CHECK_THROW_NEW(env, cudf::jni::CUDA_ERROR_CLASS, e.what(), ret_val); \
} \
catch (const std::exception &e) { \
/* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \
JNI_CHECK_THROW_NEW(env, class_name, e.what(), ret_val); \
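For context, this is roughly how those macros are used from a JNI entry point: the function body is wrapped in try/catch clauses like the ones above (in cudf's JNI code that wrapper is the CATCH_STD macro; its exact shape is assumed here), so a cudf::cuda_error thrown inside libcudf reaches Java as an ai.rapids.cudf.CudaException whose message carries the error name. The function name below is illustrative, not an excerpt from the repository.

```cpp
// Sketch of how a cudf::cuda_error propagates to Java through these macros.
extern "C" JNIEXPORT jlong JNICALL
Java_ai_rapids_cudf_Example_doThing(JNIEnv *env, jclass, jlong input_handle) {
  JNI_NULL_CHECK(env, input_handle, "input column is null", 0);
  try {
    cudf::jni::auto_set_device(env);
    // ... call into libcudf; a CUDA failure inside libcudf throws cudf::cuda_error ...
    return 0;
  }
  CATCH_STD(env, 0);  // the cudf::cuda_error clause above rethrows it as CUDA_ERROR_CLASS
}
```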
6 changes: 3 additions & 3 deletions java/src/main/native/src/CudaJni.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -44,7 +44,7 @@ void auto_set_device(JNIEnv *env) {
if (Cudf_device != cudaInvalidDeviceId) {
if (Thread_device != Cudf_device) {
cudaError_t cuda_status = cudaSetDevice(Cudf_device);
- jni_cuda_check(env, cuda_status);
+ JNI_CUDA_CHECK(env, cuda_status);
Thread_device = Cudf_device;
}
}
@@ -53,7 +53,7 @@ void auto_set_device(JNIEnv *env) {
/** Fills all the bytes in the buffer 'buf' with 'value'. */
void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value) {
cudaError_t cuda_status = cudaMemsetAsync((void *)buf.data(), value, buf.size());
- jni_cuda_check(env, cuda_status);
+ JNI_CUDA_CHECK(env, cuda_status);
}

} // namespace jni
4 changes: 2 additions & 2 deletions java/src/main/native/src/RmmJni.cpp
@@ -328,10 +328,10 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j
try {
// make sure the CUDA device is setup in the context
cudaError_t cuda_status = cudaFree(0);
- cudf::jni::jni_cuda_check(env, cuda_status);
+ JNI_CUDA_CHECK(env, cuda_status);
int device_id;
cuda_status = cudaGetDevice(&device_id);
- cudf::jni::jni_cuda_check(env, cuda_status);
+ JNI_CUDA_CHECK(env, cuda_status);

bool use_pool_alloc = allocation_mode & 1;
bool use_managed_mem = allocation_mode & 2;
17 changes: 15 additions & 2 deletions java/src/test/java/ai/rapids/cudf/CudaTest.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@

import org.junit.jupiter.api.Test;

-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.*;

public class CudaTest {

@@ -32,4 +32,17 @@ public void testGetCudaRuntimeInfo() {
assertEquals(Cuda.getNativeComputeMode(), Cuda.getComputeMode().nativeId);
}

@Test
public void testCudaException() {
assertThrows(CudaException.class, () -> {
try {
Cuda.memset(Long.MAX_VALUE, (byte) 0, 1024);
} catch (CudaException ex) {
assertEquals(CudaException.CudaError.cudaErrorInvalidValue, ex.cudaError);
assertFalse(ex.cudaError.isSticky());
throw ex;
}
}
);
}
}