rapidsai · rapids-bot · Jun 7, 2022 · May 18, 2022 · May 18, 2022 · May 19, 2022
@@ -114,9 +114,8 @@ namespace detail {
 
 inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int line)
 {
-  // Calls cudaGetLastError twice. It is nearly certain that a fatal error occurred if the second
+  // Calls cudaGetLastError again. It is nearly certain that a fatal error occurred if the second
   // call doesn't return with cudaSuccess.
-  cudaGetLastError();
   auto const last = cudaGetLastError();
 __global__ void assert_false_kernel() { cudf_assert(false && "this kernel should die"); } 
 __global__ void assert_true_kernel() { cudf_assert(true && "this kernel should live"); } 
 TEST(DebugAssertDeathTest, cudf_assert_false) 
 { 
   testing::FLAGS_gtest_death_test_style = "threadsafe"; 
   auto call_kernel = []() { 
     assert_false_kernel<<<1, 1>>>(); 
     // Kernel should fail with `cudaErrorAssert` 
     // This error invalidates the current device context, so we need to kill 
     // the current process. Running with EXPECT_DEATH spawns a new process for 
     // each attempted kernel launch 
     if (cudaErrorAssert == cudaDeviceSynchronize()) { std::abort(); } 
     // If we reach this point, the cudf_assert didn't work so we exit normally, which will cause 
     // EXPECT_DEATH to fail. 
   }; 
   EXPECT_DEATH(call_kernel(), "this kernel should die"); 
 } 
 __global__ void assert_false_kernel() { cudf_assert(false && "this kernel should die"); } 
  
 __global__ void assert_true_kernel() { cudf_assert(true && "this kernel should live"); } 
  
 TEST(DebugAssertDeathTest, cudf_assert_false) 
 { 
   testing::FLAGS_gtest_death_test_style = "threadsafe"; 
  
   auto call_kernel = []() { 
     assert_false_kernel<<<1, 1>>>(); 
  
     // Kernel should fail with `cudaErrorAssert` 
     // This error invalidates the current device context, so we need to kill 
     // the current process. Running with EXPECT_DEATH spawns a new process for 
     // each attempted kernel launch 
     if (cudaErrorAssert == cudaDeviceSynchronize()) { std::abort(); } 
  
     // If we reach this point, the cudf_assert didn't work so we exit normally, which will cause 
     // EXPECT_DEATH to fail. 
   }; 
  
   EXPECT_DEATH(call_kernel(), "this kernel should die"); 
 } 
   auto const msg  = std::string{"CUDA error encountered at: " + std::string{file} + ":" +
                                std::to_string(line) + ": " + std::to_string(error) + " " +

@@ -16,12 +16,13 @@
 
 #include <cudf_test/base_fixture.hpp>
 
+#include <cudf/binaryop.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/filling.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream.hpp>
 
-#include <cstring>
-
 TEST(ExpectsTest, FalseCondition)
 {
   EXPECT_THROW(CUDF_EXPECTS(false, "condition is false"), cudf::logic_error);
@@ -118,11 +119,23 @@ TEST(DebugAssert, cudf_assert_true)
 
 #endif
 
+TEST(FatalCase, CudaFatalError)
+{
+  auto type = cudf::data_type{cudf::type_id::INT32};
+  auto cv   = cudf::column_view(type, 256, (void*)256);
+  cudf::binary_operation(cv, cv, cudf::binary_operator::ADD, type);
+  EXPECT_THROW(CUDF_CUDA_TRY(cudaDeviceSynchronize()), cudf::fatal_cuda_error);
+}
+
 // These tests don't use CUDF_TEST_PROGRAM_MAIN because :
 // 1.) They don't need the RMM Pool
 // 2.) The RMM Pool interferes with the death test
+// 3.) The order of test cases matters
 int main(int argc, char** argv)
 {
   ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
+  ::testing::GTEST_FLAG(filter) = "-FatalCase.*";
+  int ret                       = RUN_ALL_TESTS();
+  ::testing::GTEST_FLAG(filter) = "FatalCase.*";
+  return ret + RUN_ALL_TESTS();
 }
@@ -199,6 +199,7 @@
                             <excludes>
                                 <exclude>**/CuFileTest.java</exclude>
                             </excludes>
+		            <runOrder>alphabetical</runOrder>		    
                         </configuration>
                     </plugin>
                 </plugins>

@@ -807,14 +807,12 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
   }
 
 // Throw a new exception only if one is not pending then always return with the specified value
-#define JNI_CHECK_CUDA_ERROR(env, class_name, e, ret_val)                                          \
+#define JNI_CHECK_CUDA_ERROR(env, class_name, msg, e_code, ret_val)                                \
   {                                                                                                \
     if (env->ExceptionOccurred()) {                                                                \
       return ret_val;                                                                              \
     }                                                                                              \
-    std::string n_msg = e.what() == nullptr ? "" : e.what();                                       \
-    jstring j_msg = env->NewStringUTF(n_msg.c_str());                                              \
-    jint e_code = static_cast<jint>(e.error_code());                                               \
+    jstring j_msg = env->NewStringUTF(msg);                                                        \
     jclass ex_class = env->FindClass(class_name);                                                  \
     if (ex_class != NULL) {                                                                        \
       jmethodID ctor_id = env->GetMethodID(ex_class, "<init>", "(Ljava/lang/String;I)V");          \
@@ -856,12 +854,25 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
     JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val);                         \
   }                                                                                                \
   catch (const cudf::fatal_cuda_error &e) {                                                        \
-    JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, ret_val);                      \
+    const char *what = e.what() == nullptr ? "" : e.what();                                        \
+    auto e_code = static_cast<jint>(e.error_code());                                               \
+    JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, what, e_code, ret_val);           \
   }                                                                                                \
   catch (const cudf::cuda_error &e) {                                                              \
-    JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val);                            \
+    const char *what = e.what() == nullptr ? "" : e.what();                                        \
+    auto e_code = static_cast<jint>(e.error_code());                                               \
+    JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, what, e_code, ret_val);                 \
   }                                                                                                \
   catch (const std::exception &e) {                                                                \
+    /* Double check whether the thrown exception is unrecoverable CUDA error or not. */            \
+    /* Like cudf::detail::throw_cuda_error, it is nearly certain that a fatal error  */            \
+    /* occurred if the second call doesn't return with cudaSuccess. */                             \
+    auto const last = cudaDeviceSynchronize();                                                     \
+    if (cudaSuccess != last && last == cudaGetLastError()) {                                       \
+      const char *what = e.what() == nullptr ? "" : e.what();                                      \
+      auto code = static_cast<jint>(last);                                                         \
+      JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, what, code, ret_val);           \
+    }                                                                                              \
     /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \
     JNI_CHECK_THROW_NEW(env, class_name, e.what(), ret_val);                                       \
   }

diff --git a/java/src/test/java/ai/rapids/cudf/zzzCudaTest.java b/java/src/test/java/ai/rapids/cudf/zzzCudaTest.java
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf;
+
+import org.junit.jupiter.api.*;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+// Add prefix zzz to ensure this test suite being executed behind all other tests. Because this
+// suite will cause a fatal cuda error which disables the whole device.
+@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
+public class zzzCudaTest {
+
+  @Test
+  @Order(1)
+  public void testGetCudaRuntimeInfo() {
+    // The driver version is not necessarily larger than runtime version. Drivers of previous
+    // version are also able to support runtime of later version, only if they support same
+    // kinds of computeModes.
+    assert Cuda.getDriverVersion() >= 1000;
+    assert Cuda.getRuntimeVersion() >= 1000;
+    assertEquals(Cuda.getNativeComputeMode(), Cuda.getComputeMode().nativeId);
+  }
+
+  @Test
+  @Order(2)
+  public void testCudaException() {
+    assertThrows(CudaException.class, () -> {
+          try {
+            Cuda.memset(Long.MAX_VALUE, (byte) 0, 1024);
+          } catch (CudaFatalException ignored) {
+          } catch (CudaException ex) {
+            assertEquals(CudaException.CudaError.cudaErrorInvalidValue, ex.cudaError);
+            throw ex;
+          }
+        }
+    );
+    // non-fatal CUDA error will not fail subsequent CUDA calls
+    try (ColumnVector cv = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5)) {
+    }
+  }
+
+  @Test
+  @Order(3)
+  public void testCudaFatalException() {
+    try (ColumnView cv = ColumnView.fromDeviceBuffer(new BadDeviceBuffer(), 0, DType.INT8, 256);
+         ColumnView ret = cv.sub(cv);
+         HostColumnVector hcv = ret.copyToHost()) {
+    } catch (CudaException ignored) {
+    }
+
+    // CUDA API invoked by libcudf failed because of previous unrecoverable fatal error
+    assertThrows(CudaFatalException.class, () -> {
+      try (ColumnView cv = ColumnView.fromDeviceBuffer(new BadDeviceBuffer(), 0, DType.INT8, 256);
+           HostColumnVector hcv = cv.copyToHost()) {
+      } catch (CudaFatalException ex) {
+        assertEquals(CudaException.CudaError.cudaErrorIllegalAddress, ex.cudaError);
+        throw ex;
+      }
+    });
+  }
+
+  @Test
+  @Order(4)
+  public void testCudaFatalExceptionFromRMM() {
+    // CUDA API invoked by RMM failed because of previous unrecoverable fatal error
+    assertThrows(CudaFatalException.class, () -> {
+      try (ColumnVector cv = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5)) {
+      } catch (CudaFatalException ex) {
+        assertEquals(CudaException.CudaError.cudaErrorIllegalAddress, ex.cudaError);
+        throw ex;
+      }
+    });
+  }
+
+  private static class BadDeviceBuffer extends BaseDeviceMemoryBuffer {
+    public BadDeviceBuffer() {
+      super(256L, 256L, (MemoryBufferCleaner) null);
+    }
+
+    @Override
+    public MemoryBuffer slice(long offset, long len) {
+      return null;
+    }
+  }
+
+}