rapidsai · rapids-bot · Jun 7, 2022 · May 18, 2022 · May 18, 2022 · May 19, 2022
@@ -114,9 +114,8 @@ namespace detail {
 
 inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int line)
 {
-  // Calls cudaGetLastError twice. It is nearly certain that a fatal error occurred if the second
+  // Calls cudaGetLastError again. It is nearly certain that a fatal error occurred if the second
   // call doesn't return with cudaSuccess.
-  cudaGetLastError();
   auto const last = cudaGetLastError();
 // Kernel whose cudf_assert condition is always false; the death test below launches it
 // with a single thread and expects the assertion message to be reported.
 __global__ void assert_false_kernel()
 {
   cudf_assert(false && "this kernel should die");
 }
 // Kernel whose cudf_assert condition is always true, so the assertion is not expected to fire.
 __global__ void assert_true_kernel()
 {
   cudf_assert(true && "this kernel should live");
 }
 // Death test: launching assert_false_kernel must terminate the (child) process and print
 // the assertion message.
 TEST(DebugAssertDeathTest, cudf_assert_false)
 {
   testing::FLAGS_gtest_death_test_style = "threadsafe";
   auto call_kernel = []() {
     assert_false_kernel<<<1, 1>>>();
     // The kernel is expected to fail with `cudaErrorAssert`. That error invalidates the
     // current device context, so the process has to be killed; EXPECT_DEATH runs each
     // attempted launch in a freshly spawned process.
     auto const sync_status = cudaDeviceSynchronize();
     // Any other status means cudf_assert did not fire: return normally so that
     // EXPECT_DEATH reports a failure.
     if (sync_status != cudaErrorAssert) { return; }
     std::abort();
   };
   EXPECT_DEATH(call_kernel(), "this kernel should die");
 }
 // Always-failing device assertion used by the cudf_assert_false death test.
 __global__ void assert_false_kernel()
 {
   cudf_assert(false && "this kernel should die");
 }
  
 // Device assertion with an always-true condition; it is not expected to trip.
 __global__ void assert_true_kernel()
 {
   cudf_assert(true && "this kernel should live");
 }
  
 // Verifies that a failing cudf_assert inside a kernel kills the process with the
 // assertion text on its output.
 TEST(DebugAssertDeathTest, cudf_assert_false)
 {
   testing::FLAGS_gtest_death_test_style = "threadsafe";
   auto call_kernel = []() {
     assert_false_kernel<<<1, 1>>>();
     // Kernel should fail with `cudaErrorAssert`. This error invalidates the current
     // device context, so the current process must be killed. Running under EXPECT_DEATH
     // spawns a new process for each attempted kernel launch.
     bool const assert_fired = (cudaErrorAssert == cudaDeviceSynchronize());
     if (assert_fired) { std::abort(); }
     // Falling through means cudf_assert didn't work; exiting normally makes
     // EXPECT_DEATH fail.
   };
   EXPECT_DEATH(call_kernel(), "this kernel should die");
 }
   auto const msg  = std::string{"CUDA error encountered at: " + std::string{file} + ":" +
                                std::to_string(line) + ": " + std::to_string(error) + " " +

@@ -16,12 +16,13 @@
 
 #include <cudf_test/base_fixture.hpp>
 
+#include <cudf/binaryop.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/filling.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream.hpp>
 
-#include <cstring>
-
 TEST(ExpectsTest, FalseCondition)
 {
   EXPECT_THROW(CUDF_EXPECTS(false, "condition is false"), cudf::logic_error);
@@ -118,11 +119,23 @@ TEST(DebugAssert, cudf_assert_true)
 
 #endif
 
+// Checks that a CUDA failure provoked by a bogus column pointer is reported as
+// cudf::fatal_cuda_error by CUDF_CUDA_TRY.
+TEST(FatalCase, CudaFatalError)
+{
+  auto const int32_type = cudf::data_type{cudf::type_id::INT32};
+  // 256 rows whose data pointer is the raw value 256 -- presumably not a valid device
+  // address (TODO confirm this holds on every target platform).
+  auto const bogus_view = cudf::column_view(int32_type, 256, reinterpret_cast<void*>(256));
+  // The result is discarded on purpose; only the side effect on the device matters here.
+  cudf::binary_operation(bogus_view, bogus_view, cudf::binary_operator::ADD, int32_type);
+  // The error is expected to surface at the next synchronizing CUDA call.
+  EXPECT_THROW(CUDF_CUDA_TRY(cudaDeviceSynchronize()), cudf::fatal_cuda_error);
+}
+
 // These tests don't use CUDF_TEST_PROGRAM_MAIN because :
 // 1.) They don't need the RMM Pool
 // 2.) The RMM Pool interferes with the death test
+// 3.) The order of test cases matters
 int main(int argc, char** argv)
 {
   ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
+  // Two passes so that FatalCase.* runs after everything else (the order of test cases
+  // matters; presumably because the fatal error leaves the CUDA context unusable).
+  // NOTE(review): assigning the filter after InitGoogleTest replaces any --gtest_filter
+  // passed on the command line.
+  // NOTE(review): GoogleTest documents RUN_ALL_TESTS as call-once; confirm that invoking it
+  // twice is acceptable alongside the thread-safe death tests in this binary.
+  // First pass: every test except FatalCase.*.
+  ::testing::GTEST_FLAG(filter) = "-FatalCase.*";
+  int ret                       = RUN_ALL_TESTS();
+  // Second pass: only FatalCase.*.
+  ::testing::GTEST_FLAG(filter) = "FatalCase.*";
+  // The sum is non-zero if either pass reported a failure.
+  return ret + RUN_ALL_TESTS();
 }
@@ -136,7 +136,7 @@
             <groupId>org.apache.arrow</groupId>
             <artifactId>arrow-vector</artifactId>
             <version>${arrow.version}</version>
-           <scope>test</scope>
+            <scope>test</scope>
         </dependency>
         <dependency>
             <groupId>org.apache.parquet</groupId>
@@ -198,8 +198,30 @@
                         <configuration>
                             <excludes>
                                 <exclude>**/CuFileTest.java</exclude>
+                                <exclude>**/CudaTest.java</exclude>
                             </excludes>
                         </configuration>
+                        <executions>
+                            <execution>
+                                <id>main-tests</id>
+                                <goals>
+                                    <goal>test</goal>
+                                </goals>
+                            </execution>
+                            <execution>
+                                <id>fatal-cuda-test</id>
+                                <goals>
+                                    <goal>test</goal>
+                                </goals>
+                                <configuration>
+                                    <includes>
+                                        <include>**/CudaTest.java</include>
+                                    </includes>
+                                    <reuseForks>false</reuseForks>
+                                    <test>*/CudaTest.java</test>
+                                </configuration>
+                            </execution>
+                        </executions>
                     </plugin>
                 </plugins>
             </build>
@@ -279,7 +301,7 @@
                             <nexusUrl>https://oss.sonatype.org/</nexusUrl>
                             <autoReleaseAfterClose>false</autoReleaseAfterClose>
                         </configuration>
-                        </plugin>
+                    </plugin>
                 </plugins>
             </build>
         </profile>
@@ -288,16 +310,16 @@
     <build>
         <resources>
             <resource>
-              <!-- Include the properties file to provide the build information. -->
-              <directory>${project.build.directory}/extra-resources</directory>
-              <filtering>true</filtering>
+                <!-- Include the properties file to provide the build information. -->
+                <directory>${project.build.directory}/extra-resources</directory>
+                <filtering>true</filtering>
             </resource>
             <resource>
-              <directory>${basedir}/..</directory>
-              <targetPath>META-INF</targetPath>
-              <includes>
-                <include>LICENSE</include>
-              </includes>
+                <directory>${basedir}/..</directory>
+                <targetPath>META-INF</targetPath>
+                <includes>
+                    <include>LICENSE</include>
+                </includes>
             </resource>
         </resources>
         <pluginManagement>
@@ -338,6 +360,12 @@
                             <artifactId>junit-jupiter-engine</artifactId>
                             <version>5.4.2</version>
                         </dependency>
+                        <dependency>
+                            <!-- to get around bug https://github.com/junit-team/junit5/issues/1367 -->
+                            <groupId>org.apache.maven.surefire</groupId>
+                            <artifactId>surefire-logger-api</artifactId>
+                            <version>2.21.0</version>
+                        </dependency>
                     </dependencies>
                 </plugin>
                 <plugin>
@@ -384,9 +412,9 @@
                                       executable="cmake">
                                     <arg value="${basedir}/src/main/native"/>
                                     <arg line="${cmake.ccache.opts}"/>
-                                    <arg value="-DCUDA_STATIC_RUNTIME=${CUDA_STATIC_RUNTIME}" />
-                                    <arg value="-DPER_THREAD_DEFAULT_STREAM=${PER_THREAD_DEFAULT_STREAM}" />
-                                    <arg value="-DUSE_GDS=${USE_GDS}" />
+                                    <arg value="-DCUDA_STATIC_RUNTIME=${CUDA_STATIC_RUNTIME}"/>
+                                    <arg value="-DPER_THREAD_DEFAULT_STREAM=${PER_THREAD_DEFAULT_STREAM}"/>
+                                    <arg value="-DUSE_GDS=${USE_GDS}"/>
                                     <arg value="-DCMAKE_CXX_FLAGS=${cxx.flags}"/>
                                     <arg value="-DCMAKE_EXPORT_COMPILE_COMMANDS=${CMAKE_EXPORT_COMPILE_COMMANDS}"/>
                                     <arg value="-DCUDF_CPP_BUILD_DIR=${CUDF_CPP_BUILD_DIR}"/>
@@ -403,9 +431,10 @@
                                     <arg value="${parallel.level}"/>
                                 </exec>
                                 <mkdir dir="${project.build.directory}/extra-resources"/>
-                                <exec executable="bash" output="${project.build.directory}/extra-resources/cudf-java-version-info.properties">
-                                  <arg value="${project.basedir}/buildscripts/build-info"/>
-                                  <arg value="${project.version}"/>
+                                <exec executable="bash"
+                                      output="${project.build.directory}/extra-resources/cudf-java-version-info.properties">
+                                    <arg value="${project.basedir}/buildscripts/build-info"/>
+                                    <arg value="${project.version}"/>
                                 </exec>
                             </tasks>
                         </configuration>
@@ -427,31 +456,31 @@
                         </goals>
                         <configuration>
                             <source>
-                            def sout = new StringBuffer(), serr = new StringBuffer()
-                            //This only works on linux
-                            def proc = 'ldd ${native.build.path}/libcudfjni.so'.execute()
-                            proc.consumeProcessOutput(sout, serr)
-                            proc.waitForOrKill(10000)
-                            def libcudf = ~/libcudf.*\\.so\\s+=>\\s+(.*)libcudf.*\\.so\\s+.*/
-                            def cudfm = libcudf.matcher(sout)
-                            if (cudfm.find()) {
-                                pom.properties['native.cudf.path'] = cudfm.group(1)
-                            } else {
-                                fail("Could not find cudf as a dependency of libcudfjni out> $sout err> $serr")
-                            }
+                                def sout = new StringBuffer(), serr = new StringBuffer()
+                                //This only works on linux
+                                def proc = 'ldd ${native.build.path}/libcudfjni.so'.execute()
+                                proc.consumeProcessOutput(sout, serr)
+                                proc.waitForOrKill(10000)
+                                def libcudf = ~/libcudf.*\\.so\\s+=>\\s+(.*)libcudf.*\\.so\\s+.*/
+                                def cudfm = libcudf.matcher(sout)
+                                if (cudfm.find()) {
+                                    pom.properties['native.cudf.path'] = cudfm.group(1)
+                                } else {
+                                    fail("Could not find cudf as a dependency of libcudfjni out> $sout err> $serr")
+                                }
 
-                            def nvccout = new StringBuffer(), nvccerr = new StringBuffer()
-                            def nvccproc = 'nvcc --version'.execute()
-                            nvccproc.consumeProcessOutput(nvccout, nvccerr)
-                            nvccproc.waitForOrKill(10000)
-                            def cudaPattern = ~/Cuda compilation tools, release ([0-9]+)/
-                            def cm = cudaPattern.matcher(nvccout)
-                            if (cm.find()) {
-                                def classifier = 'cuda' + cm.group(1)
-                                pom.properties['cuda.classifier'] = classifier
-                            } else {
-                                fail('could not find CUDA version')
-                            }
+                                def nvccout = new StringBuffer(), nvccerr = new StringBuffer()
+                                def nvccproc = 'nvcc --version'.execute()
+                                nvccproc.consumeProcessOutput(nvccout, nvccerr)
+                                nvccproc.waitForOrKill(10000)
+                                def cudaPattern = ~/Cuda compilation tools, release ([0-9]+)/
+                                def cm = cudaPattern.matcher(nvccout)
+                                if (cm.find()) {
+                                    def classifier = 'cuda' + cm.group(1)
+                                    pom.properties['cuda.classifier'] = classifier
+                                } else {
+                                    fail('could not find CUDA version')
+                                }
                             </source>
                         </configuration>
                     </execution>
@@ -479,13 +508,13 @@
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-surefire-plugin</artifactId>
                 <configuration>
-                  <!-- you can turn this off, by passing -DtrimStackTrace=true when running tests -->
-                  <trimStackTrace>false</trimStackTrace>
-                  <redirectTestOutputToFile>true</redirectTestOutputToFile>
-                  <systemPropertyVariables>
-                    <ai.rapids.refcount.debug>${ai.rapids.refcount.debug}</ai.rapids.refcount.debug>
-                    <ai.rapids.cudf.nvtx.enabled>${ai.rapids.cudf.nvtx.enabled}</ai.rapids.cudf.nvtx.enabled>
-                  </systemPropertyVariables>
+                    <!-- you can turn this off, by passing -DtrimStackTrace=true when running tests -->
+                    <trimStackTrace>false</trimStackTrace>
+                    <redirectTestOutputToFile>true</redirectTestOutputToFile>
+                    <systemPropertyVariables>
+                        <ai.rapids.refcount.debug>${ai.rapids.refcount.debug}</ai.rapids.refcount.debug>
+                        <ai.rapids.cudf.nvtx.enabled>${ai.rapids.cudf.nvtx.enabled}</ai.rapids.cudf.nvtx.enabled>
+                    </systemPropertyVariables>
                 </configuration>
             </plugin>
             <plugin>

@@ -807,14 +807,12 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
   }
 
 // Throw a new exception only if one is not pending then always return with the specified value
-#define JNI_CHECK_CUDA_ERROR(env, class_name, e, ret_val)                                          \
+#define JNI_CHECK_CUDA_ERROR(env, class_name, msg, e_code, ret_val)                                \
   {                                                                                                \
     if (env->ExceptionOccurred()) {                                                                \
       return ret_val;                                                                              \
     }                                                                                              \
-    std::string n_msg = e.what() == nullptr ? "" : e.what();                                       \
-    jstring j_msg = env->NewStringUTF(n_msg.c_str());                                              \
-    jint e_code = static_cast<jint>(e.error_code());                                               \
+    jstring j_msg = env->NewStringUTF(msg);                                                        \
     jclass ex_class = env->FindClass(class_name);                                                  \
     if (ex_class != NULL) {                                                                        \
       jmethodID ctor_id = env->GetMethodID(ex_class, "<init>", "(Ljava/lang/String;I)V");          \
@@ -856,12 +854,25 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
     JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val);                         \
   }                                                                                                \
   catch (const cudf::fatal_cuda_error &e) {                                                        \
-    JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, ret_val);                      \
+    const char *what = e.what() == nullptr ? "" : e.what();                                        \
+    auto e_code = static_cast<jint>(e.error_code());                                               \
+    JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, what, e_code, ret_val);           \
   }                                                                                                \
   catch (const cudf::cuda_error &e) {                                                              \
-    JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val);                            \
+    const char *what = e.what() == nullptr ? "" : e.what();                                        \
+    auto e_code = static_cast<jint>(e.error_code());                                               \
+    JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, what, e_code, ret_val);                 \
   }                                                                                                \
   catch (const std::exception &e) {                                                                \
+    /* Double check whether the thrown exception is unrecoverable CUDA error or not. */            \
+    /* Like cudf::detail::throw_cuda_error, it is nearly certain that a fatal error  */            \
+    /* occurred if the second call doesn't return with cudaSuccess. */                             \
+    auto const last = cudaDeviceSynchronize();                                                     \
+    if (cudaSuccess != last && last == cudaGetLastError()) {                                       \
+      const char *what = e.what() == nullptr ? "" : e.what();                                      \
+      auto code = static_cast<jint>(last);                                                         \
+      JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, what, code, ret_val);           \
+    }                                                                                              \
     /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \
     JNI_CHECK_THROW_NEW(env, class_name, e.what(), ret_val);                                       \
   }

@@ -16,13 +16,15 @@
 
 package ai.rapids.cudf;
 
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
 
 import static org.junit.jupiter.api.Assertions.*;
 
+@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
 public class CudaTest {
 
   @Test
+  @Order(1)
   public void testGetCudaRuntimeInfo() {
     // The driver version is not necessarily larger than runtime version. Drivers of previous
     // version are also able to support runtime of later version, only if they support same
@@ -33,6 +35,7 @@ public void testGetCudaRuntimeInfo() {
   }
 
   @Test
+  @Order(2)
   public void testCudaException() {
     assertThrows(CudaException.class, () -> {
           try {
@@ -44,5 +47,53 @@ public void testCudaException() {
           }
         }
     );
+    // non-fatal CUDA error will not fail subsequent CUDA calls
+    try (ColumnVector cv = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5)) {
+    }
   }
+
+  /**
+   * Provokes a CUDA failure through a buffer that wraps a bogus address, then checks that a
+   * subsequent libcudf call raises {@link CudaFatalException} carrying
+   * {@code cudaErrorIllegalAddress}.
+   */
+  @Test
+  @Order(3)
+  public void testCudaFatalException() {
+    // First attempt only exists to trigger the failure; a CudaException raised here is
+    // deliberately ignored.
+    try (ColumnView cv = ColumnView.fromDeviceBuffer(new BadDeviceBuffer(), 0, DType.INT8, 256);
+         ColumnView ret = cv.sub(cv);
+         HostColumnVector hcv = ret.copyToHost()) {
+    } catch (CudaException ignored) {
+    }
+
+    // CUDA API invoked by libcudf failed because of previous unrecoverable fatal error
+    CudaFatalException ex = assertThrows(CudaFatalException.class, () -> {
+      try (ColumnView cv = ColumnView.fromDeviceBuffer(new BadDeviceBuffer(), 0, DType.INT8, 256);
+           HostColumnVector hcv = cv.copyToHost()) {
+      }
+    });
+    // Assert on the returned exception rather than inside the lambda: an assertion failure
+    // thrown inside the lambda would be reported as an unexpected exception type and hide
+    // the actual error-code mismatch.
+    assertEquals(CudaException.CudaError.cudaErrorIllegalAddress, ex.cudaError);
+  }
+
+  /**
+   * Checks that an ordinary column allocation raises {@link CudaFatalException} with
+   * {@code cudaErrorIllegalAddress}. Relies on running after the test with {@code @Order(3)},
+   * which is expected to have left an unrecoverable CUDA error behind.
+   */
+  @Test
+  @Order(4)
+  public void testCudaFatalExceptionFromRMM() {
+    // CUDA API invoked by RMM failed because of previous unrecoverable fatal error
+    CudaFatalException ex = assertThrows(CudaFatalException.class, () -> {
+      try (ColumnVector cv = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5)) {
+      }
+    });
+    // Checked outside the lambda so that a wrong error code is reported as an assertion
+    // mismatch instead of an unexpected exception type.
+    assertEquals(CudaException.CudaError.cudaErrorIllegalAddress, ex.cudaError);
+  }
+
+  /**
+   * Device buffer stub built from hard-coded values instead of a real allocation; the tests
+   * above use it to provoke a CUDA failure.
+   */
+  private static class BadDeviceBuffer extends BaseDeviceMemoryBuffer {
+    // The same literal is passed for both leading super-constructor arguments
+    // (presumably address and length -- confirm against BaseDeviceMemoryBuffer).
+    private static final long BOGUS = 256L;
+
+    public BadDeviceBuffer() {
+      // The cast selects the MemoryBufferCleaner overload; no cleaner is registered.
+      super(BOGUS, BOGUS, (MemoryBufferCleaner) null);
+    }
+
+    /** Slicing is not supported by this stub; always returns {@code null}. */
+    @Override
+    public MemoryBuffer slice(long offset, long len) {
+      return null;
+    }
+  }
+
 }