Skip to content

Commit

Permalink
Improve the capture of fatal cuda error (#10884)
Browse files Browse the repository at this point in the history
This PR is a follow-up PR of #10630, which is to improve the capture of fatal cuda errors in libcudf and cudf java package.

1. libcudf: Removes the redundent call of `cudaGetLastError` in throw_cuda_error, since the call returning the cuda error can be deemed as the first call.
2. JNI: Leverages similar logic to discern fatal cuda errors from catched exceptions. The check at the JNI level is necessary because fatal cuda errors due to rmm APIs can not be distinguished.
3. Add C++ unit test for the capture of fatal cuda error 
4. Add Java unit test for the capture of fatal cuda error

Authors:
  - Alfred Xu (https://github.com/sperlingxx)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Jason Lowe (https://github.com/jlowe)

URL: #10884
  • Loading branch information
sperlingxx authored Jun 7, 2022
1 parent 4d138ef commit 4dfd684
Show file tree
Hide file tree
Showing 6 changed files with 215 additions and 50 deletions.
6 changes: 3 additions & 3 deletions cpp/include/cudf/utilities/error.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,10 @@ namespace detail {
// @cond
inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int line)
{
// Calls cudaGetLastError twice. It is nearly certain that a fatal error occurred if the second
// call doesn't return with cudaSuccess.
// Calls cudaGetLastError to clear the error status. It is nearly certain that a fatal error
// occurred if it still returns the same error after a cleanup.
cudaGetLastError();
auto const last = cudaGetLastError();
auto const last = cudaFree(0);
auto const msg = std::string{"CUDA error encountered at: " + std::string{file} + ":" +
std::to_string(line) + ": " + std::to_string(error) + " " +
cudaGetErrorName(error) + " " + cudaGetErrorString(error)};
Expand Down
22 changes: 20 additions & 2 deletions cpp/tests/error/error_handling_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@

#include <cudf_test/base_fixture.hpp>

#include <cudf/filling.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/cuda_stream.hpp>

#include <cstring>

TEST(ExpectsTest, FalseCondition)
{
EXPECT_THROW(CUDF_EXPECTS(false, "condition is false"), cudf::logic_error);
Expand Down Expand Up @@ -84,6 +83,25 @@ TEST(StreamCheck, CatchFailedKernel)
"invalid configuration argument");
}

__global__ void kernel(int* p) { *p = 42; }

TEST(DeathTest, CudaFatalError)
{
testing::FLAGS_gtest_death_test_style = "threadsafe";
auto call_kernel = []() {
int* p;
cudaMalloc(&p, 2 * sizeof(int));
int* misaligned = (int*)(reinterpret_cast<char*>(p) + 1);
kernel<<<1, 1>>>(misaligned);
try {
CUDF_CUDA_TRY(cudaDeviceSynchronize());
} catch (const cudf::fatal_cuda_error& fe) {
std::abort();
}
};
ASSERT_DEATH(call_kernel(), "");
}

#ifndef NDEBUG

__global__ void assert_false_kernel() { cudf_assert(false && "this kernel should die"); }
Expand Down
153 changes: 109 additions & 44 deletions java/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-vector</artifactId>
<version>${arrow.version}</version>
<scope>test</scope>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
Expand Down Expand Up @@ -184,6 +184,42 @@
<cxx.flags>-Wno-deprecated-declarations</cxx.flags>
</properties>
</profile>
<profile>
<id>default-tests</id>
<build>
<plugins>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<excludes>
<exclude>**/CudaFatalTest.java</exclude>
</excludes>
</configuration>
<executions>
<execution>
<id>main-tests</id>
<goals>
<goal>test</goal>
</goals>
</execution>
<execution>
<id>fatal-cuda-test</id>
<goals>
<goal>test</goal>
</goals>
<configuration>
<includes>
<include>**/CudaFatalTest.java</include>
</includes>
<reuseForks>false</reuseForks>
<test>*/CudaFatalTest.java</test>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>no-cufile-tests</id>
<activation>
Expand All @@ -199,8 +235,30 @@
<configuration>
<excludes>
<exclude>**/CuFileTest.java</exclude>
<exclude>**/CudaFatalTest.java</exclude>
</excludes>
</configuration>
<executions>
<execution>
<id>main-tests</id>
<goals>
<goal>test</goal>
</goals>
</execution>
<execution>
<id>fatal-cuda-test</id>
<goals>
<goal>test</goal>
</goals>
<configuration>
<includes>
<include>**/CudaFatalTest.java</include>
</includes>
<reuseForks>false</reuseForks>
<test>*/CudaFatalTest.java</test>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
Expand Down Expand Up @@ -280,7 +338,7 @@
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>false</autoReleaseAfterClose>
</configuration>
</plugin>
</plugin>
</plugins>
</build>
</profile>
Expand All @@ -289,16 +347,16 @@
<build>
<resources>
<resource>
<!-- Include the properties file to provide the build information. -->
<directory>${project.build.directory}/extra-resources</directory>
<filtering>true</filtering>
<!-- Include the properties file to provide the build information. -->
<directory>${project.build.directory}/extra-resources</directory>
<filtering>true</filtering>
</resource>
<resource>
<directory>${basedir}/..</directory>
<targetPath>META-INF</targetPath>
<includes>
<include>LICENSE</include>
</includes>
<directory>${basedir}/..</directory>
<targetPath>META-INF</targetPath>
<includes>
<include>LICENSE</include>
</includes>
</resource>
</resources>
<pluginManagement>
Expand Down Expand Up @@ -339,6 +397,12 @@
<artifactId>junit-jupiter-engine</artifactId>
<version>5.4.2</version>
</dependency>
<dependency>
<!-- to get around bug https://github.com/junit-team/junit5/issues/1367 -->
<groupId>org.apache.maven.surefire</groupId>
<artifactId>surefire-logger-api</artifactId>
<version>2.21.0</version>
</dependency>
</dependencies>
</plugin>
<plugin>
Expand Down Expand Up @@ -404,9 +468,10 @@
<arg value="${parallel.level}"/>
</exec>
<mkdir dir="${project.build.directory}/extra-resources"/>
<exec executable="bash" output="${project.build.directory}/extra-resources/cudf-java-version-info.properties">
<arg value="${project.basedir}/buildscripts/build-info"/>
<arg value="${project.version}"/>
<exec executable="bash"
output="${project.build.directory}/extra-resources/cudf-java-version-info.properties">
<arg value="${project.basedir}/buildscripts/build-info"/>
<arg value="${project.version}"/>
</exec>
</tasks>
</configuration>
Expand All @@ -428,31 +493,31 @@
</goals>
<configuration>
<source>
def sout = new StringBuffer(), serr = new StringBuffer()
//This only works on linux
def proc = 'ldd ${native.build.path}/libcudfjni.so'.execute()
proc.consumeProcessOutput(sout, serr)
proc.waitForOrKill(10000)
def libcudf = ~/libcudf.*\\.so\\s+=>\\s+(.*)libcudf.*\\.so\\s+.*/
def cudfm = libcudf.matcher(sout)
if (cudfm.find()) {
pom.properties['native.cudf.path'] = cudfm.group(1)
} else {
fail("Could not find cudf as a dependency of libcudfjni out> $sout err> $serr")
}
def sout = new StringBuffer(), serr = new StringBuffer()
//This only works on linux
def proc = 'ldd ${native.build.path}/libcudfjni.so'.execute()
proc.consumeProcessOutput(sout, serr)
proc.waitForOrKill(10000)
def libcudf = ~/libcudf.*\\.so\\s+=>\\s+(.*)libcudf.*\\.so\\s+.*/
def cudfm = libcudf.matcher(sout)
if (cudfm.find()) {
pom.properties['native.cudf.path'] = cudfm.group(1)
} else {
fail("Could not find cudf as a dependency of libcudfjni out> $sout err> $serr")
}

def nvccout = new StringBuffer(), nvccerr = new StringBuffer()
def nvccproc = 'nvcc --version'.execute()
nvccproc.consumeProcessOutput(nvccout, nvccerr)
nvccproc.waitForOrKill(10000)
def cudaPattern = ~/Cuda compilation tools, release ([0-9]+)/
def cm = cudaPattern.matcher(nvccout)
if (cm.find()) {
def classifier = 'cuda' + cm.group(1)
pom.properties['cuda.classifier'] = classifier
} else {
fail('could not find CUDA version')
}
def nvccout = new StringBuffer(), nvccerr = new StringBuffer()
def nvccproc = 'nvcc --version'.execute()
nvccproc.consumeProcessOutput(nvccout, nvccerr)
nvccproc.waitForOrKill(10000)
def cudaPattern = ~/Cuda compilation tools, release ([0-9]+)/
def cm = cudaPattern.matcher(nvccout)
if (cm.find()) {
def classifier = 'cuda' + cm.group(1)
pom.properties['cuda.classifier'] = classifier
} else {
fail('could not find CUDA version')
}
</source>
</configuration>
</execution>
Expand Down Expand Up @@ -480,13 +545,13 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<!-- you can turn this off, by passing -DtrimStackTrace=true when running tests -->
<trimStackTrace>false</trimStackTrace>
<redirectTestOutputToFile>true</redirectTestOutputToFile>
<systemPropertyVariables>
<ai.rapids.refcount.debug>${ai.rapids.refcount.debug}</ai.rapids.refcount.debug>
<ai.rapids.cudf.nvtx.enabled>${ai.rapids.cudf.nvtx.enabled}</ai.rapids.cudf.nvtx.enabled>
</systemPropertyVariables>
<!-- you can turn this off, by passing -DtrimStackTrace=true when running tests -->
<trimStackTrace>false</trimStackTrace>
<redirectTestOutputToFile>true</redirectTestOutputToFile>
<systemPropertyVariables>
<ai.rapids.refcount.debug>${ai.rapids.refcount.debug}</ai.rapids.refcount.debug>
<ai.rapids.cudf.nvtx.enabled>${ai.rapids.cudf.nvtx.enabled}</ai.rapids.cudf.nvtx.enabled>
</systemPropertyVariables>
</configuration>
</plugin>
<plugin>
Expand Down
10 changes: 10 additions & 0 deletions java/src/main/native/include/jni_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -862,6 +862,16 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val); \
} \
catch (const std::exception &e) { \
/* Double check whether the thrown exception is unrecoverable CUDA error or not. */ \
/* Like cudf::detail::throw_cuda_error, it is nearly certain that a fatal error */ \
/* occurred if the second call doesn't return with cudaSuccess. */ \
cudaGetLastError(); \
auto const last = cudaFree(0); \
if (cudaSuccess != last && last == cudaDeviceSynchronize()) { \
auto msg = e.what() == nullptr ? std::string{""} : e.what(); \
auto cuda_error = cudf::fatal_cuda_error{msg, last}; \
JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, cuda_error, ret_val); \
} \
/* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \
JNI_CHECK_THROW_NEW(env, class_name, e.what(), ret_val); \
}
Expand Down
67 changes: 67 additions & 0 deletions java/src/test/java/ai/rapids/cudf/CudaFatalTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package ai.rapids.cudf;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;

public class CudaFatalTest {

@Test
public void testCudaFatalException() {
try (ColumnVector cv = ColumnVector.fromInts(1, 2, 3, 4, 5)) {

try (ColumnView badCv = ColumnView.fromDeviceBuffer(new BadDeviceBuffer(), 0, DType.INT8, 256);
ColumnView ret = badCv.sub(badCv);
HostColumnVector hcv = ret.copyToHost()) {
} catch (CudaException ignored) {
}

// CUDA API invoked by libcudf failed because of previous unrecoverable fatal error
assertThrows(CudaFatalException.class, () -> {
try (ColumnVector cv2 = cv.asLongs()) {
} catch (CudaFatalException ex) {
assertEquals(CudaException.CudaError.cudaErrorIllegalAddress, ex.cudaError);
throw ex;
}
});
}

// CUDA API invoked by RMM failed because of previous unrecoverable fatal error
assertThrows(CudaFatalException.class, () -> {
try (ColumnVector cv = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5)) {
} catch (CudaFatalException ex) {
assertEquals(CudaException.CudaError.cudaErrorIllegalAddress, ex.cudaError);
throw ex;
}
});
}

private static class BadDeviceBuffer extends BaseDeviceMemoryBuffer {
public BadDeviceBuffer() {
super(256L, 256L, (MemoryBufferCleaner) null);
}

@Override
public MemoryBuffer slice(long offset, long len) {
return null;
}
}

}
7 changes: 6 additions & 1 deletion java/src/test/java/ai/rapids/cudf/CudaTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;

public class CudaTest {

Expand All @@ -44,5 +45,9 @@ public void testCudaException() {
}
}
);
// non-fatal CUDA error will not fail subsequent CUDA calls
try (ColumnVector cv = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5)) {
}
}

}

0 comments on commit 4dfd684

Please sign in to comment.