From 4dfd68464704f9abee9d0cbb0bbbcb421ef46250 Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Tue, 7 Jun 2022 23:26:40 +0800 Subject: [PATCH] Improve the capture of fatal cuda error (#10884) This PR is a follow-up PR of #10630, which is to improve the capture of fatal cuda errors in libcudf and cudf java package. 1. libcudf: Removes the redundent call of `cudaGetLastError` in throw_cuda_error, since the call returning the cuda error can be deemed as the first call. 2. JNI: Leverages similar logic to discern fatal cuda errors from catched exceptions. The check at the JNI level is necessary because fatal cuda errors due to rmm APIs can not be distinguished. 3. Add C++ unit test for the capture of fatal cuda error 4. Add Java unit test for the capture of fatal cuda error Authors: - Alfred Xu (https://github.com/sperlingxx) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/10884 --- cpp/include/cudf/utilities/error.hpp | 6 +- cpp/tests/error/error_handling_test.cu | 22 ++- java/pom.xml | 153 +++++++++++++----- java/src/main/native/include/jni_utils.hpp | 10 ++ .../java/ai/rapids/cudf/CudaFatalTest.java | 67 ++++++++ .../test/java/ai/rapids/cudf/CudaTest.java | 7 +- 6 files changed, 215 insertions(+), 50 deletions(-) create mode 100644 java/src/test/java/ai/rapids/cudf/CudaFatalTest.java diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp index 412cefba507..62f331c4679 100644 --- a/cpp/include/cudf/utilities/error.hpp +++ b/cpp/include/cudf/utilities/error.hpp @@ -136,10 +136,10 @@ namespace detail { // @cond inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int line) { - // Calls cudaGetLastError twice. It is nearly certain that a fatal error occurred if the second - // call doesn't return with cudaSuccess. + // Calls cudaGetLastError to clear the error status. It is nearly certain that a fatal error + // occurred if it still returns the same error after a cleanup. cudaGetLastError(); - auto const last = cudaGetLastError(); + auto const last = cudaFree(0); auto const msg = std::string{"CUDA error encountered at: " + std::string{file} + ":" + std::to_string(line) + ": " + std::to_string(error) + " " + cudaGetErrorName(error) + " " + cudaGetErrorString(error)}; diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index bde8ccc6de7..9d8e0a7fe64 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -16,12 +16,11 @@ #include +#include #include #include -#include - TEST(ExpectsTest, FalseCondition) { EXPECT_THROW(CUDF_EXPECTS(false, "condition is false"), cudf::logic_error); @@ -84,6 +83,25 @@ TEST(StreamCheck, CatchFailedKernel) "invalid configuration argument"); } +__global__ void kernel(int* p) { *p = 42; } + +TEST(DeathTest, CudaFatalError) +{ + testing::FLAGS_gtest_death_test_style = "threadsafe"; + auto call_kernel = []() { + int* p; + cudaMalloc(&p, 2 * sizeof(int)); + int* misaligned = (int*)(reinterpret_cast(p) + 1); + kernel<<<1, 1>>>(misaligned); + try { + CUDF_CUDA_TRY(cudaDeviceSynchronize()); + } catch (const cudf::fatal_cuda_error& fe) { + std::abort(); + } + }; + ASSERT_DEATH(call_kernel(), ""); +} + #ifndef NDEBUG __global__ void assert_false_kernel() { cudf_assert(false && "this kernel should die"); } diff --git a/java/pom.xml b/java/pom.xml index ad22b6d25d0..9402b292cfa 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -136,7 +136,7 @@ org.apache.arrow arrow-vector ${arrow.version} - test + test org.apache.parquet @@ -184,6 +184,42 @@ -Wno-deprecated-declarations + + default-tests + + + + maven-surefire-plugin + + + **/CudaFatalTest.java + + + + + main-tests + + test + + + + fatal-cuda-test + + test + + + + **/CudaFatalTest.java + + false + */CudaFatalTest.java + + + + + + + no-cufile-tests @@ -199,8 +235,30 @@ **/CuFileTest.java + **/CudaFatalTest.java + + + main-tests + + test + + + + fatal-cuda-test + + test + + + + **/CudaFatalTest.java + + false + */CudaFatalTest.java + + + @@ -280,7 +338,7 @@ https://oss.sonatype.org/ false - + @@ -289,16 +347,16 @@ - - ${project.build.directory}/extra-resources - true + + ${project.build.directory}/extra-resources + true - ${basedir}/.. - META-INF - - LICENSE - + ${basedir}/.. + META-INF + + LICENSE + @@ -339,6 +397,12 @@ junit-jupiter-engine 5.4.2 + + + org.apache.maven.surefire + surefire-logger-api + 2.21.0 + @@ -404,9 +468,10 @@ - - - + + + @@ -428,31 +493,31 @@ - def sout = new StringBuffer(), serr = new StringBuffer() - //This only works on linux - def proc = 'ldd ${native.build.path}/libcudfjni.so'.execute() - proc.consumeProcessOutput(sout, serr) - proc.waitForOrKill(10000) - def libcudf = ~/libcudf.*\\.so\\s+=>\\s+(.*)libcudf.*\\.so\\s+.*/ - def cudfm = libcudf.matcher(sout) - if (cudfm.find()) { - pom.properties['native.cudf.path'] = cudfm.group(1) - } else { - fail("Could not find cudf as a dependency of libcudfjni out> $sout err> $serr") - } + def sout = new StringBuffer(), serr = new StringBuffer() + //This only works on linux + def proc = 'ldd ${native.build.path}/libcudfjni.so'.execute() + proc.consumeProcessOutput(sout, serr) + proc.waitForOrKill(10000) + def libcudf = ~/libcudf.*\\.so\\s+=>\\s+(.*)libcudf.*\\.so\\s+.*/ + def cudfm = libcudf.matcher(sout) + if (cudfm.find()) { + pom.properties['native.cudf.path'] = cudfm.group(1) + } else { + fail("Could not find cudf as a dependency of libcudfjni out> $sout err> $serr") + } - def nvccout = new StringBuffer(), nvccerr = new StringBuffer() - def nvccproc = 'nvcc --version'.execute() - nvccproc.consumeProcessOutput(nvccout, nvccerr) - nvccproc.waitForOrKill(10000) - def cudaPattern = ~/Cuda compilation tools, release ([0-9]+)/ - def cm = cudaPattern.matcher(nvccout) - if (cm.find()) { - def classifier = 'cuda' + cm.group(1) - pom.properties['cuda.classifier'] = classifier - } else { - fail('could not find CUDA version') - } + def nvccout = new StringBuffer(), nvccerr = new StringBuffer() + def nvccproc = 'nvcc --version'.execute() + nvccproc.consumeProcessOutput(nvccout, nvccerr) + nvccproc.waitForOrKill(10000) + def cudaPattern = ~/Cuda compilation tools, release ([0-9]+)/ + def cm = cudaPattern.matcher(nvccout) + if (cm.find()) { + def classifier = 'cuda' + cm.group(1) + pom.properties['cuda.classifier'] = classifier + } else { + fail('could not find CUDA version') + } @@ -480,13 +545,13 @@ org.apache.maven.plugins maven-surefire-plugin - - false - true - - ${ai.rapids.refcount.debug} - ${ai.rapids.cudf.nvtx.enabled} - + + false + true + + ${ai.rapids.refcount.debug} + ${ai.rapids.cudf.nvtx.enabled} + diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index eca424132a5..78239b86ae2 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -862,6 +862,16 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val); \ } \ catch (const std::exception &e) { \ + /* Double check whether the thrown exception is unrecoverable CUDA error or not. */ \ + /* Like cudf::detail::throw_cuda_error, it is nearly certain that a fatal error */ \ + /* occurred if the second call doesn't return with cudaSuccess. */ \ + cudaGetLastError(); \ + auto const last = cudaFree(0); \ + if (cudaSuccess != last && last == cudaDeviceSynchronize()) { \ + auto msg = e.what() == nullptr ? std::string{""} : e.what(); \ + auto cuda_error = cudf::fatal_cuda_error{msg, last}; \ + JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, cuda_error, ret_val); \ + } \ /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \ JNI_CHECK_THROW_NEW(env, class_name, e.what(), ret_val); \ } diff --git a/java/src/test/java/ai/rapids/cudf/CudaFatalTest.java b/java/src/test/java/ai/rapids/cudf/CudaFatalTest.java new file mode 100644 index 00000000000..ef55ff84b68 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/CudaFatalTest.java @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.rapids.cudf; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class CudaFatalTest { + + @Test + public void testCudaFatalException() { + try (ColumnVector cv = ColumnVector.fromInts(1, 2, 3, 4, 5)) { + + try (ColumnView badCv = ColumnView.fromDeviceBuffer(new BadDeviceBuffer(), 0, DType.INT8, 256); + ColumnView ret = badCv.sub(badCv); + HostColumnVector hcv = ret.copyToHost()) { + } catch (CudaException ignored) { + } + + // CUDA API invoked by libcudf failed because of previous unrecoverable fatal error + assertThrows(CudaFatalException.class, () -> { + try (ColumnVector cv2 = cv.asLongs()) { + } catch (CudaFatalException ex) { + assertEquals(CudaException.CudaError.cudaErrorIllegalAddress, ex.cudaError); + throw ex; + } + }); + } + + // CUDA API invoked by RMM failed because of previous unrecoverable fatal error + assertThrows(CudaFatalException.class, () -> { + try (ColumnVector cv = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5)) { + } catch (CudaFatalException ex) { + assertEquals(CudaException.CudaError.cudaErrorIllegalAddress, ex.cudaError); + throw ex; + } + }); + } + + private static class BadDeviceBuffer extends BaseDeviceMemoryBuffer { + public BadDeviceBuffer() { + super(256L, 256L, (MemoryBufferCleaner) null); + } + + @Override + public MemoryBuffer slice(long offset, long len) { + return null; + } + } + +} diff --git a/java/src/test/java/ai/rapids/cudf/CudaTest.java b/java/src/test/java/ai/rapids/cudf/CudaTest.java index 1a86dbb374d..c20f2435258 100644 --- a/java/src/test/java/ai/rapids/cudf/CudaTest.java +++ b/java/src/test/java/ai/rapids/cudf/CudaTest.java @@ -18,7 +18,8 @@ import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; public class CudaTest { @@ -44,5 +45,9 @@ public void testCudaException() { } } ); + // non-fatal CUDA error will not fail subsequent CUDA calls + try (ColumnVector cv = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5)) { + } } + }