-
Notifications
You must be signed in to change notification settings - Fork 915
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve the capture of fatal cuda error (#10884)
This PR is a follow-up to #10630 that improves the capture of fatal CUDA errors in libcudf and the cudf Java package. 1. libcudf: Removes the redundant call to `cudaGetLastError` in throw_cuda_error, since the call that returned the CUDA error can be deemed the first call. 2. JNI: Leverages similar logic to discern fatal CUDA errors from caught exceptions. The check at the JNI level is necessary because fatal CUDA errors due to RMM APIs cannot be distinguished. 3. Adds a C++ unit test for the capture of fatal CUDA errors. 4. Adds a Java unit test for the capture of fatal CUDA errors. Authors: - Alfred Xu (https://github.com/sperlingxx) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Jason Lowe (https://github.com/jlowe) URL: #10884
- Loading branch information
1 parent
4d138ef
commit 4dfd684
Showing
6 changed files
with
215 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
/* | ||
* Copyright (c) 2022, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package ai.rapids.cudf; | ||
|
||
import org.junit.jupiter.api.Test; | ||
|
||
import static org.junit.jupiter.api.Assertions.assertEquals; | ||
import static org.junit.jupiter.api.Assertions.assertThrows; | ||
|
||
public class CudaFatalTest { | ||
|
||
@Test | ||
public void testCudaFatalException() { | ||
try (ColumnVector cv = ColumnVector.fromInts(1, 2, 3, 4, 5)) { | ||
|
||
try (ColumnView badCv = ColumnView.fromDeviceBuffer(new BadDeviceBuffer(), 0, DType.INT8, 256); | ||
ColumnView ret = badCv.sub(badCv); | ||
HostColumnVector hcv = ret.copyToHost()) { | ||
} catch (CudaException ignored) { | ||
} | ||
|
||
// CUDA API invoked by libcudf failed because of previous unrecoverable fatal error | ||
assertThrows(CudaFatalException.class, () -> { | ||
try (ColumnVector cv2 = cv.asLongs()) { | ||
} catch (CudaFatalException ex) { | ||
assertEquals(CudaException.CudaError.cudaErrorIllegalAddress, ex.cudaError); | ||
throw ex; | ||
} | ||
}); | ||
} | ||
|
||
// CUDA API invoked by RMM failed because of previous unrecoverable fatal error | ||
assertThrows(CudaFatalException.class, () -> { | ||
try (ColumnVector cv = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5)) { | ||
} catch (CudaFatalException ex) { | ||
assertEquals(CudaException.CudaError.cudaErrorIllegalAddress, ex.cudaError); | ||
throw ex; | ||
} | ||
}); | ||
} | ||
|
||
private static class BadDeviceBuffer extends BaseDeviceMemoryBuffer { | ||
public BadDeviceBuffer() { | ||
super(256L, 256L, (MemoryBufferCleaner) null); | ||
} | ||
|
||
@Override | ||
public MemoryBuffer slice(long offset, long len) { | ||
return null; | ||
} | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters