From 4a301ac9abdcce93aa2d341f2f3f37f271a2abb6 Mon Sep 17 00:00:00 2001 From: Meixu Song Date: Mon, 31 Aug 2020 20:19:07 +0800 Subject: [PATCH] update to review (#103) * update to review - fix a twice error code checking issue - change macro name `K2_CUDA_CHECK_ERROR` to `K2_CUDA_PRINT_ERROR` as its effect - add a optional arg `abort` to control abort if hit error - trivals fix * just revert name: `K2_CUDA_CHECK_ERROR` --- CMakeLists.txt | 65 +++++++++++++++++------------ k2/csrc/cuda/debug.h | 84 +++++++++++++++++++++++--------------- k2/csrc/cuda/debug_test.cu | 18 +++----- 3 files changed, 95 insertions(+), 72 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 331bd1ad7..8f1e748e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,33 +43,46 @@ if(WIN32 AND BUILD_SHARED_LIBS) set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE) endif() -# the following settings are modified from cub/CMakeLists.txt -#[[ start settings for CUB ]] - -set(CMAKE_CXX_STANDARD 11 CACHE STRING "The C++ version to be used.") -set(CMAKE_CXX_EXTENSIONS OFF) - -message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}") - -# Force CUDA C++ standard to be the same as the C++ standard used. -# -# Now, CMake is unaligned with reality on standard versions: https://gitlab.kitware.com/cmake/cmake/issues/18597 -# which means that using standard CMake methods, it's impossible to actually sync the CXX and CUDA versions for pre-11 -# versions of C++; CUDA accepts 98 but translates that to 03, while CXX doesn't accept 03 (and doesn't translate that to 03). -# In case this gives You, dear user, any trouble, please escalate the above CMake bug, so we can support reality properly. -if(DEFINED CMAKE_CUDA_STANDARD) - message(WARNING "You've set CMAKE_CUDA_STANDARD; please note that this variable is ignored, and CMAKE_CXX_STANDARD" - " is used as the C++ standard version for both C++ and CUDA.") +if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18.0") + find_package(CUDAToolkit REQUIRED) + if(CUDAToolkit_FOUND) + message(STATUS "found CUDAToolkit " ${CUDAToolkit_LIBRARY_DIR}) + message(STATUS "CUDAToolkit_INCLUDE_DIRS " ${CUDAToolkit_INCLUDE_DIRS}) + message(STATUS "CUDAToolkit_LIBRARY_DIR " ${CUDAToolkit_LIBRARY_DIR}) + + enable_language(CUDA) + + # With many architectures set here, the nvcc build time would be much longer. + # Thus, "61" is put here for speed and compatibility. + # @ToDo Need to cover more architectures, before release these code. + set(CMAKE_CUDA_ARCHITECTURES 61) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") + endif() +else() + # the following settings are modified from cub/CMakeLists.txt + set(CMAKE_CXX_STANDARD 11 CACHE STRING "The C++ version to be used.") + set(CMAKE_CXX_EXTENSIONS OFF) + + message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}") + + # Force CUDA C++ standard to be the same as the C++ standard used. + # + # Now, CMake is unaligned with reality on standard versions: https://gitlab.kitware.com/cmake/cmake/issues/18597 + # which means that using standard CMake methods, it's impossible to actually sync the CXX and CUDA versions for pre-11 + # versions of C++; CUDA accepts 98 but translates that to 03, while CXX doesn't accept 03 (and doesn't translate that to 03). + # In case this gives You, dear user, any trouble, please escalate the above CMake bug, so we can support reality properly. + if(DEFINED CMAKE_CUDA_STANDARD) + message(WARNING "You've set CMAKE_CUDA_STANDARD; please note that this variable is ignored, and CMAKE_CXX_STANDARD" + " is used as the C++ standard version for both C++ and CUDA.") + endif() + unset(CMAKE_CUDA_STANDARD CACHE) + set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD}) + + set(K2_COMPUTE_ARCHS 30 32 35 50 52 53 60 61 62 70 72) + foreach(COMPUTE_ARCH IN LISTS K2_COMPUTE_ARCHS) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda -gencode arch=compute_${COMPUTE_ARCH},code=sm_${COMPUTE_ARCH}") + endforeach() endif() -unset(CMAKE_CUDA_STANDARD CACHE) -set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD}) - -set(K2_COMPUTE_ARCHS 30 32 35 50 52 53 60 61 62 70 72) -foreach(COMPUTE_ARCH IN LISTS K2_COMPUTE_ARCHS) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda -gencode arch=compute_${COMPUTE_ARCH},code=sm_${COMPUTE_ARCH}") -endforeach() - -#[[ end settings for CUB ]] enable_testing() diff --git a/k2/csrc/cuda/debug.h b/k2/csrc/cuda/debug.h index f7039a434..aa0b2f783 100644 --- a/k2/csrc/cuda/debug.h +++ b/k2/csrc/cuda/debug.h @@ -22,8 +22,8 @@ #include #include -#include -#include +#include +#include namespace k2 { @@ -71,27 +71,28 @@ namespace k2 { /** * @fn * __host__ __device__ __forceinline__ cudaError_t - * _K2CudaDebug(cudaError_t error, + * K2CudaDebug_(cudaError_t error, * const char *filename, * int line) * * @brief This is an error checking function, with context information. * It's not designed to called by users, but inner macros. - * It's made used by other macros. * * @param[in] error an enum type indicating CUDA errors. * @param[in] filename the source filename that the error comes from. * @param[in] line the code line that the error happened. - * @return Pass the input CUDA error. + * @param[in] abort this bool control if the error results into `abort` + * @return the input CUDA error. * * @code{.cpp} - * _K2CudaDebug(cudaGetLastError(), __FILE__, __LINE__); + * K2CudaDebug_(cudaGetLastError(), __FILE__, __LINE__); * @endcode */ -__host__ __device__ __forceinline__ cudaError_t _K2CudaDebug( +__host__ __device__ __forceinline__ cudaError_t K2CudaDebug_( cudaError_t error, const char *filename, - int line) { + int line, + bool abort = true) { if (cudaSuccess != error) { #ifndef __CUDA_ARCH__ fprintf(stderr, "CUDA error ID=%d, NAME=%s, [%s, %d]: %s\n", @@ -99,6 +100,9 @@ __host__ __device__ __forceinline__ cudaError_t _K2CudaDebug( filename, line, cudaGetErrorString(error)); fflush(stderr); + if (abort) { + exit(error); + } #elif __CUDA_ARCH__ >= 200 printf("CUDA error ID=%d, NAME=%s, " "[block (%d,%d,%d) thread (%d,%d,%d), %s, %d]: %s\n", @@ -107,6 +111,16 @@ __host__ __device__ __forceinline__ cudaError_t _K2CudaDebug( threadIdx.x, threadIdx.y, threadIdx.z, filename, line, cudaGetErrorString(error)); + if (abort) { + __threadfence(); // ensure memory write before trap + /** + * kill kernel (all threads) with error. + * It may cause context destructed. + * `assert(cudaSuccess != error)` + * is another candidate. + */ + asm("trap;"); + } #endif } return error; @@ -118,19 +132,19 @@ __host__ __device__ __forceinline__ cudaError_t _K2CudaDebug( * @brief Macro for checking cuda error. * * @details - * If error is not cudaSuccess, print the error message and exit with the - * error enum value. Otherwise, it does nothing. + * If error is not cudaSuccess, print the error message, and pass the + * optional `bAbort` as `abort` of `K2CudaDebug_`. + * Otherwise, it does nothing except return the error. * - * @param[in] e an enum type indicating CUDA errors. + * @param[in] e one in the enum type that indicates the CUDA error. + * @return the CUDA error returned by `K2CudaDebug_`. * * @code{.cpp} - * K2_CUDA_CHECK_ERROR(cudaGetLastError()); + * K2_CUDA_CHECK_ERROR(error = cudaGetLastError()); * @endcode */ -#define K2_CUDA_CHECK_ERROR(e) \ - if (::k2::_K2CudaDebug((cudaError_t)(e), __FILE__, __LINE__)) { \ - exit(e); \ - } +#define K2_CUDA_CHECK_ERROR(e, bAbort...) \ + ::k2::K2CudaDebug_((cudaError_t)(e), __FILE__, __LINE__, ##bAbort) /** * @def K2_CUDA_SAFE_CALL([cuda_runtime_api|kernel]) @@ -139,8 +153,10 @@ __host__ __device__ __forceinline__ cudaError_t _K2CudaDebug( * or "kernels" return status. * * @details - * - If api return status is not cudaSuccess, print and exit. * - The `cudaDeviceSynchronize` only happens when `NDEBUG` is not defined. + * - Use K2_CUDA_CHECK_ERROR(.., bAbort = true) to deal with the error. + * + * @param[in] * * @note * Kernel launches do not return any error code, thus checking should after it. @@ -152,17 +168,17 @@ __host__ __device__ __forceinline__ cudaError_t _K2CudaDebug( * @endcode */ #ifndef NDEBUG - #define K2_CUDA_SAFE_CALL(...) \ - do { \ - (__VA_ARGS__); \ - cudaDeviceSynchronize(); \ - K2_CUDA_CHECK_ERROR(cudaGetLastError()); \ + #define K2_CUDA_SAFE_CALL(...) \ + do { \ + (__VA_ARGS__); \ + cudaDeviceSynchronize(); \ + K2_CUDA_CHECK_ERROR(cudaGetLastError(), true); \ } while (0) #else - #define K2_CUDA_SAFE_CALL(...) \ - do { \ - (__VA_ARGS__); \ - K2_CUDA_CHECK_ERROR(cudaGetLastError()); \ + #define K2_CUDA_SAFE_CALL(...) \ + do { \ + (__VA_ARGS__); \ + K2_CUDA_CHECK_ERROR(cudaGetLastError(), true); \ } while (0) #endif @@ -208,14 +224,14 @@ __host__ __device__ __forceinline__ cudaError_t _K2CudaDebug( * @endcode */ #ifdef K2_PARANOID - #define K2_PARANOID_ASSERT(exp, format, ...) \ - do { \ - if (exp) \ - (void)0; \ - else { \ - K2_DLOG(" [%s, %d] " format, __FILE__, __LINE__, ##__VA_ARGS__); \ - assert(exp); \ - } \ + #define K2_PARANOID_ASSERT(exp, format, ...) \ + do { \ + if (exp) \ + (void)0; \ + else { \ + K2_DLOG("Error [%s, %d] " format, __FILE__, __LINE__, ##__VA_ARGS__); \ + assert(exp); \ + } \ } while (0) #else #define K2_PARANOID_ASSERT(exp, format, ...) ((void) 0) diff --git a/k2/csrc/cuda/debug_test.cu b/k2/csrc/cuda/debug_test.cu index 566ae7a3c..27201fabf 100644 --- a/k2/csrc/cuda/debug_test.cu +++ b/k2/csrc/cuda/debug_test.cu @@ -4,8 +4,8 @@ // See ../../LICENSE for clarification regarding multiple authors -#include #include +#include #ifndef K2_PARANOID #define K2_PARANOID @@ -56,7 +56,7 @@ __global__ void HelloCUDA(float f) { } // A vector add kernel definition -__global__ void VecAdd(float *A, float *B, float *C) { +__global__ void VecAdd(const float *A, const float *B, float *C) { int i = threadIdx.x; C[i] = A[i] + B[i]; K2_PARANOID_ASSERT(C[i] == A[i] + B[i], @@ -80,7 +80,7 @@ TEST(DebugTest, K2CheckEq) { TEST(DebugTest, K2CudaCheckError) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - ASSERT_DEATH(K2_CUDA_CHECK_ERROR(cudaErrorMemoryAllocation), + ASSERT_DEATH(K2_CUDA_CHECK_ERROR(cudaErrorMemoryAllocation, true), "cudaErrorMemoryAllocation"); } @@ -128,7 +128,8 @@ TEST(DebugTest, K2DLog) { HelloCUDA<<<1, 5>>>(1.2345f); FillContents<<<3, 2>>>(1, d_A); cudaDeviceSynchronize(); - K2_CUDA_CHECK_ERROR(cudaGetLastError()); + auto error = K2_CUDA_CHECK_ERROR(cudaGetLastError()); + EXPECT_EQ(error, cudaSuccess); } cudaFree(d_A); @@ -137,19 +138,12 @@ TEST(DebugTest, K2DLog) { TEST(DebugTest, K2ParanoidAssert) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - { - - ASSERT_DEATH( - K2_PARANOID_ASSERT(2 < 1, "2 unexpectedly smaller than 1\n"), - "Assertion"); - } - { int a = 2; int b = 1; ASSERT_DEATH( K2_PARANOID_ASSERT(a < b, "%d unexpectedly smaller than %d\n", a, b), - "Assertion"); + "Assertion `a < b' failed"); } {