From 4a301ac9abdcce93aa2d341f2f3f37f271a2abb6 Mon Sep 17 00:00:00 2001
From: Meixu Song <songmeixu@outlook.com>
Date: Mon, 31 Aug 2020 20:19:07 +0800
Subject: [PATCH] update to review (#103)

* update to review

- fix a twice error code checking issue
- change macro name `K2_CUDA_CHECK_ERROR` to `K2_CUDA_PRINT_ERROR` as its effect
- add a optional arg `abort` to control abort if hit error
- trivals
fix

* just revert name: `K2_CUDA_CHECK_ERROR`
---
 CMakeLists.txt             | 65 +++++++++++++++++------------
 k2/csrc/cuda/debug.h       | 84 +++++++++++++++++++++++---------------
 k2/csrc/cuda/debug_test.cu | 18 +++-----
 3 files changed, 95 insertions(+), 72 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 331bd1ad7..8f1e748e0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,33 +43,46 @@ if(WIN32 AND BUILD_SHARED_LIBS)
   set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)
 endif()
 
-# the following settings are modified from cub/CMakeLists.txt
-#[[ start settings for CUB ]]
-
-set(CMAKE_CXX_STANDARD 11 CACHE STRING "The C++ version to be used.")
-set(CMAKE_CXX_EXTENSIONS OFF)
-
-message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}")
-
-# Force CUDA C++ standard to be the same as the C++ standard used.
-#
-# Now, CMake is unaligned with reality on standard versions: https://gitlab.kitware.com/cmake/cmake/issues/18597
-# which means that using standard CMake methods, it's impossible to actually sync the CXX and CUDA versions for pre-11
-# versions of C++; CUDA accepts 98 but translates that to 03, while CXX doesn't accept 03 (and doesn't translate that to 03).
-# In case this gives You, dear user, any trouble, please escalate the above CMake bug, so we can support reality properly.
-if(DEFINED CMAKE_CUDA_STANDARD)
-  message(WARNING "You've set CMAKE_CUDA_STANDARD; please note that this variable is ignored, and CMAKE_CXX_STANDARD"
-    " is used as the C++ standard version for both C++ and CUDA.")
+if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18.0")
+  find_package(CUDAToolkit REQUIRED)
+  if(CUDAToolkit_FOUND)
+    message(STATUS "found CUDAToolkit " ${CUDAToolkit_LIBRARY_DIR})
+    message(STATUS "CUDAToolkit_INCLUDE_DIRS " ${CUDAToolkit_INCLUDE_DIRS})
+    message(STATUS "CUDAToolkit_LIBRARY_DIR " ${CUDAToolkit_LIBRARY_DIR})
+
+    enable_language(CUDA)
+
+    # With many architectures set here, the nvcc build time would be much longer.
+    # Thus, "61" is put here for speed and compatibility.
+    # @ToDo Need to cover more architectures, before release these code.
+    set(CMAKE_CUDA_ARCHITECTURES 61)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
+  endif()
+else()
+  # the following settings are modified from cub/CMakeLists.txt
+  set(CMAKE_CXX_STANDARD 11 CACHE STRING "The C++ version to be used.")
+  set(CMAKE_CXX_EXTENSIONS OFF)
+
+  message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}")
+
+  # Force CUDA C++ standard to be the same as the C++ standard used.
+  #
+  # Now, CMake is unaligned with reality on standard versions: https://gitlab.kitware.com/cmake/cmake/issues/18597
+  # which means that using standard CMake methods, it's impossible to actually sync the CXX and CUDA versions for pre-11
+  # versions of C++; CUDA accepts 98 but translates that to 03, while CXX doesn't accept 03 (and doesn't translate that to 03).
+  # In case this gives You, dear user, any trouble, please escalate the above CMake bug, so we can support reality properly.
+  if(DEFINED CMAKE_CUDA_STANDARD)
+    message(WARNING "You've set CMAKE_CUDA_STANDARD; please note that this variable is ignored, and CMAKE_CXX_STANDARD"
+      " is used as the C++ standard version for both C++ and CUDA.")
+  endif()
+  unset(CMAKE_CUDA_STANDARD CACHE)
+  set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
+
+  set(K2_COMPUTE_ARCHS 30 32 35 50 52 53 60 61 62 70 72)
+  foreach(COMPUTE_ARCH IN LISTS K2_COMPUTE_ARCHS)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda -gencode arch=compute_${COMPUTE_ARCH},code=sm_${COMPUTE_ARCH}")
+  endforeach()
 endif()
-unset(CMAKE_CUDA_STANDARD CACHE)
-set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
-
-set(K2_COMPUTE_ARCHS 30 32 35 50 52 53 60 61 62 70 72)
-foreach(COMPUTE_ARCH IN LISTS K2_COMPUTE_ARCHS)
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda -gencode arch=compute_${COMPUTE_ARCH},code=sm_${COMPUTE_ARCH}")
-endforeach()
-
-#[[ end settings for CUB ]]
 
 enable_testing()
 
diff --git a/k2/csrc/cuda/debug.h b/k2/csrc/cuda/debug.h
index f7039a434..aa0b2f783 100644
--- a/k2/csrc/cuda/debug.h
+++ b/k2/csrc/cuda/debug.h
@@ -22,8 +22,8 @@
 #include <device_launch_parameters.h>
 #include <driver_types.h>
 
-#include <stdio.h>
-#include <assert.h>
+#include <cstdio>
+#include <cassert>
 
 namespace k2 {
 
@@ -71,27 +71,28 @@ namespace k2 {
 /**
  * @fn
  *  __host__ __device__ __forceinline__ cudaError_t
- *  _K2CudaDebug(cudaError_t error,
+ *  K2CudaDebug_(cudaError_t error,
  *               const char *filename,
  *               int line)
  *
  * @brief This is an error checking function, with context information.
  *        It's not designed to called by users, but inner macros.
- *        It's made used by other macros.
  *
  * @param[in] error         an enum type indicating CUDA errors.
  * @param[in] filename      the source filename that the error comes from.
  * @param[in] line          the code line that the error happened.
- * @return                  Pass the input CUDA error.
+ * @param[in] abort         this bool control if the error results into `abort`
+ * @return                  the input CUDA error.
  *
  * @code{.cpp}
- * _K2CudaDebug(cudaGetLastError(), __FILE__, __LINE__);
+ * K2CudaDebug_(cudaGetLastError(), __FILE__, __LINE__);
  * @endcode
  */
-__host__ __device__ __forceinline__ cudaError_t _K2CudaDebug(
+__host__ __device__ __forceinline__ cudaError_t K2CudaDebug_(
     cudaError_t error,
     const char *filename,
-    int line) {
+    int line,
+    bool abort = true) {
   if (cudaSuccess != error) {
   #ifndef __CUDA_ARCH__
     fprintf(stderr, "CUDA error ID=%d, NAME=%s, [%s, %d]: %s\n",
@@ -99,6 +100,9 @@ __host__ __device__ __forceinline__ cudaError_t _K2CudaDebug(
             filename, line,
             cudaGetErrorString(error));
     fflush(stderr);
+    if (abort) {
+      exit(error);
+    }
   #elif __CUDA_ARCH__ >= 200
     printf("CUDA error ID=%d, NAME=%s, "
            "[block (%d,%d,%d) thread (%d,%d,%d), %s, %d]: %s\n",
@@ -107,6 +111,16 @@ __host__ __device__ __forceinline__ cudaError_t _K2CudaDebug(
            threadIdx.x, threadIdx.y, threadIdx.z,
            filename, line,
            cudaGetErrorString(error));
+    if (abort) {
+      __threadfence();         // ensure memory write before trap
+      /**
+       * kill kernel (all threads) with error.
+       * It may cause context destructed.
+       * `assert(cudaSuccess != error)`
+       * is another candidate.
+       */
+      asm("trap;");
+    }
   #endif
   }
   return error;
@@ -118,19 +132,19 @@ __host__ __device__ __forceinline__ cudaError_t _K2CudaDebug(
  * @brief Macro for checking cuda error.
  *
  * @details
- *  If error is not cudaSuccess, print the error message and exit with the
- *  error enum value. Otherwise, it does nothing.
+ *  If error is not cudaSuccess, print the error message, and pass the
+ *  optional `bAbort` as `abort` of `K2CudaDebug_`.
+ *  Otherwise, it does nothing except return the error.
  *
- * @param[in] e an enum type indicating CUDA errors.
+ * @param[in] e one in the enum type that indicates the CUDA error.
+ * @return      the CUDA error returned by `K2CudaDebug_`.
  *
  * @code{.cpp}
- * K2_CUDA_CHECK_ERROR(cudaGetLastError());
+ * K2_CUDA_CHECK_ERROR(error = cudaGetLastError());
  * @endcode
  */
-#define K2_CUDA_CHECK_ERROR(e)                                    \
-  if (::k2::_K2CudaDebug((cudaError_t)(e), __FILE__, __LINE__)) { \
-    exit(e);                                                      \
-  }
+#define K2_CUDA_CHECK_ERROR(e, bAbort...)                            \
+  ::k2::K2CudaDebug_((cudaError_t)(e), __FILE__, __LINE__, ##bAbort)
 
 /**
  * @def K2_CUDA_SAFE_CALL([cuda_runtime_api|kernel])
@@ -139,8 +153,10 @@ __host__ __device__ __forceinline__ cudaError_t _K2CudaDebug(
  *        or "kernels" return status.
  *
  * @details
- *  - If api return status is not cudaSuccess, print and exit.
  *  - The `cudaDeviceSynchronize` only happens when `NDEBUG` is not defined.
+ *  - Use K2_CUDA_CHECK_ERROR(.., bAbort = true) to deal with the error.
+ *
+ * @param[in]
  *
  * @note
  *  Kernel launches do not return any error code, thus checking should after it.
@@ -152,17 +168,17 @@ __host__ __device__ __forceinline__ cudaError_t _K2CudaDebug(
  * @endcode
  */
 #ifndef NDEBUG
-  #define K2_CUDA_SAFE_CALL(...)               \
-    do {                                       \
-      (__VA_ARGS__);                           \
-      cudaDeviceSynchronize();                 \
-      K2_CUDA_CHECK_ERROR(cudaGetLastError()); \
+  #define K2_CUDA_SAFE_CALL(...)                     \
+    do {                                             \
+      (__VA_ARGS__);                                 \
+      cudaDeviceSynchronize();                       \
+      K2_CUDA_CHECK_ERROR(cudaGetLastError(), true); \
     } while (0)
 #else
-  #define K2_CUDA_SAFE_CALL(...)               \
-    do {                                       \
-      (__VA_ARGS__);                           \
-      K2_CUDA_CHECK_ERROR(cudaGetLastError()); \
+  #define K2_CUDA_SAFE_CALL(...)                     \
+    do {                                             \
+      (__VA_ARGS__);                                 \
+      K2_CUDA_CHECK_ERROR(cudaGetLastError(), true); \
     } while (0)
 #endif
 
@@ -208,14 +224,14 @@ __host__ __device__ __forceinline__ cudaError_t _K2CudaDebug(
  * @endcode
  */
 #ifdef K2_PARANOID
-  #define K2_PARANOID_ASSERT(exp, format, ...)                           \
-    do {                                                                 \
-      if (exp)                                                           \
-        (void)0;                                                         \
-      else {                                                             \
-        K2_DLOG(" [%s, %d] " format, __FILE__, __LINE__, ##__VA_ARGS__); \
-        assert(exp);                                                     \
-      }                                                                  \
+  #define K2_PARANOID_ASSERT(exp, format, ...)                                \
+    do {                                                                      \
+      if (exp)                                                                \
+        (void)0;                                                              \
+      else {                                                                  \
+        K2_DLOG("Error [%s, %d] " format, __FILE__, __LINE__, ##__VA_ARGS__); \
+        assert(exp);                                                          \
+      }                                                                       \
     } while (0)
 #else
   #define K2_PARANOID_ASSERT(exp, format, ...) ((void) 0)
diff --git a/k2/csrc/cuda/debug_test.cu b/k2/csrc/cuda/debug_test.cu
index 566ae7a3c..27201fabf 100644
--- a/k2/csrc/cuda/debug_test.cu
+++ b/k2/csrc/cuda/debug_test.cu
@@ -4,8 +4,8 @@
 
 // See ../../LICENSE for clarification regarding multiple authors
 
-#include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include <gmock/gmock.h>
 
 #ifndef K2_PARANOID
 #define K2_PARANOID
@@ -56,7 +56,7 @@ __global__ void HelloCUDA(float f) {
 }
 
 // A vector add kernel definition
-__global__ void VecAdd(float *A, float *B, float *C) {
+__global__ void VecAdd(const float *A, const float *B, float *C) {
   int i = threadIdx.x;
   C[i] = A[i] + B[i];
   K2_PARANOID_ASSERT(C[i] == A[i] + B[i],
@@ -80,7 +80,7 @@ TEST(DebugTest, K2CheckEq) {
 
 TEST(DebugTest, K2CudaCheckError) {
   ::testing::FLAGS_gtest_death_test_style = "threadsafe";
-  ASSERT_DEATH(K2_CUDA_CHECK_ERROR(cudaErrorMemoryAllocation),
+  ASSERT_DEATH(K2_CUDA_CHECK_ERROR(cudaErrorMemoryAllocation, true),
                "cudaErrorMemoryAllocation");
 }
 
@@ -128,7 +128,8 @@ TEST(DebugTest, K2DLog) {
     HelloCUDA<<<1, 5>>>(1.2345f);
     FillContents<<<3, 2>>>(1, d_A);
     cudaDeviceSynchronize();
-    K2_CUDA_CHECK_ERROR(cudaGetLastError());
+    auto error = K2_CUDA_CHECK_ERROR(cudaGetLastError());
+    EXPECT_EQ(error, cudaSuccess);
   }
 
   cudaFree(d_A);
@@ -137,19 +138,12 @@ TEST(DebugTest, K2DLog) {
 TEST(DebugTest, K2ParanoidAssert) {
   ::testing::FLAGS_gtest_death_test_style = "threadsafe";
 
-  {
-
-    ASSERT_DEATH(
-        K2_PARANOID_ASSERT(2 < 1, "2 unexpectedly smaller than 1\n"),
-        "Assertion");
-  }
-
   {
     int a = 2;
     int b = 1;
     ASSERT_DEATH(
         K2_PARANOID_ASSERT(a < b, "%d unexpectedly smaller than %d\n", a, b),
-        "Assertion");
+        "Assertion `a < b' failed");
   }
 
   {