diff --git a/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/system/cuda/detail/core/agent_launcher.h index 2140c2e63..dbb26f33f 100644 --- a/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/system/cuda/detail/core/agent_launcher.h @@ -475,9 +475,7 @@ namespace core { #ifdef __CUDACC_RDC__ return core::get_agent_plan(s, d_ptr); #else - core::cuda_optional ptx_version = core::get_ptx_version(); - //CUDA_CUB_RET_IF_FAIL(ptx_version.status()); - return get_agent_plan(ptx_version); + return get_agent_plan(core::get_ptx_version()); #endif } THRUST_RUNTIME_FUNCTION @@ -491,8 +489,7 @@ namespace core { typename core::get_plan::type static get_plan(cudaStream_t , void* d_ptr = 0) { THRUST_UNUSED_VAR(d_ptr); - core::cuda_optional ptx_version = core::get_ptx_version(); - return get_agent_plan(ptx_version); + return get_agent_plan(core::get_ptx_version()); } THRUST_RUNTIME_FUNCTION @@ -528,7 +525,7 @@ namespace core { { #if THRUST_DEBUG_SYNC_FLAG cuda_optional occ = max_sm_occupancy(k); - core::cuda_optional ptx_version = core::get_ptx_version(); + const int ptx_version = core::get_ptx_version(); if (count > 0) { _CubLog("Invoking %s<<<%u, %d, %d, %lld>>>(), %llu items total, %d items per thread, %d SM occupancy, %d vshmem size, %d ptx_version \n", diff --git a/thrust/system/cuda/detail/core/util.h b/thrust/system/cuda/detail/core/util.h index 11efc0858..e2f5f8299 100644 --- a/thrust/system/cuda/detail/core/util.h +++ b/thrust/system/cuda/detail/core/util.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -615,11 +616,57 @@ namespace core { }; THRUST_RUNTIME_FUNCTION - inline cuda_optional get_ptx_version() + inline int get_ptx_version() { int ptx_version = 0; - cudaError_t status = cub::PtxVersion(ptx_version); - return cuda_optional(ptx_version, status); + if (cub::PtxVersion(ptx_version) != cudaSuccess) + { + // Failure might mean that there's no device found + const int current_device = cub::CurrentDevice(); + if (current_device < 0) + { + cuda_cub::throw_on_error(cudaErrorNoDevice, "No GPU is available\n"); + } + + // Any subsequent failure means the provided device binary does not match + // the generated function code + int major = 0, minor = 0; + cudaError_t attr_status; + + attr_status = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device); + cuda_cub::throw_on_error(attr_status, + "get_ptx_version :" + "failed to get major CUDA device compute capability version."); + + attr_status = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device); + cuda_cub::throw_on_error(attr_status, + "get_ptx_version :" + "failed to get minor CUDA device compute capability version."); + + // Index from which SM code has to start in the message below + int code_offset = 37; + char str[] = "This program was not compiled for SM \n"; + + auto print_1_helper = [&](int v) { + str[code_offset] = static_cast(v) + '0'; + code_offset++; + }; + + // Assume two digits will be enough + auto print_2_helper = [&](int v) { + if (v / 10 != 0) { + print_1_helper(v / 10); + } + print_1_helper(v % 10); + }; + + print_2_helper(major); + print_2_helper(minor); + + cuda_cub::throw_on_error(cudaErrorInvalidDevice, str); + } + + return ptx_version; } THRUST_RUNTIME_FUNCTION