diff --git a/config/component_macros.cmake b/config/component_macros.cmake
index b5cb179985..0291fd4f49 100644
--- a/config/component_macros.cmake
+++ b/config/component_macros.cmake
@@ -253,6 +253,7 @@ endmacro( add_component_executable )
 #   SOURCES "file1.cc;file2.cc;..."
 #   HEADERS "file1.hh;file2.hh;..."
 #   LIBRARY_NAME_PREFIX "rtt_"
+#   LIBRARY_TYPE "SHARED"
 #   VENDOR_LIST "MPI;GSL"
 #   VENDOR_LIBS "${MPI_CXX_LIBRARIES};${GSL_LIBRARIES}"
 #   VENDOR_INCLUDE_DIRS "${MPI_CXX_INCLUDE_DIR};${GSL_INCLUDE_DIR}"
@@ -279,7 +280,7 @@ macro( add_component_library )
   cmake_parse_arguments(
     acl
     "NOEXPORT"
-    "PREFIX;TARGET;LIBRARY_NAME;LIBRARY_NAME_PREFIX;LINK_LANGUAGE"
+    "PREFIX;TARGET;LIBRARY_NAME;LIBRARY_NAME_PREFIX;LIBRARY_TYPE;LINK_LANGUAGE"
     "HEADERS;SOURCES;TARGET_DEPS;VENDOR_LIST;VENDOR_LIBS;VENDOR_INCLUDE_DIRS"
     ${ARGV} )
@@ -305,6 +306,11 @@ macro( add_component_library )
     endif()
   endif()
 
+  # if a library type was not specified use the default Draco setting
+  if(NOT acl_LIBRARY_TYPE)
+    set( acl_LIBRARY_TYPE ${DRACO_LIBRARY_TYPE})
+  endif()
+
   #
   # Create the library and set the properties
   #
@@ -314,7 +320,7 @@ macro( add_component_library )
   # extract project name, minus leading "Lib_"
   string( REPLACE "Lib_" "" folder_name ${acl_TARGET} )
 
-  add_library( ${acl_TARGET} ${DRACO_LIBRARY_TYPE} ${acl_SOURCES} )
+  add_library( ${acl_TARGET} ${acl_LIBRARY_TYPE} ${acl_SOURCES} )
   # Some properties are set at a global scope in compilerEnv.cmake:
   # - C_STANDARD, C_EXTENSIONS, CXX_STANDARD, CXX_EXTENSIONS,
   #   CXX_STANDARD_REQUIRED, and POSITION_INDEPENDENT_CODE
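With LIBRARY_TYPE exposed as a parsed option, a component can override the project-wide DRACO_LIBRARY_TYPE default on a per-target basis. A minimal sketch of a caller (the target and file names below are illustrative, not part of this change):

    add_component_library(
      TARGET       Lib_example
      TARGET_DEPS  Lib_dsxx
      LIBRARY_NAME example
      LIBRARY_TYPE STATIC
      SOURCES      "example.cc"
      HEADERS      "example.hh" )

Omitting LIBRARY_TYPE preserves the old behavior, since the macro falls back to ${DRACO_LIBRARY_TYPE}.
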
diff --git a/config/query_gpu.cu b/config/query_gpu.cu
new file mode 100644
index 0000000000..34d64d01d3
--- /dev/null
+++ b/config/query_gpu.cu
@@ -0,0 +1,38 @@
+//----------------------------------*-C++-*----------------------------------//
+/*!
+ * \file   config/query_gpu.cu
+ * \author Alex Long
+ * \brief  Small CUDA code that prints the architecture version, used by CMake
+ * \date   Thu Mar 21 15:53:51 2019
+ * \note   Copyright (C) 2019 Triad National Security, LLC.
+ *         All rights reserved. */
+//---------------------------------------------------------------------------//
+
+// NOTE: This code is from
+// wagonhelm.github.io/articles/2018-03/detecting-cuda-capability-with-cmake
+
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  cudaDeviceProp dP;
+  float min_cc = 3.0;
+
+  int rc = cudaGetDeviceProperties(&dP, 0);
+  if (rc != cudaSuccess) {
+    cudaError_t error = cudaGetLastError();
+    printf("CUDA error: %s", cudaGetErrorString(error));
+    return rc; /* Failure */
+  }
+  if ((dP.major + (dP.minor / 10.0)) < min_cc) {
+    printf("Min Compute Capability of %2.1f required:  %d.%d found\n Not "
+           "Building CUDA Code",
+           min_cc, dP.major, dP.minor);
+    return 1; /* Failure */
+  } else {
+    printf("-arch=sm_%d%d", dP.major, dP.minor);
+    return 0; /* Success */
+  }
+}
+//---------------------------------------------------------------------------//
+// end of query_gpu.cu
+//---------------------------------------------------------------------------//
diff --git a/config/vendor_libraries.cmake b/config/vendor_libraries.cmake
index 285ad5d450..60a6af72e1 100644
--- a/config/vendor_libraries.cmake
+++ b/config/vendor_libraries.cmake
@@ -295,36 +295,29 @@ macro( setupCudaEnv )
   add_feature_info( Cuda WITH_CUDA "Build CUDA kernels for GPU compute.")
 
-  if( WITH_CUDA )
+  if( WITH_CUDA AND NOT DEFINED CUDA_DBS_STRING )
     set( CUDA_DBS_STRING "CUDA" CACHE BOOL
       "If CUDA is available, this variable is 'CUDA'")
-    # message("
-    # CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES = ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
-    # CMAKE_CUDA_HOST_COMPILER = ${CMAKE_CUDA_HOST_COMPILER}
-    # CMAKE_GENERATOR_TOOLSET = ${CMAKE_GENERATOR_TOOLSET}
-    # CMAKE_VS_PLATFORM_TOOLSET_CUDA = ${CMAKE_VS_PLATFORM_TOOLSET_CUDA}
-    # CUDA_EXTENSIONS = ${CUDA_EXTENSIONS}
-    # CUDAHOSTCXX = ${CUDAHOSTCXX}
-    # CUDAFLAGS = ${CUDAFLAGS}
-    # CUDACXX = ${CUDACXX}
-    # CUDA_STANDARD = ${CUDA_STANDARD}
-    # CUDA_SEPARABLE_COMPILATION = ${CUDA_SEPARABLE_COMPILATION}
-    # CUDA_RESOLVE_DEVICE_SYMBOLS = ${CUDA_RESOLVE_DEVICE_SYMBOLS}
-    # CUDA_PTX_COMPILATION = ${CUDA_PTX_COMPILATION}
-    # ")
-
-    # $ENV{CUDACXX}
-    # $ENV{CUDAFLAGS}
-    # $ENV{CUDAHOSTCXX}
-
-    # target properties
-    # - CUDA_EXTENSIONS
-    # - CUDA_PTX_COMPILATION
-    # - CUDA_RESOLVE_DEVICE_SYMBOLS
-    # - CUDA_SEPARABLE_COMPILATION
-    # - CUDA_STANDARD
-    # - CUDA_STANDARD_REQUIRED
+    set(OUTPUTFILE ${CMAKE_CURRENT_SOURCE_DIR}/config/cuda_script) # No suffix required
+    set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/config/query_gpu.cu)
+    execute_process(COMMAND nvcc -lcuda ${CUDAFILE} -o ${OUTPUTFILE})
+    execute_process(COMMAND ${OUTPUTFILE}
+      RESULT_VARIABLE CUDA_RETURN_CODE OUTPUT_VARIABLE ARCH)
+
+    if (${CUDA_RETURN_CODE} EQUAL 0)
+      message(STATUS "CUDA Architecture: ${ARCH}")
+      set(CMAKE_CUDA_FLAGS "${ARCH} -g -G" CACHE STRING
+        "CUDA flags" FORCE)
+      set(CMAKE_CUDA_FLAGS_DEBUG "-O0" CACHE STRING
+        "CUDA debug flags" FORCE)
+      set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 --generate-line-info" CACHE STRING
+        "CUDA release with debug information flags" FORCE)
+      set(CMAKE_CUDA_FLAGS_RELEASE "-O2" CACHE STRING
+        "CUDA release flags" FORCE)
+    else()
+      message(WARNING ${ARCH})
+    endif()
   endif()
 
 endmacro()
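The probe above compiles query_gpu.cu with nvcc at configure time, runs it, and captures its stdout (for example, -arch=sm_70 on a Volta-class device) into ARCH, which then seeds CMAKE_CUDA_FLAGS. One possible hardening, not part of this change, would be to skip the probe when nvcc is not on the path, e.g. when configuring on a GPU-less front end; a hedged sketch:

    find_program(NVCC_EXECUTABLE nvcc)
    if(NOT NVCC_EXECUTABLE)
      message(STATUS "nvcc not found; skipping GPU architecture probe")
    endif()

Because CUDA_DBS_STRING is cached and the new guard tests NOT DEFINED, the probe only runs on the first configure; wipe that cache entry to re-detect hardware.
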
diff --git a/src/device/GPU_Device.cc b/src/device/GPU_Device.cc
index bc2c1e82e5..2670a1b1cd 100644
--- a/src/device/GPU_Device.cc
+++ b/src/device/GPU_Device.cc
@@ -9,6 +9,7 @@
 //---------------------------------------------------------------------------//
 
 #include "GPU_Device.hh"
+#include <cuda_runtime_api.h>
 #include <sstream>
 
 namespace rtt_device {
@@ -26,12 +27,9 @@ namespace rtt_device {
  */
 GPU_Device::GPU_Device(void)
     : deviceCount(0), computeCapability(), deviceName() {
-  // Initialize the library
-  cudaError_enum err = cuInit(0); // currently must be 0.
-  checkForCudaError(err);
 
   // Get a device count, determine compute capability
-  err = cuDeviceGetCount(&deviceCount);
+  cudaError_t err = cudaGetDeviceCount(&deviceCount);
   checkForCudaError(err);
   Insist(deviceCount > 0, "No GPU devices found!");
 
@@ -39,130 +37,65 @@ GPU_Device::GPU_Device(void)
   computeCapability.resize(deviceCount);
 
   for (int device = 0; device < deviceCount; device++) {
-    CUdevice cuDevice;
-    err = cuDeviceGet(&cuDevice, device);
+    int cudaDevice;
+    err = cudaSetDevice(device);
+    checkForCudaError(err);
+    err = cudaGetDevice(&cudaDevice);
     checkForCudaError(err);
+    Check(cudaDevice == device);
 
     // Compute capability revision
     int major = 0;
     int minor = 0;
-    cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                         cuDevice);
-    cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                         cuDevice);
+    cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device);
+    cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device);
     computeCapability[device].push_back(major);
     computeCapability[device].push_back(minor);
 
     // Device name
-    char name[200];
-    err = cuDeviceGetName(name, 200, cuDevice);
+    cudaDeviceProp device_properties;
+    err = cudaGetDeviceProperties(&device_properties, device);
     checkForCudaError(err);
-    deviceName.push_back(std::string(name));
+    deviceName.push_back(std::string(device_properties.name));
 
     // Query and archive device properties.
     {
-      int tmp1(0), tmp2(0), tmp3(0);
-      err = cuDeviceGetAttribute(
-          &tmp1, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuDevice);
-      checkForCudaError(err);
-      m_maxthreadsperblock.push_back(tmp1);
-
-      err = cuDeviceGetAttribute(&tmp1, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
-                                 cuDevice);
-      checkForCudaError(err);
-      err = cuDeviceGetAttribute(&tmp2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
-                                 cuDevice);
-      checkForCudaError(err);
-      err = cuDeviceGetAttribute(&tmp3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
-                                 cuDevice);
-      checkForCudaError(err);
-      m_maxthreadsdim.push_back(std::array<int, 3>{tmp1, tmp2, tmp3});
-
-      err = cuDeviceGetAttribute(&tmp1, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
-                                 cuDevice);
-      checkForCudaError(err);
-      err = cuDeviceGetAttribute(&tmp2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
-                                 cuDevice);
-      checkForCudaError(err);
-      err = cuDeviceGetAttribute(&tmp3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
-                                 cuDevice);
-      checkForCudaError(err);
-      m_maxgridsize.push_back(std::array<int, 3>{tmp1, tmp2, tmp3});
-
-      err = cuDeviceGetAttribute(
-          &tmp1, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, cuDevice);
-      checkForCudaError(err);
-      m_sharedmemperblock.push_back(tmp1);
-
-      err = cuDeviceGetAttribute(
-          &tmp1, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, cuDevice);
-      checkForCudaError(err);
-      m_totalconstantmemory.push_back(tmp1);
+      int tmp(0);
+      m_maxthreadsperblock.push_back(device_properties.maxThreadsPerBlock);
+      m_maxthreadsdim.push_back(
+          std::array<int, 3>{device_properties.maxThreadsDim[0],
+                             device_properties.maxThreadsDim[1],
+                             device_properties.maxThreadsDim[2]});
+      m_maxgridsize.push_back(std::array<int, 3>{
+          device_properties.maxGridSize[0], device_properties.maxGridSize[1],
+          device_properties.maxGridSize[2]});
+      m_sharedmemperblock.push_back(device_properties.sharedMemPerBlock);
 
       err =
-          cuDeviceGetAttribute(&tmp1, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
+          cudaDeviceGetAttribute(&tmp, cudaDevAttrTotalConstantMemory, device);
       checkForCudaError(err);
-      
m_simdwidth.push_back(tmp1); + m_totalconstantmemory.push_back(tmp); - err = - cuDeviceGetAttribute(&tmp1, CU_DEVICE_ATTRIBUTE_MAX_PITCH, cuDevice); - checkForCudaError(err); - m_mempitch.push_back(tmp1); - - err = cuDeviceGetAttribute( - &tmp1, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, cuDevice); - checkForCudaError(err); - m_regsperblock.push_back(tmp1); + m_simdwidth.push_back(device_properties.warpSize); + m_mempitch.push_back(device_properties.memPitch); err = - cuDeviceGetAttribute(&tmp1, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cuDevice); + cudaDeviceGetAttribute(&tmp, cudaDevAttrMaxRegistersPerBlock, device); checkForCudaError(err); - m_clockrate.push_back(tmp1); + m_regsperblock.push_back(tmp); - err = cuDeviceGetAttribute(&tmp1, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - cuDevice); + err = cudaDeviceGetAttribute(&tmp, cudaDevAttrClockRate, device); checkForCudaError(err); - m_texturealign.push_back(tmp1); - } - - // CUdevprop_st properties; - // err = cuDeviceGetProperties(&properties, cuDevice); - // checkForCudaError(err); - // deviceProperties.push_back(properties); - } + m_clockrate.push_back(tmp); - // Save the handle and context for each device - device_handle.resize(2); - context.resize(2); - for (int device = 0; device < deviceCount; device++) { - // Only initialize if compute capability >= 2.0 - if (computeCapability[device][0] >= 2) { - // Save the handle for each device - err = cuDeviceGet(&device_handle[device], device); - checkForCudaError(err); - - // Save the handle for each context - err = cuCtxCreate(&context[device], device, device_handle[device]); + err = cudaDeviceGetAttribute(&tmp, cudaDevAttrTextureAlignment, device); checkForCudaError(err); + m_texturealign.push_back(tmp); } } } -/*! - * \brief destructor - * - * Free the device context and unload any modules. - */ -GPU_Device::~GPU_Device() { - // Free reserved contexts: - for (int device = 0; device < deviceCount; device++) { - // Only initialize if compute capability >= 2.0 - if (computeCapability[device][0] >= 2) { - cudaError_enum err = cuCtxDestroy(context[device]); - checkForCudaError(err); - } - } -} +GPU_Device::~GPU_Device() {} //---------------------------------------------------------------------------// // Print a summary of all GPU devices found @@ -205,11 +138,11 @@ void GPU_Device::printDeviceSummary(int const idevice, * For optimized builds with DRACO_DBC_LEVEL=0, this function will be empty * and any decent compiler will optimize this call away. */ -void GPU_Device::checkForCudaError(cudaError_enum const errorCode) { +void GPU_Device::checkForCudaError(cudaError_t const errorCode) { std::ostringstream msg; msg << "A CUDA call returned the error: \"" << getErrorMessage(errorCode) << "\""; - Insist(errorCode == CUDA_SUCCESS, msg.str()); + Insist(errorCode == cudaSuccess, msg.str()); } #else @@ -218,7 +151,7 @@ void GPU_Device::checkForCudaError(cudaError_enum const errorCode) { * \brief Convert a CUDA return enum value into a descriptive string. * \return descriptive string associated with */ -void GPU_Device::checkForCudaError(cudaError_enum const) { /* empty */ +void GPU_Device::checkForCudaError(cudaError_t const) { /* empty */ } #endif @@ -226,144 +159,133 @@ void GPU_Device::checkForCudaError(cudaError_enum const) { /* empty */ /*! * \brief Return a text string that corresponds to a CUDA error enum. 
*/ -std::string GPU_Device::getErrorMessage(cudaError_enum const err) { - std::string message; +std::string GPU_Device::getErrorMessage(cudaError_t const err) { + auto raw_message = cudaGetErrorString(err); + std::string message(raw_message); + + /* switch (err) { - case CUDA_SUCCESS: + case cudaSuccess: message = std::string("No errors."); break; - case CUDA_ERROR_INVALID_VALUE: + case cudaErrorInvalidValue: message = std::string("Invalid value."); break; - case CUDA_ERROR_OUT_OF_MEMORY: + case cudaErrorMemoryAllocation: message = std::string("Out of memory."); break; - case CUDA_ERROR_NOT_INITIALIZED: + case cudaErrorInitializationError: message = std::string("Driver not initialized."); break; - case CUDA_ERROR_DEINITIALIZED: + case cudaErrorDeviceUninitilialized: message = std::string("Driver deinitialized."); break; - case CUDA_ERROR_NO_DEVICE: + case cudaErrorNoDevice: message = std::string("No CUDA-capable device available."); break; - case CUDA_ERROR_INVALID_DEVICE: + case cudaErrorInvalidDevice: message = std::string("Invalid device."); break; - case CUDA_ERROR_INVALID_IMAGE: + case cudaErrorInvalidKernelImage: message = std::string("Invalid kernel image."); break; - case CUDA_ERROR_INVALID_CONTEXT: + case cudaErrorIncompatibleDriverContext: message = std::string("Invalid context."); break; - case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: - message = std::string("Context already current."); - break; - case CUDA_ERROR_MAP_FAILED: + case cudaErrorMapBufferObjectFailed: message = std::string("Map failed."); break; - case CUDA_ERROR_UNMAP_FAILED: + case cudaErrorUnmapBufferObjectFailed: message = std::string("Unmap failed."); break; - case CUDA_ERROR_ARRAY_IS_MAPPED: + case cudaErrorArrayIsMapped: message = std::string("Array is mapped."); break; - case CUDA_ERROR_ALREADY_MAPPED: + case cudaErrorAlreadyMapped: message = std::string("Already mapped."); break; - case CUDA_ERROR_NO_BINARY_FOR_GPU: - message = std::string("No binary for GPU."); - break; - case CUDA_ERROR_ALREADY_ACQUIRED: + case cudaErrorAlreadyAcquired: message = std::string("Already acquired."); break; - case CUDA_ERROR_NOT_MAPPED: + case cudaErrorNotMapped: message = std::string("Not mapped."); break; - case CUDA_ERROR_INVALID_SOURCE: + case cudaErrorInvalidSource: message = std::string("Invalid source."); break; - case CUDA_ERROR_FILE_NOT_FOUND: + case cudaErrorFileNotFound: message = std::string("File not found."); break; - case CUDA_ERROR_INVALID_HANDLE: + case cudaErrorInvalidResourceHandle: message = std::string("Invalid handle."); break; - case CUDA_ERROR_NOT_FOUND: - message = std::string("Not found."); - break; - case CUDA_ERROR_NOT_READY: + case cudaErrorNotReady: message = std::string("CUDA not ready."); break; - case CUDA_ERROR_LAUNCH_FAILED: + case cudaErrorLaunchFailure: message = std::string("Launch failed."); break; - case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: + case cudaErrorLaunchOutOfResources: message = std::string("Launch exceeded resources."); break; - case CUDA_ERROR_LAUNCH_TIMEOUT: + case cudaErrorLaunchTimeout: message = std::string("Launch exceeded timeout."); break; - case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: + case cudaErrorLaunchIncompatibleTexturing: message = std::string("Launch with incompatible texturing."); break; - case CUDA_ERROR_UNKNOWN: + case cudaErrorUnknown: message = std::string("Unknown error. 
"); break; default: - // CUDA_ERROR_PROFILER_DISABLED - // CUDA_ERROR_PROFILER_NOT_INITIALIZED - // CUDA_ERROR_PROFILER_ALREADY_STARTED - // CUDA_ERROR_PROFILER_ALREADY_STOPPED - // CUDA_ERROR_NOT_MAPPED_AS_ARRAY - // CUDA_ERROR_NOT_MAPPED_AS_POINTER - // CUDA_ERROR_ECC_UNCORRECTABLE - // CUDA_ERROR_UNSUPPORTED_LIMIT - // CUDA_ERROR_CONTEXT_ALREADY_IN_USE - // CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND - // CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - // CUDA_ERROR_OPERATING_SYSTEM - // CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED - // CUDA_ERROR_PEER_ACCESS_NOT_ENABLED - // CUDA_ERROR_PEER_MEMORY_ALREADY_REGISTERED - // CUDA_ERROR_PEER_MEMORY_NOT_REGISTERED - // CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE - // CUDA_ERROR_CONTEXT_IS_DESTROYED + // cudaErrorProfilerDisabled + // cudaErrorNotMappedAsArray + // cudaErrorNotMappedAsPointer + // cudaErrorECCUncorrectable + // cudaErrorUnsupportedLimit + // cudaErrorSharedObjectSymbolNotFound + // cudaErrorSharedObjectInitFailed + // cudaErrorOperatingSystem + // cudaErrorPeerAccessAlreadyEnabled + // cudaErrorPeerAccessUnsupported + // cudaErrorPeerAccessNotEnabled + // cudaErrorContextIsDestroyed message = std::string("Unknown error. "); break; } + */ return message; } //---------------------------------------------------------------------------// /*! - * \brief Wrap the cuMemAlloc funtion to include error checking + * \brief Wrap the cudaMemAlloc funtion to include error checking * * \param nbytes number of bytes to allocate (e.g.: len*sizeof(double) ). * \return GPU device pointer to allocated memory. */ -CUdeviceptr GPU_Device::MemAlloc(unsigned const nbytes) { - CUdeviceptr ptr; - cudaError_enum err = cuMemAlloc(&ptr, nbytes); +void *GPU_Device::MemAlloc(unsigned const nbytes) { + void *ptr; + cudaError_t err = cudaMalloc(&ptr, nbytes); checkForCudaError(err); return ptr; } -void GPU_Device::MemcpyHtoD(CUdeviceptr ptr, void const *loc, unsigned nbytes) { - cudaError_enum err = cuMemcpyHtoD(ptr, loc, nbytes); +void GPU_Device::MemcpyHtoD(void *ptr, void const *loc, unsigned nbytes) { + cudaError_t err = cudaMemcpy(ptr, loc, nbytes, cudaMemcpyHostToDevice); checkForCudaError(err); return; } -void GPU_Device::MemcpyDtoH(void *loc, CUdeviceptr ptr, unsigned nbytes) { - cudaError_enum err = cuMemcpyDtoH(loc, ptr, nbytes); +void GPU_Device::MemcpyDtoH(void *loc, void *ptr, unsigned nbytes) { + cudaError_t err = cudaMemcpy(loc, ptr, nbytes, cudaMemcpyDeviceToHost); checkForCudaError(err); return; } -void GPU_Device::MemFree(CUdeviceptr ptr) { - cudaError_enum err = cuMemFree(ptr); +void GPU_Device::MemFree(void *ptr) { + cudaError_t err = cudaFree(ptr); checkForCudaError(err); return; } diff --git a/src/device/GPU_Device.cmake b/src/device/GPU_Device.cmake index 372377c394..db22476f2e 100644 --- a/src/device/GPU_Device.cmake +++ b/src/device/GPU_Device.cmake @@ -15,6 +15,7 @@ set( TEST_KERNEL_BINDIR ${PROJECT_BINARY_DIR}/test CACHE PATH "GPU kernel binary install location" ) +set( CUDA_DEVICE ON) configure_file( config.h.in ${PROJECT_BINARY_DIR}/device/config.h ) # ---------------------------------------------------------------------------- # @@ -23,12 +24,12 @@ configure_file( config.h.in ${PROJECT_BINARY_DIR}/device/config.h ) set( sources GPU_Device.cc - GPU_Module.cc ) + ) set( headers GPU_Device.hh - GPU_Module.hh ${PROJECT_BINARY_DIR}/device/config.h - device_cuda.h ) + device_cuda.h + ) # ---------------------------------------------------------------------------- # # Build package library @@ -38,11 +39,14 @@ add_component_library( TARGET Lib_device TARGET_DEPS 
diff --git a/src/device/GPU_Device.cmake b/src/device/GPU_Device.cmake
index 372377c394..db22476f2e 100644
--- a/src/device/GPU_Device.cmake
+++ b/src/device/GPU_Device.cmake
@@ -15,6 +15,7 @@ set( TEST_KERNEL_BINDIR ${PROJECT_BINARY_DIR}/test CACHE PATH
   "GPU kernel binary install location" )
 
+set( CUDA_DEVICE ON)
 configure_file( config.h.in ${PROJECT_BINARY_DIR}/device/config.h )
 
 # ---------------------------------------------------------------------------- #
@@ -23,12 +24,12 @@ configure_file( config.h.in ${PROJECT_BINARY_DIR}/device/config.h )
 
 set( sources
   GPU_Device.cc
-  GPU_Module.cc )
+  )
 
 set( headers
   GPU_Device.hh
-  GPU_Module.hh
   ${PROJECT_BINARY_DIR}/device/config.h
-  device_cuda.h )
+  device_cuda.h
+  )
 
 # ---------------------------------------------------------------------------- #
 # Build package library
@@ -38,11 +39,14 @@ add_component_library(
   TARGET       Lib_device
   TARGET_DEPS  Lib_dsxx
   LIBRARY_NAME device
+  LIBRARY_TYPE STATIC
   SOURCES      "${sources}"
   HEADERS      "${headers}" )
 target_include_directories( Lib_device
   PUBLIC $
-  PUBLIC $ )
+  PUBLIC $
+  )
+set_property(TARGET Lib_device PROPERTY CUDA_SEPARABLE_COMPILATION ON)
 
 # ---------------------------------------------------------------------------- #
 # Installation instructions
diff --git a/src/device/GPU_Device.hh b/src/device/GPU_Device.hh
index 45189d76a2..f7a76e7d56 100644
--- a/src/device/GPU_Device.hh
+++ b/src/device/GPU_Device.hh
@@ -48,10 +48,7 @@ namespace rtt_device {
  * \endcode
  *
  * \example device/test/gpu_hello_rt_api.cu
- * Test of GPU_Device for CUDA runtime environment.
- *
- * \example device/test/gpu_hello_driver_api.cc
- * Test of GPU_Device for CUDA driver environment.
+ * Test of GPU_Device with the CUDA runtime API.
  */
 
//===========================================================================//
@@ -106,17 +103,6 @@ public:
    * textureAlign bytes do not need an offset applied to texture fetches */
   int textureAlign(int devId = 0) const { return m_texturealign[devId]; }
 
-  //! Return the device handle
-  CUdevice deviceHandle(int idevice) const {
-    Require(idevice < deviceCount);
-    return device_handle[idevice];
-  }
-  //! Return the context handle
-  CUcontext contextHandle(int idevice) const {
-    Require(idevice < deviceCount);
-    return context[idevice];
-  }
-
   // SERVICES
   //! Print a summary of idevice's features to ostream out.
   void printDeviceSummary(int const idevice,
@@ -127,17 +113,17 @@ public:
     return (offset + alignment - 1) & ~(alignment - 1);
   }
   //! Check cuda return code and throw an Insist on error.
-  static std::string getErrorMessage(cudaError_enum const err);
+  static std::string getErrorMessage(cudaError_t const err);
   //! Check the value of the return code for CUDA calls.
-  static void checkForCudaError(cudaError_enum const errorCode);
+  static void checkForCudaError(cudaError_t const errorCode);
   //! Wrap the cuMemAlloc call to include error checking
-  static CUdeviceptr MemAlloc(unsigned const nbytes);
+  static void *MemAlloc(unsigned const nbytes);
   //! Wrap cuMemcpyHtoD() to include error checking.
-  static void MemcpyHtoD(CUdeviceptr ptr, void const *loc, unsigned nbytes);
+  static void MemcpyHtoD(void *ptr, void const *loc, unsigned nbytes);
   //! Wrap cuMemcpyDtoH() to include error checking.
-  static void MemcpyDtoH(void *loc, CUdeviceptr ptr, unsigned nbytes);
+  static void MemcpyDtoH(void *loc, void *ptr, unsigned nbytes);
   //! Wrap cuMemFree() to include error checking.
-  static void MemFree(CUdeviceptr ptr);
+  static void MemFree(void *ptr);
 
 protected:
   // IMPLEMENTATION
@@ -163,16 +149,6 @@ private:
   std::vector<int> m_regsperblock;
   std::vector<int> m_clockrate;
   std::vector<int> m_texturealign;
-
-  //! Device handles (one per device)
-  std::vector<CUdevice> device_handle;
-  /*! Device context (one per handle)
-   *
-   * Current implementation only allows 1 context per GPU. However, the CUDA
-   * Driver API provides for the concept of pushing and poping various contexts
-   * on the GPU.
-   */
-  std::vector<CUcontext> context;
 };
 
 } // end namespace rtt_device
diff --git a/src/device/GPU_Module.cc b/src/device/GPU_Module.cc
deleted file mode 100644
index 2d53250ab6..0000000000
--- a/src/device/GPU_Module.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-//----------------------------------*-C++-*----------------------------------//
-/*!
- * \file   device/GPU_Module.cc
- * \brief
- * \note   Copyright (C) 2016-2019 Triad National Security, LLC.
- *         All rights reserved. 
*/ -//---------------------------------------------------------------------------// - -#include "GPU_Module.hh" -#include "GPU_Device.hh" -#include "device/config.h" -#include "ds++/path.hh" -#include - -namespace rtt_device { - -//---------------------------------------------------------------------------// -// Constructor -//---------------------------------------------------------------------------// -/*! - * \brief Constructor - * \arg myPtxFile - the name of a ptx or cubin file. Ptx files will be - * compiled at runtime into cubins. Prefer the use of cubin to ruduce runtime. - * - * Create a GPU_Module object. - */ -GPU_Module::GPU_Module(std::string const &myPtxFile) - : ptxFile(findPtxFile(myPtxFile)) { - // load the module - cudaError_enum err = cuModuleLoad(&cuModule, ptxFile.c_str()); - rtt_device::GPU_Device::checkForCudaError(err); -} - -//---------------------------------------------------------------------------// -/*! - * \brief destructor - * - * Free the the loaded modules. - */ -GPU_Module::~GPU_Module() { - // Unload the module - cudaError_enum err = cuModuleUnload(cuModule); - rtt_device::GPU_Device::checkForCudaError(err); -} - -//---------------------------------------------------------------------------// -/*! - * \brief findPtxFile - * - * \param myPtxFile filename or path to the ptx file that is to be loaded. - * \return fully qualified path to the disired ptx file - */ -std::string GPU_Module::findPtxFile(std::string const &myPtxFile) { - Require(myPtxFile.length() > 0); - - // Location of GPU ptx files - read from config.h - std::string const testDir(rtt_device::test_kernel_bindir); - // return value - std::string ptxFile; - - // std::cout << "Looking at:\n" - // << myPtxFile << "\n" - // << testDir + std::string("/") + myPtxFile << std::endl; - - // Find the ptx file - if (rtt_dsxx::fileExists(myPtxFile)) - ptxFile = myPtxFile; - else if (rtt_dsxx::fileExists(std::string("../") + myPtxFile)) - ptxFile = std::string("../") + myPtxFile; - else if (rtt_dsxx::fileExists(testDir + std::string("/") + myPtxFile)) - ptxFile = testDir + std::string("/") + myPtxFile; - - Insist(rtt_dsxx::fileExists(ptxFile), - (std::string("Cannot find requested file: ") + myPtxFile).c_str()); - - return ptxFile; -} - -//---------------------------------------------------------------------------// -/*! - * \brief Find a function in the current module and return a handle. - * - * \param functionName the name of the CUDA function difined in the ptx - * module. - * \return a CUfunction handle that points to the requested function. - */ -CUfunction -GPU_Module::getModuleFunction(std::string const &functionName) const { - // Load the kernel from the module - CUfunction kernel; - cudaError_enum err = - cuModuleGetFunction(&kernel, cuModule, functionName.c_str()); - GPU_Device::checkForCudaError(err); - return kernel; -} - -} // end namespace rtt_device - -//---------------------------------------------------------------------------// -// end of GPU_Module.cc -//---------------------------------------------------------------------------// diff --git a/src/device/GPU_Module.hh b/src/device/GPU_Module.hh deleted file mode 100644 index ea03f13614..0000000000 --- a/src/device/GPU_Module.hh +++ /dev/null @@ -1,82 +0,0 @@ -//----------------------------------*-C++-*----------------------------------// -/*! - * \file device/GPU_Module.hh - * \author Kelly (KT) Thompson - * \brief Define class GPU_Module - * \note Copyright (C) 2016-2019 Triad National Security, LLC. - * All rights reserved. 
*/
-//---------------------------------------------------------------------------//
-
-#ifndef device_GPU_Module_hh
-#define device_GPU_Module_hh
-
-#include "device_cuda.h"
-#include <string>
-
-namespace rtt_device {
-
-//===========================================================================//
-/*!
- * \class GPU_Module
- * \brief
- *
- * CUDA modules are dynamically loadable packages of device code and data
- * (akin to dynamic shared libraries)
- *
- * \sa GPU_Module.cc for detailed descriptions.
- *
- * \par Code Sample:
- * \code
- *     cout << "Hello, world." << endl;
- * \endcode
- *
- * \sa device/test/gpu_hello_driver_api.cc
- */
-//===========================================================================//
-
-class GPU_Module {
-public:
-  // NESTED CLASSES AND TYPEDEFS
-
-  // CREATORS
-
-  //! Default constructors.
-  GPU_Module(std::string const &myPtxFile);
-
-  //! Copy constructor (the long doxygen description is in the .cc file).
-  // GPU_Module(const GPU_Module &rhs);
-
-  //! Destructor.
-  ~GPU_Module();
-
-  // MANIPULATORS
-
-  //! Assignment operator for GPU_Module.
-  // GPU_Module& operator=(const GPU_Module &rhs);
-
-  // ACCESSORS
-  CUmodule handle(void) { return cuModule; }
-
-  // SERVICES
-  CUfunction getModuleFunction(std::string const &functionName) const;
-
-  // IMPLEMENTATION
-  static std::string findPtxFile(std::string const &myPtxFile);
-
-private:
-  // NESTED CLASSES AND TYPEDEFS
-
-  // IMPLEMENTATION
-
-  // DATA
-  std::string const ptxFile;
-  CUmodule cuModule;
-};
-
-} // end namespace rtt_device
-
-#endif // device_GPU_Module_hh
-
-//---------------------------------------------------------------------------//
-// end of device/GPU_Module.hh
-//---------------------------------------------------------------------------//
diff --git a/src/device/config.h.in b/src/device/config.h.in
index 198b0fce42..ed494b8868 100644
--- a/src/device/config.h.in
+++ b/src/device/config.h.in
@@ -12,6 +12,12 @@
 #define device_config_h
 
 #include <string>
+#cmakedefine CUDA_DEVICE
+#ifdef CUDA_DEVICE
+  #define HOST_AND_DEVICE_FUNCTION __host__ __device__
+#else
+  #define HOST_AND_DEVICE_FUNCTION
+#endif
 
 namespace rtt_device {
diff --git a/src/device/device_cuda.h b/src/device/device_cuda.h
index 69ab29ee66..e42388c93b 100644
--- a/src/device/device_cuda.h
+++ b/src/device/device_cuda.h
@@ -27,6 +27,7 @@
 #endif
 
 #include <cuda.h>
+#include <cuda_runtime_api.h>
 
 #if defined __GNUC__
 #pragma GCC system_header
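The HOST_AND_DEVICE_FUNCTION macro added to config.h.in lets one function definition serve both compilers: under nvcc with CUDA_DEVICE defined it expands to __host__ __device__, otherwise to nothing. A hedged sketch of a consumer (the function itself is hypothetical, not part of this change):

    #include "device/config.h"

    // Builds as a dual host/device function under nvcc, and as an
    // ordinary host function in a CPU-only configuration.
    HOST_AND_DEVICE_FUNCTION double celsius_to_kelvin(double c) {
      return c + 273.15;
    }
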
diff --git a/src/device/test/Dual_Call.cu b/src/device/test/Dual_Call.cu
new file mode 100644
index 0000000000..8283aac641
--- /dev/null
+++ b/src/device/test/Dual_Call.cu
@@ -0,0 +1,112 @@
+//----------------------------------*-C++-*----------------------------------//
+/*!
+ * \file   device/test/Dual_Call.cu
+ * \author Alex R. Long
+ * \date   Mon Mar 25 2019
+ * \brief  Show how code can be called from GPU and host
+ * \note   Copyright (C) 2019 Triad National Security, LLC.
+ *         All rights reserved.
+ */
+//---------------------------------------------------------------------------//
+
+#include "Dual_Call.hh"
+
+namespace rtt_device_test {
+
+//---------------------------------------------------------------------------//
+/*!
+ * \brief Calculate the number of source particles for a range of cells
+ *
+ * \param[in] part_per_e particles per unit energy for this source
+ * \param[in] max_particles_pspc max particles per species
+ * \param[in] cell_start starting cell index
+ * \param[in] cell_end ending cell index
+ * \param[in] e_field energy in a cell
+ * \param[in] src_cell_bias bias in a cell
+ * \param[in,out] n_field destination for particles in a cell
+ * \return number of particles over this cell range
+ */
+__host__ __device__ unsigned long long sub_conserve_calc_num_src_particles(
+    const double part_per_e, unsigned max_particles_pspc,
+    const size_t cell_start, const size_t cell_end, const double *e_field,
+    const double *src_cell_bias, int *n_field) {
+  unsigned long long ntot = 0;
+
+  ntot = 0;
+
+  // sweep through cells and calculate number of particles per cell
+  for (size_t cell = cell_start; cell < cell_end; cell++) {
+    // if the cell has any energy try to put some particles in it
+    if (e_field[cell] > 0.0) {
+      // get estimate of number of particles per cell to nearest
+      // integer per species, a cell-based bias can be added that simply
+      // multiplies the expected number by a user defined bias; the
+      // energy balance will still be correct because particles will
+      // simply be subtracted from other cells to compensate
+      const double d_num = e_field[cell] * part_per_e * src_cell_bias[cell];
+      //Check(d_num > 0.0);
+      // Check( d_num < static_cast<double>(max_particles_pspc) );
+
+      // We are about to cast d_num back to int. Ensure that the
+      // conversion is valid. If not, set the number of particles to
+      // the ceiling value provided in Source.hh.
+      if (d_num < static_cast<double>(max_particles_pspc - 1)) {
+        n_field[cell] = static_cast<int>(d_num + 0.5);
+
+        // try to get at least one particle per cell per species
+        if (n_field[cell] == 0)
+          n_field[cell] = 1;
+
+      } else {
+        n_field[cell] = max_particles_pspc;
+      }
+
+      // increment particle counter (uint64_t += int)
+      ntot += n_field[cell];
+    } else
+      n_field[cell] = 0;
+  }
+  return ntot;
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * \brief Launch a kernel to calculate the number of source particles
+ *
+ * \param[in] part_per_e particles per unit energy for this source
+ * \param[in] max_particles_pspc max particles per species
+ * \param[in] cont_size size of all fields
+ * \param[in] e_field energy in a cell
+ * \param[in] src_cell_bias bias in a cell
+ * \param[in,out] n_field destination for particles in a cell
+ * \param[in,out] ntot total particles per thread block
+ */
+__global__ void cuda_conserve_calc_num_src_particles(
+    const double part_per_e, unsigned max_particles_pspc, int cont_size,
+    const double *e_field, const double *src_cell_bias, int *n_field,
+    unsigned long long *ntot) {
+
+  __shared__ unsigned long long shared_data[512];
+  size_t cell_start = threadIdx.x + blockIdx.x * blockDim.x;
+  size_t cell_end = cell_start + 1;
+  if (cell_start < cont_size) {
+    shared_data[threadIdx.x] = sub_conserve_calc_num_src_particles(
+        part_per_e, max_particles_pspc, cell_start, cell_end, e_field,
+        src_cell_bias, n_field);
+  } else
+    shared_data[threadIdx.x] = 0;
+  __syncthreads();
+  for (unsigned int s = 1; s < blockDim.x; ++s) {
+    if (s == threadIdx.x)
+      shared_data[0] += shared_data[s];
+    __syncthreads();
+  }
+  __syncthreads();
+  ntot[blockIdx.x] = shared_data[0];
+}
+
+} // namespace rtt_device_test
+
+//---------------------------------------------------------------------------//
+// end of device/test/Dual_Call.cu
+//---------------------------------------------------------------------------//
diff --git a/src/device/test/Dual_Call.hh b/src/device/test/Dual_Call.hh
new file mode 100644
index 0000000000..b2b4b97860
--- /dev/null
+++ b/src/device/test/Dual_Call.hh
@@ -0,0 +1,39 @@
+//----------------------------------*-C++-*----------------------------------//
+/*!
+ * \file   device/test/Dual_Call.hh
+ * \author Alex R. Long
+ * \date   Mon Mar 25 2019
+ * \brief  Show how code can be called from GPU and host
+ * \note   Copyright (C) 2019 Triad National Security, LLC.
+ *         All rights reserved.
+ */
+//---------------------------------------------------------------------------//
+
+#ifndef rtt_device_test_Dual_Call_hh
+#define rtt_device_test_Dual_Call_hh
+
+#include "device/config.h"
+#include
+#include
+#include
+#include
+
+namespace rtt_device_test {
+
+__host__ __device__ unsigned long long sub_conserve_calc_num_src_particles(
+    const double part_per_e, unsigned max_particles_pspc,
+    const size_t cell_start, const size_t cell_end, const double *e_field,
+    const double *src_cell_bias, int *n_field);
+
+__global__ void cuda_conserve_calc_num_src_particles(
+    const double part_per_e, unsigned max_particles_pspc, int cont_size,
+    const double *e_field, const double *src_cell_bias, int *n_field,
+    unsigned long long *ntot);
+
+} // namespace rtt_device_test
+
+#endif // rtt_device_test_Dual_Call_hh
+
+//---------------------------------------------------------------------------//
+// end of device/test/Dual_Call.hh
+//---------------------------------------------------------------------------//
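Because sub_conserve_calc_num_src_particles is compiled __host__ __device__, the identical source-particle count can be produced on the CPU and inside a kernel and then compared, which is exactly what the new test below does. A minimal host-side sketch (the field sizes are hypothetical):

    #include "device/test/Dual_Call.hh"
    #include <vector>

    std::vector<double> e_field(10, 1.0), bias(10, 1.0);
    std::vector<int> n_field(10, 0);
    // host call sweeps the whole cell range [0, 10)
    unsigned long long n_tot =
        rtt_device_test::sub_conserve_calc_num_src_particles(
            1.0, 100, 0, 10, &e_field[0], &bias[0], &n_field[0]);
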
diff --git a/src/device/test/GPU_Device.cmake b/src/device/test/GPU_Device.cmake
index 490ff11f4a..587a60a6fd 100644
--- a/src/device/test/GPU_Device.cmake
+++ b/src/device/test/GPU_Device.cmake
@@ -11,37 +11,42 @@
 set( test_sources
   gpu_hello_rt_api.cu
-  # The low level Driver-API is not really supported by NVidia anymore.
-  # gpu_hello_driver_api.cc
-  )
+  gpu_device_info.cu
+  gpu_dual_call_test.cu
+)
 
-# set( cuda_sources
-#   gpu_kernel.cu
-#   vector_add.cu
-# )
+set(cuda_headers
+  basic_kernels.hh
+  Dual_Call.hh
+)
+
+set(cuda_sources
+  basic_kernels.cu
+  Dual_Call.cu
+)
 
 # ---------------------------------------------------------------------------- #
 # Build Unit tests
 # ---------------------------------------------------------------------------- #
 
 # Stuff cuda code into a test library.
-# add_library( Lib_device_test SHARED ${cuda_sources} )
-
-#add_executable( Ut_gpu_hello_driver_api_exe gpu_hello_driver_api.cc )
-#target_link_libraries( Ut_gpu_hello_driver_api_exe Lib_device )
-
-add_executable( Ut_gpu_hello_rt_api_exe gpu_hello_rt_api.cu )
-target_link_libraries( Ut_gpu_hello_rt_api_exe Lib_dsxx )
-# target_link_libraries( Ut_gpu_hello_rt_api_exe Lib_device_test Lib_dsxx )
-#target_link_libraries( Ut_gpu_hello_rt_api_exe Lib_device )
+add_component_library(
+  TARGET       Lib_device_test
+  TARGET_DEPS  Lib_dsxx Lib_device
+  LIBRARY_NAME device_test
+  LIBRARY_TYPE STATIC
+  SOURCES      "${cuda_sources}"
+  HEADERS      "${cuda_headers}" )
+set_property(TARGET Lib_device_test PROPERTY CUDA_SEPARABLE_COMPILATION ON)
 
 # We need to explicitly state that we need all CUDA files in the particle
 # library to be built with -dc as the member functions could be called by
 # other libraries and executables
-set_target_properties( Ut_gpu_hello_rt_api_exe PROPERTIES
-  CUDA_SEPARABLE_COMPILATION ON )
-target_include_directories( Ut_gpu_hello_rt_api_exe
-  PRIVATE $ )
+set_target_properties( Lib_device_test PROPERTIES
+  CUDA_SEPARABLE_COMPILATION ON )
+
+#target_include_directories( Ut_gpu_hello_rt_api_exe
+#  PRIVATE $ )
 #  $ )
 #  PUBLIC $ )
@@ -54,16 +59,17 @@
 # Register Unit tests
 # ---------------------------------------------------------------------------- #
 
-# set( test_deps
-#   Lib_device
-#   ${CUDA_CUDA_LIBRARY} )
-# add_scalar_tests(
-#   SOURCES "${test_sources}"
-#   DEPS    "${test_deps}"
-#   )
+set( test_deps
+  Lib_device
+  Lib_device_test)
+
+add_scalar_tests(
+  SOURCES "${test_sources}"
+  DEPS    "${test_deps}"
+  )
 
-add_test( NAME device_gpu_hello_rt_api
-  COMMAND $<TARGET_FILE:Ut_gpu_hello_rt_api_exe> )
+#add_test( NAME device_gpu_hello_rt_api
+#  COMMAND $<TARGET_FILE:Ut_gpu_hello_rt_api_exe> )
 set_tests_properties( device_gpu_hello_rt_api PROPERTIES
   PASS_REGULAR_EXPRESSION ".*[Tt]est: PASSED"
   FAIL_REGULAR_EXPRESSION ".*[Tt]est: FAILED" )
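Packing the kernels into a STATIC library with CUDA_SEPARABLE_COMPILATION ON means every test executable links device code that was compiled with -dc. Any future consumer would follow the same pattern; a hedged sketch for a hypothetical executable target:

    add_executable( Ut_example_exe example.cu )  # hypothetical target
    target_link_libraries( Ut_example_exe Lib_device_test Lib_device Lib_dsxx )
    set_target_properties( Ut_example_exe PROPERTIES
      CUDA_SEPARABLE_COMPILATION ON )
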
diff --git a/src/device/test/basic_kernels.cu b/src/device/test/basic_kernels.cu
new file mode 100644
index 0000000000..68fe0966a8
--- /dev/null
+++ b/src/device/test/basic_kernels.cu
@@ -0,0 +1,48 @@
+//----------------------------------*-C++-*----------------------------------//
+/*!
+ * \file   device/test/basic_kernels.cu
+ * \author Kelly Thompson
+ * \date
+ * \brief  Small kernel code for testing GPU Device framework.
+ * \note   Copyright (C) 2016-2019 Triad National Security, LLC.
+ *         All rights reserved. */
+//---------------------------------------------------------------------------//
+
+namespace rtt_device_test {
+
+//---------------------------------------------------------------------------//
+/*!
+ * \brief CUDA kernel for adding two numbers
+ *
+ * \param[in,out] dest location to store sum
+ * \param[in] a value to add
+ * \param[in] b value to add
+ */
+__global__ void sum(int *dest, int a, int b) {
+  // Assuming a single thread, 1x1x1 block, 1x1 grid
+  *dest = a + b;
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * \brief CUDA kernel for adding two vectors
+ *
+ * \param[in] A_dev vector to add
+ * \param[in] B_dev vector to add
+ * \param[in,out] C_dev location to store solution vector
+ * \param[in] N length of vectors
+ */
+__global__ void vector_add(double const *A_dev, double const *B_dev,
+                           double *C_dev, int const N) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  /* if(i%512==0)
+   *   printf("index %d\n",i); */
+  if (i < N)
+    C_dev[i] = A_dev[i] + B_dev[i];
+}
+
+} // namespace rtt_device_test
+
+//---------------------------------------------------------------------------//
+// end of basic_kernels.cu
+//---------------------------------------------------------------------------//
diff --git a/src/device/test/basic_kernels.hh b/src/device/test/basic_kernels.hh
new file mode 100644
index 0000000000..7f6996aad3
--- /dev/null
+++ b/src/device/test/basic_kernels.hh
@@ -0,0 +1,34 @@
+//----------------------------------*-C++-*----------------------------------//
+/*!
+ * \file   device/test/basic_kernels.hh
+ * \author Alex R. Long
+ * \date   Mon Mar 25 2019
+ * \brief  Simple kernels for basic GPU tests
+ * \note   Copyright (C) 2019 Triad National Security, LLC.
+ *         All rights reserved.
+ */
+//---------------------------------------------------------------------------//
+
+#ifndef rtt_device_test_basic_kernels_hh
+#define rtt_device_test_basic_kernels_hh
+
+#include "device/config.h"
+#include
+#include
+#include
+
+namespace rtt_device_test {
+
+__global__ void vector_add(double const *A_dev, double const *B_dev,
+                           double *C_dev, int const N);
+
+__global__ void sum(int *dest, int a, int b);
+
+} // namespace rtt_device_test
+
+#endif // rtt_device_test_basic_kernels_hh
+
+//---------------------------------------------------------------------------//
+// end of device/test/basic_kernels.hh
+//---------------------------------------------------------------------------//
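These kernels are now declared in a header and linked from Lib_device_test rather than textually #include'd as .cu files. A hedged sketch of launching vector_add through the runtime API (sizes and buffer names are illustrative, not from the tests):

    #include "device/test/basic_kernels.hh"

    int const N = 1024, threads = 256;
    int const blocks = (N + threads - 1) / threads;
    double *A_dev, *B_dev, *C_dev; // device buffers, filled via cudaMemcpy
    cudaMalloc((void **)&A_dev, N * sizeof(double));
    cudaMalloc((void **)&B_dev, N * sizeof(double));
    cudaMalloc((void **)&C_dev, N * sizeof(double));
    rtt_device_test::vector_add<<<blocks, threads>>>(A_dev, B_dev, C_dev, N);
    cudaDeviceSynchronize();
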
diff --git a/src/device/test/gpu_device_info.cu b/src/device/test/gpu_device_info.cu
new file mode 100644
index 0000000000..712b0ada6b
--- /dev/null
+++ b/src/device/test/gpu_device_info.cu
@@ -0,0 +1,77 @@
+//----------------------------------*-C++-*----------------------------------//
+/*!
+ * \file   device/test/gpu_device_info.cu
+ * \author Alex Long
+ * \date   Thu Mar 21 15:28:48 2019
+ * \brief  Simple test of the CUDA Runtime API through the GPU_Device object
+ * \note   Copyright (C) 2019 Triad National Security, LLC.
+ *         All rights reserved. */
+//---------------------------------------------------------------------------//
+
+#include "device/GPU_Device.hh"
+#include "device/config.h"
+#include "ds++/Assert.hh"
+#include "ds++/DracoStrings.hh"
+#include "ds++/Release.hh"
+#include "ds++/ScalarUnitTest.hh"
+#include "ds++/Soft_Equivalence.hh"
+#include "ds++/SystemCall.hh"
+#include "ds++/path.hh"
+#include <iostream>
+#include <map>
+#include <sstream>
+
+//---------------------------------------------------------------------------//
+// query_device
+//---------------------------------------------------------------------------//
+
+void query_device(rtt_dsxx::ScalarUnitTest &ut) {
+  using namespace std;
+
+  cout << "Starting gpu_device_info::query_device()...\n" << endl;
+
+  // Create a GPU_Device object.
+  // Initialize the CUDA library and sets device and context handles.
+  rtt_device::GPU_Device gpu;
+
+  // Create and then print a summary of the devices found.
+  std::ostringstream out;
+  size_t const numDev(gpu.numDevicesAvailable());
+  out << "GPU device summary:\n\n"
+      << "   Number of devices found: " << numDev << "\n"
+      << endl;
+  for (size_t device = 0; device < numDev; ++device)
+    gpu.printDeviceSummary(device, out);
+
+  // Print the message to stdout
+  cout << out.str();
+
+  // Parse the output
+  bool verbose(false);
+  std::map<std::string, unsigned> wordCount =
+      rtt_dsxx::get_word_count(out, verbose);
+
+  FAIL_IF_NOT(wordCount[string("Device")] == numDev);
+  // successful test output
+  if (ut.numFails == 0)
+    PASSMSG("gpu_device_info_test query_device test OK.");
+  return;
+}
+
+//---------------------------------------------------------------------------//
+// Main
+//---------------------------------------------------------------------------//
+
+int main(int argc, char *argv[]) {
+  using namespace std;
+
+  rtt_dsxx::ScalarUnitTest ut(argc, argv, rtt_dsxx::release);
+  try {
+    query_device(ut);
+  }
+  UT_EPILOG(ut);
+}
+
+//---------------------------------------------------------------------------//
+// end of gpu_device_info.cu
+//---------------------------------------------------------------------------//
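A quick check of the launch arithmetic used in the next test: with n_cells = 1029 and threads_per_block = 512, the ceiling division (1029 + 511) / 512 gives 3 blocks, so the grid spans 3 * 512 = 1536 threads and the kernel's bounds guard idles the final 507. In code:

    int const n_cells = 1029, threads_per_block = 512;
    int const n_blocks =
        (n_cells + threads_per_block - 1) / threads_per_block; // == 3
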
diff --git a/src/device/test/gpu_dual_call_test.cu b/src/device/test/gpu_dual_call_test.cu
new file mode 100644
index 0000000000..cf91f9070e
--- /dev/null
+++ b/src/device/test/gpu_dual_call_test.cu
@@ -0,0 +1,130 @@
+//----------------------------------*-C++-*----------------------------------//
+/*!
+ * \file   device/test/gpu_dual_call_test.cu
+ * \author Alex R. Long
+ * \date   Mon Mar 25 2019
+ * \brief  Show how code can be called from GPU and host
+ * \note   Copyright (C) 2019 Triad National Security, LLC.
+ *         All rights reserved.
+ */
+//---------------------------------------------------------------------------//
+
+#include "device/GPU_Device.hh"
+#include "device/test/Dual_Call.hh"
+#include "ds++/Release.hh"
+#include "ds++/ScalarUnitTest.hh"
+
+#include
+#include
+#include
+#include
+
+using std::cout;
+using std::endl;
+using std::string;
+using std::vector;
+using namespace rtt_device;
+using namespace rtt_device_test;
+
+int dual_call_test(rtt_dsxx::ScalarUnitTest &ut) {
+
+  int n_cells = 1029;
+  vector<double> src_cell_bias(n_cells, 1.0);
+  vector<double> e_field(n_cells, 1.0);
+  vector<int> n_field(n_cells, 0);
+  unsigned long long device_n_tot = 0;
+  unsigned long long host_n_tot = 0;
+
+  const double part_per_e = 1.0;
+  const unsigned max_particles_pspc = 100;
+
+  constexpr int threads_per_block = 512;
+  int n_blocks = (n_cells + threads_per_block - 1) / threads_per_block;
+  // setup and copy all fields
+  vector<unsigned long long> n_tot_block(n_blocks, 0);
+  int *D_n_field = NULL;
+  double *D_e_field = NULL;
+  double *D_src_cell_bias = NULL;
+  unsigned long long *D_n_tot = NULL;
+  rtt_device::GPU_Device gpu;
+  cudaError_t err = cudaMalloc((void **)&D_n_field, n_cells * sizeof(int));
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+  err = cudaMalloc((void **)&D_e_field, n_cells * sizeof(double));
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+  err = cudaMalloc((void **)&D_src_cell_bias, n_cells * sizeof(double));
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+  err = cudaMalloc((void **)&D_n_tot, n_blocks * sizeof(unsigned long long));
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+  err = cudaMemcpy(D_n_field, &n_field[0], n_cells * sizeof(int),
+                   cudaMemcpyHostToDevice);
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+  err = cudaMemcpy(D_e_field, &e_field[0], n_cells * sizeof(double),
+                   cudaMemcpyHostToDevice);
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+  err = cudaMemcpy(D_src_cell_bias, &src_cell_bias[0], n_cells * sizeof(double),
+                   cudaMemcpyHostToDevice);
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+
+  dim3 blockSize;
+  dim3 gridSize;
+  blockSize.x = threads_per_block;
+  blockSize.y = 1;
+  blockSize.z = 1;
+  gridSize.x = n_blocks;
+  gridSize.y = 1;
+  gridSize.z = 1;
+  cuda_conserve_calc_num_src_particles<<<gridSize, blockSize>>>(
+      part_per_e, max_particles_pspc, n_cells, D_e_field, D_src_cell_bias,
+      D_n_field, D_n_tot);
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+  cudaDeviceSynchronize();
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+
+  vector<double> e_field_out(n_cells, 0.0);
+  err = cudaMemcpy(&n_field[0], D_n_field, n_cells * sizeof(int),
+                   cudaMemcpyDeviceToHost);
+  std::cout << gpu.getErrorMessage(err) << std::endl;
+
+  err =
+      cudaMemcpy(&n_tot_block[0], D_n_tot,
+                 n_blocks * sizeof(unsigned long long), cudaMemcpyDeviceToHost);
+  std::cout << gpu.getErrorMessage(err) << std::endl;
+
+  cudaFree(D_n_field);
+  cudaFree(D_e_field);
+  cudaFree(D_src_cell_bias);
+  cudaFree(D_n_tot);
+
+  err = cudaDeviceReset();
+  std::cout << gpu.getErrorMessage(err) << std::endl;
+
+  // reduce the n_tot over all thread blocks
+  for (int i = 0; i < n_blocks; ++i)
+    device_n_tot += n_tot_block[i];
+
+  cout << "N total: " << device_n_tot << endl;
+
+  host_n_tot = sub_conserve_calc_num_src_particles(
+      part_per_e, max_particles_pspc, 0, n_cells, &e_field[0],
+      &src_cell_bias[0], &n_field[0]);
+
+  cout << "Host N total: " << host_n_tot << endl;
+  if (host_n_tot != device_n_tot)
+    FAILMSG(string("Host and device totals don't match!"));
+  else
+    PASSMSG("Host and device calls agree on value.");
+
+  return 0;
+}
+
+//----------------------------------------------------------------------------//
+int main(int argc, char *argv[]) {
+  rtt_dsxx::ScalarUnitTest ut(argc, argv, rtt_dsxx::release);
+  try {
+    dual_call_test(ut);
+  }
+  UT_EPILOG(ut);
+}
+
+//---------------------------------------------------------------------------//
+// end of device/test/gpu_dual_call_test.cu
+//---------------------------------------------------------------------------//
diff --git a/src/device/test/gpu_hello_driver_api.cc b/src/device/test/gpu_hello_driver_api.cc
deleted file mode 100644
index 0edc1401d9..0000000000
--- a/src/device/test/gpu_hello_driver_api.cc
+++ /dev/null
@@ -1,333 +0,0 @@
-//----------------------------------*-C++-*----------------------------------//
-/*!
- * \file   device/test/gpu_hello_driver_api.cc
- * \author Kelly (KT) Thompson
- * \date   Thu Oct 25 15:28:48 2011
- * \brief  Simple test of the CUDA Driver API.
- * \note   Copyright (C) 2016-2019 Triad National Security, LLC.
- *         All rights reserved. 
*/ -//---------------------------------------------------------------------------// - -#include "device/GPU_Device.hh" -#include "device/GPU_Module.hh" -#include "device/config.h" -#include "ds++/Assert.hh" -#include "ds++/DracoStrings.hh" -#include "ds++/Release.hh" -#include "ds++/ScalarUnitTest.hh" -#include "ds++/Soft_Equivalence.hh" -#include "ds++/SystemCall.hh" -#include "ds++/path.hh" -#include // RAND_MAX -#include -#include -#include -#include - -//---------------------------------------------------------------------------// -// Helpers -//---------------------------------------------------------------------------// - -void genTestData(std::vector &a, std::vector &b, - std::vector &ref) { - // Initialize the random seed - srand(time(NULL)); - - // Fill arrays - for (size_t i = 0; i < a.size(); ++i) { - a[i] = static_cast(rand() % 1000); - b[i] = static_cast(rand() % 1000); - ref[i] = a[i] + b[i]; - } - - return; -} - -//---------------------------------------------------------------------------// -// query_device -//---------------------------------------------------------------------------// - -void query_device(rtt_dsxx::ScalarUnitTest &ut) { - using namespace std; - - cout << "Starting gpu_hello_driver_api::query_device()...\n" << endl; - - // Create a GPU_Device object. - // Initialize the CUDA library and sets device and context handles. - rtt_device::GPU_Device gpu; - - // Create and then print a summary of the devices found. - std::ostringstream out; - size_t const numDev(gpu.numDevicesAvailable()); - out << "GPU device summary:\n\n" - << " Number of devices found: " << numDev << "\n" - << endl; - for (size_t device = 0; device < numDev; ++device) - gpu.printDeviceSummary(device, out); - - // Print the message to stdout - cout << out.str(); - - // Parse the output - bool verbose(false); - std::map wordCount = - rtt_dsxx::get_word_count(out, verbose); - - if (wordCount[string("Device")] == numDev) - ut.passes("Found a report for each available device."); - else - ut.failure("Did not find a report for each available device."); - - return; -} - -//---------------------------------------------------------------------------// -// Test: simple_add -//---------------------------------------------------------------------------// - -void simple_add(rtt_dsxx::ScalarUnitTest &ut) { - using namespace std; - - cout << "\nStarting gpu_hello_driver_api::simple_add()...\n" << endl; - - // Where are we? - cout << "Paths:" - << "\n Current working dir = " << rtt_dsxx::draco_getcwd() - << "\n GPU kernel files at = " << rtt_device::test_kernel_bindir - << endl; - - // Create a GPU_Device object. - // Initialize the CUDA library and sets device and context handles. - rtt_device::GPU_Device gpu; - - // Load the module, must compile the kernel with nvcc -ptx -m32 kernel.cu - rtt_device::GPU_Module myModule("gpu_kernel.cubin"); - - // Load the kernel from the module - cout << "Load kernel \"sum\" from the module." << endl; - CUfunction kernel; - cudaError_enum err = cuModuleGetFunction(&kernel, myModule.handle(), "sum"); - gpu.checkForCudaError(err); - - // Allocate some memory for the result - cout << "Allocate memory on the device." << endl; - CUdeviceptr dest; - err = cuMemAlloc(&dest, sizeof(int)); - gpu.checkForCudaError(err); - - // Setup kernel parameters - int offset(0); - offset = gpu.align(offset, __alignof(CUdeviceptr)); - - // cuParamSetv is used for pointers... 
- err = cuParamSetv(kernel, offset, &dest, sizeof(CUdeviceptr)); - gpu.checkForCudaError(err); - offset += sizeof(CUdeviceptr); - - offset = gpu.align(offset, __alignof(int)); - err = cuParamSeti(kernel, offset, 4); // cuParamSeti is used for integers. - gpu.checkForCudaError(err); - offset += sizeof(int); - offset = gpu.align(offset, __alignof(int)); - err = cuParamSeti(kernel, offset, 34); - gpu.checkForCudaError(err); - offset += sizeof(int); - err = cuParamSetSize(kernel, offset); - gpu.checkForCudaError(err); - - // Launch the grid - cout << "Launch the grid" << endl; - err = cuFuncSetBlockShape(kernel, 1, 1, 1); - gpu.checkForCudaError(err); - err = cuLaunchGrid(kernel, 1, 1); - gpu.checkForCudaError(err); - - // Read the result off of the GPU - cout << "Read the result" << endl; - int result = 0; - err = cuMemcpyDtoH(&result, dest, sizeof(int)); - gpu.checkForCudaError(err); - - cout << "Sum of 4 and 34 is " << result << endl; - - if (result == 38) - ut.passes("Sum of 4 and 34 is 38."); - else - ut.failure("Sum of 4 and 34 was incorrect."); - - // deallocate memory, free the context. - cout << "deallocate device memory." << endl; - err = cuMemFree(dest); - gpu.checkForCudaError(err); - - return; -} - -//---------------------------------------------------------------------------// -// vector_add -//---------------------------------------------------------------------------// - -void vector_add(rtt_dsxx::ScalarUnitTest &ut) { - using namespace std; - - cout << "\nStarting gpu_hello_driver_api::vector_add()...\n" << endl; - - // Create a GPU_Device object. - // Initialize the CUDA library and sets device and context handles. - rtt_device::GPU_Device gpu; - - // Load the module, must compile the kernel with nvcc -ptx -m32 kernel.cu - rtt_device::GPU_Module myModule("vector_add.cubin"); - - // Host data - size_t len(1024); - size_t const threadsPerBlock(gpu.maxThreadsPerBlock()); - size_t const blocksPerGrid = (len + threadsPerBlock - 1) / threadsPerBlock; - vector aH(len); - vector bH(len); - vector cH(len, 0.0); - vector refH(len); - genTestData(aH, bH, refH); - - // Load the kernel from the module - CUfunction kernel; - cudaError_enum err = - cuModuleGetFunction(&kernel, myModule.handle(), "vector_add"); - gpu.checkForCudaError(err); - - // Allocate some memory for the result - CUdeviceptr d_A, d_B, d_C; - err = cuMemAlloc(&d_A, len * sizeof(double)); - gpu.checkForCudaError(err); - err = cuMemAlloc(&d_B, len * sizeof(double)); - gpu.checkForCudaError(err); - err = cuMemAlloc(&d_C, len * sizeof(double)); - gpu.checkForCudaError(err); - - // Copy host data to device - err = cuMemcpyHtoD(d_A, &aH[0], len * sizeof(double)); - gpu.checkForCudaError(err); - err = cuMemcpyHtoD(d_B, &bH[0], len * sizeof(double)); - gpu.checkForCudaError(err); - - // This is the function signature - void *args[] = {&d_A, &d_B, &d_C, &len}; - - // Execute the kernel - err = cuLaunchKernel(kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, 0, - args, 0); - gpu.checkForCudaError(err); - - // Copy result from device to host - err = cuMemcpyDtoH((void *)(&cH[0]), d_C, len * sizeof(double)); - gpu.checkForCudaError(err); - - // Free device memory - err = cuMemFree(d_A); - gpu.checkForCudaError(err); - err = cuMemFree(d_B); - gpu.checkForCudaError(err); - err = cuMemFree(d_C); - gpu.checkForCudaError(err); - - // Check the result - if (rtt_dsxx::soft_equiv(cH.begin(), cH.end(), refH.begin(), refH.end())) - ut.passes("vector_add worked!"); - else - ut.failure("vector_add failed."); - - return; -} - 
-//---------------------------------------------------------------------------// -// vector_add_using_wrappers -//---------------------------------------------------------------------------// - -void vector_add_using_wrappers(rtt_dsxx::ScalarUnitTest &ut) { - using namespace std; - - cout << "\nStarting gpu_hello_driver_api::vector_add_using_wrappers()...\n" - << endl; - - // Create a GPU_Device object. - // Initialize the CUDA library and sets device and context handles. - rtt_device::GPU_Device gpu; - - // Load the module, must compile the kernel with nvcc -ptx -m32 kernel.cu - rtt_device::GPU_Module myModule("vector_add.cubin"); - - // Host data - size_t len(1024); - size_t const threadsPerBlock(gpu.maxThreadsPerBlock()); - size_t const blocksPerGrid = (len + threadsPerBlock - 1) / threadsPerBlock; - vector aH(len); - vector bH(len); - vector cH(len, 0.0); - vector refH(len); - genTestData(aH, bH, refH); - - // Load the kernel from the module - CUfunction kernel = myModule.getModuleFunction("vector_add"); - - // Allocate some memory for the result - unsigned const nbytes = len * sizeof(double); - CUdeviceptr d_A = gpu.MemAlloc(nbytes); - CUdeviceptr d_B = gpu.MemAlloc(nbytes); - CUdeviceptr d_C = gpu.MemAlloc(nbytes); - - // Copy host data to device - gpu.MemcpyHtoD(d_A, &aH[0], nbytes); - gpu.MemcpyHtoD(d_B, &bH[0], nbytes); - - // This is the function signature - void *args[] = {&d_A, &d_B, &d_C, &len}; - - // Execute the kernel - cudaError_enum err = cuLaunchKernel(kernel, blocksPerGrid, 1, 1, - threadsPerBlock, 1, 1, 0, 0, args, 0); - gpu.checkForCudaError(err); - - // Copy result from device to host - gpu.MemcpyDtoH((void *)(&cH[0]), d_C, nbytes); - - // Free device memory - gpu.MemFree(d_A); - gpu.MemFree(d_B); - gpu.MemFree(d_C); - - // Check the result - if (rtt_dsxx::soft_equiv(cH.begin(), cH.end(), refH.begin(), refH.end())) - ut.passes("vector_add worked!"); - else - ut.failure("vector_add failed."); - - return; -} - -//---------------------------------------------------------------------------// -// Main -//---------------------------------------------------------------------------// - -int main(int argc, char *argv[]) { - using namespace std; - - rtt_dsxx::ScalarUnitTest ut(argc, argv, rtt_dsxx::release); - try { - query_device(ut); - simple_add(ut); - vector_add(ut); - vector_add_using_wrappers(ut); - } catch (exception &err) { - cout << "ERROR: While testing gpu_hello_driver_api, " << err.what() << endl; - ut.numFails++; - } catch (...) { - cout << "ERROR: While testing gpu_hello_driver_api, " - << "An unknown exception was thrown." 
<< endl; - ut.numFails++; - } - return ut.numFails; -} - -//---------------------------------------------------------------------------// -// end of gpu_hello_driver_api.cc -//---------------------------------------------------------------------------// diff --git a/src/device/test/gpu_hello_rt_api.cu b/src/device/test/gpu_hello_rt_api.cu index c373cc554b..81343c8e1f 100644 --- a/src/device/test/gpu_hello_rt_api.cu +++ b/src/device/test/gpu_hello_rt_api.cu @@ -17,7 +17,9 @@ #include // GPU kernels -#include "device/test/vector_add.cu" +#include "device/test/basic_kernels.hh" + +using namespace rtt_device_test; //----------------------------------------------------------------------------// // Tests diff --git a/src/device/test/gpu_hello_rt_api.hh b/src/device/test/gpu_hello_rt_api.hh deleted file mode 100644 index bee51c406a..0000000000 --- a/src/device/test/gpu_hello_rt_api.hh +++ /dev/null @@ -1,43 +0,0 @@ -//----------------------------------*-C++-*----------------------------------// -/*! - * \file device/test/gpu_hello_rt_api.hh - * \author Kelly (KT) Thompson - * \date Thu Oct 25 15:28:48 2011 - * \brief Wrap the cuda_runtime_api.h header while preventing comiler - * warnings about vendor code. - * \note Copyright (C) 2016-2019 Triad National Security, LLC. - * All rights reserved. */ -//---------------------------------------------------------------------------// - -// All this garbage suppresses warnings found in "cuda.h". -// http://wiki.services.openoffice.org/wiki/Writing_warning-free_code#When_all_else_fails -#if defined __GNUC__ -#pragma GCC system_header -// Intel defines __GNUC__ by default -#ifdef __INTEL_COMPILER -#pragma warning push -#endif -#elif defined __SUNPRO_CC -#pragma disable_warn -#elif defined _MSC_VER -#pragma warning(push, 1) -#endif - -#include - -#if defined __GNUC__ -#pragma GCC system_header -#ifdef __INTEL_COMPILER -#pragma warning pop -#endif -#elif defined __SUNPRO_CC -#pragma enable_warn -#elif defined _MSC_VER -#pragma warning(pop) -#endif - -// #define Error(format, args...) (error_and_exit)("%s:%d: " format, __FILE__, __LINE__, ##args) - -//---------------------------------------------------------------------------// -// end of gpu_hello_rt_api.hh -//---------------------------------------------------------------------------// diff --git a/src/device/test/gpu_kernel.cu b/src/device/test/gpu_kernel.cu deleted file mode 100644 index 7793b837a3..0000000000 --- a/src/device/test/gpu_kernel.cu +++ /dev/null @@ -1,20 +0,0 @@ -//----------------------------------*-C++-*----------------------------------// -/*! - * \file device/test/gpu_kernel.cu - * \author Kelly Thompson - * \date - * \brief Small kernel code for testing GPU Device framework. - * \note Copyright (C) 2016-2019 Triad National Security, LLC. - * All rights reserved. */ -//---------------------------------------------------------------------------// - -extern "C" { -__global__ void sum(int *dest, int a, int b) { - // Assuming a single thread, 1x1x1 block, 1x1 grid - *dest = a + b; -} -} - -//---------------------------------------------------------------------------// -// end of gpu_kernel.cu -//---------------------------------------------------------------------------// diff --git a/src/device/test/vector_add.cu b/src/device/test/vector_add.cu deleted file mode 100644 index 477e2e4c4f..0000000000 --- a/src/device/test/vector_add.cu +++ /dev/null @@ -1,22 +0,0 @@ -//-----------------------------------*-C++-*----------------------------------// -/*! 
- * \file device/test/vector_add.cu - * \author Kelly Thompson - * \date - * \brief Small kernel code for testing GPU Device framework. - * \note Copyright (C) 2016-2019 Triad National Security, LLC. - * All rights reserved. */ -//----------------------------------------------------------------------------// - -__global__ void vector_add(double const *A_dev, double const *B_dev, - double *C_dev, int const N) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - /* if(i%512==0) - * printf("index %d\n",i); */ - if (i < N) - C_dev[i] = A_dev[i] + B_dev[i]; -} - -//----------------------------------------------------------------------------// -// end of vector_add.cu -//----------------------------------------------------------------------------//
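For reference, the sum kernel that used to be exercised through cuModuleGetFunction and cuLaunchGrid is reachable with an ordinary runtime-API launch now that basic_kernels.hh declares it. A hedged sketch mirroring the deleted driver-API test (the buffer name is hypothetical):

    #include "device/test/basic_kernels.hh"

    int result = 0;
    int *dest;
    cudaMalloc((void **)&dest, sizeof(int));
    rtt_device_test::sum<<<1, 1>>>(dest, 4, 34); // single thread: *dest = 38
    cudaMemcpy(&result, dest, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dest);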