diff --git a/config/component_macros.cmake b/config/component_macros.cmake
index b5cb179985..0291fd4f49 100644
--- a/config/component_macros.cmake
+++ b/config/component_macros.cmake
@@ -253,6 +253,7 @@ endmacro( add_component_executable )
 #   SOURCES "file1.cc;file2.cc;..."
 #   HEADERS "file1.hh;file2.hh;..."
 #   LIBRARY_NAME_PREFIX "rtt_"
+#   LIBRARY_TYPE "SHARED"
 #   VENDOR_LIST "MPI;GSL"
 #   VENDOR_LIBS "${MPI_CXX_LIBRARIES};${GSL_LIBRARIES}"
 #   VENDOR_INCLUDE_DIRS "${MPI_CXX_INCLUDE_DIR};${GSL_INCLUDE_DIR}"
@@ -279,7 +280,7 @@ macro( add_component_library )
   cmake_parse_arguments(
     acl
     "NOEXPORT"
-    "PREFIX;TARGET;LIBRARY_NAME;LIBRARY_NAME_PREFIX;LINK_LANGUAGE"
+    "PREFIX;TARGET;LIBRARY_NAME;LIBRARY_NAME_PREFIX;LIBRARY_TYPE;LINK_LANGUAGE"
     "HEADERS;SOURCES;TARGET_DEPS;VENDOR_LIST;VENDOR_LIBS;VENDOR_INCLUDE_DIRS"
     ${ARGV} )
@@ -305,6 +306,11 @@ macro( add_component_library )
     endif()
   endif()
 
+  # if a library type was not specified use the default Draco setting
+  if(NOT acl_LIBRARY_TYPE)
+    set( acl_LIBRARY_TYPE ${DRACO_LIBRARY_TYPE})
+  endif()
+
   #
   # Create the library and set the properties
   #
@@ -314,7 +320,7 @@ macro( add_component_library )
   # extract project name, minus leading "Lib_"
   string( REPLACE "Lib_" "" folder_name ${acl_TARGET} )
 
-  add_library( ${acl_TARGET} ${DRACO_LIBRARY_TYPE} ${acl_SOURCES} )
+  add_library( ${acl_TARGET} ${acl_LIBRARY_TYPE} ${acl_SOURCES} )
   # Some properties are set at a global scope in compilerEnv.cmake:
   # - C_STANDARD, C_EXTENSIONS, CXX_STANDARD, CXX_EXTENSIONS,
   #   CXX_STANDARD_REQUIRED, and POSITION_INDEPENDENT_CODE
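With LIBRARY_TYPE exposed as a parsed option, a component can override the project-wide DRACO_LIBRARY_TYPE default on a per-target basis. A minimal sketch of a caller (the target and file names below are illustrative, not part of this change):

    add_component_library(
      TARGET       Lib_example
      TARGET_DEPS  Lib_dsxx
      LIBRARY_NAME example
      LIBRARY_TYPE STATIC
      SOURCES      "example.cc"
      HEADERS      "example.hh" )

Omitting LIBRARY_TYPE preserves the old behavior, since the macro falls back to ${DRACO_LIBRARY_TYPE}.
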
diff --git a/config/query_gpu.cu b/config/query_gpu.cu
new file mode 100644
index 0000000000..34d64d01d3
--- /dev/null
+++ b/config/query_gpu.cu
@@ -0,0 +1,38 @@
+//----------------------------------*-C++-*----------------------------------//
+/*!
+ * \file   config/query_gpu.cu
+ * \author Alex Long
+ * \brief  Small CUDA code that prints the architecture version, used by CMake
+ * \date   Thu Mar 21 15:53:51 2019
+ * \note   Copyright (C) 2019 Triad National Security, LLC.
+ *         All rights reserved. */
+//---------------------------------------------------------------------------//
+
+// NOTE: This code is from
+// wagonhelm.github.io/articles/2018-03/detecting-cuda-capability-with-cmake
+
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  cudaDeviceProp dP;
+  float min_cc = 3.0;
+
+  int rc = cudaGetDeviceProperties(&dP, 0);
+  if (rc != cudaSuccess) {
+    cudaError_t error = cudaGetLastError();
+    printf("CUDA error: %s", cudaGetErrorString(error));
+    return rc; /* Failure */
+  }
+  if ((dP.major + (dP.minor / 10.0)) < min_cc) {
+    printf("Min Compute Capability of %2.1f required:  %d.%d found\n Not "
+           "Building CUDA Code",
+           min_cc, dP.major, dP.minor);
+    return 1; /* Failure */
+  } else {
+    printf("-arch=sm_%d%d", dP.major, dP.minor);
+    return 0; /* Success */
+  }
+}
+//---------------------------------------------------------------------------//
+// end of query_gpu.cu
+//---------------------------------------------------------------------------//
diff --git a/config/vendor_libraries.cmake b/config/vendor_libraries.cmake
index 285ad5d450..60a6af72e1 100644
--- a/config/vendor_libraries.cmake
+++ b/config/vendor_libraries.cmake
@@ -295,36 +295,29 @@ macro( setupCudaEnv )
   add_feature_info( Cuda WITH_CUDA "Build CUDA kernels for GPU compute.")
 
-  if( WITH_CUDA )
+  if( WITH_CUDA AND NOT DEFINED CUDA_DBS_STRING )
     set( CUDA_DBS_STRING "CUDA" CACHE BOOL
       "If CUDA is available, this variable is 'CUDA'")
-    # message("
-    # CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES = ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
-    # CMAKE_CUDA_HOST_COMPILER = ${CMAKE_CUDA_HOST_COMPILER}
-    # CMAKE_GENERATOR_TOOLSET = ${CMAKE_GENERATOR_TOOLSET}
-    # CMAKE_VS_PLATFORM_TOOLSET_CUDA = ${CMAKE_VS_PLATFORM_TOOLSET_CUDA}
-    # CUDA_EXTENSIONS = ${CUDA_EXTENSIONS}
-    # CUDAHOSTCXX = ${CUDAHOSTCXX}
-    # CUDAFLAGS = ${CUDAFLAGS}
-    # CUDACXX = ${CUDACXX}
-    # CUDA_STANDARD = ${CUDA_STANDARD}
-    # CUDA_SEPARABLE_COMPILATION = ${CUDA_SEPARABLE_COMPILATION}
-    # CUDA_RESOLVE_DEVICE_SYMBOLS = ${CUDA_RESOLVE_DEVICE_SYMBOLS}
-    # CUDA_PTX_COMPILATION = ${CUDA_PTX_COMPILATION}
-    # ")
-
-    # $ENV{CUDACXX}
-    # $ENV{CUDAFLAGS}
-    # $ENV{CUDAHOSTCXX}
-
-    # target properties
-    # - CUDA_EXTENSIONS
-    # - CUDA_PTX_COMPILATION
-    # - CUDA_RESOLVE_DEVICE_SYMBOLS
-    # - CUDA_SEPARABLE_COMPILATION
-    # - CUDA_STANDARD
-    # - CUDA_STANDARD_REQUIRED
+    set(OUTPUTFILE ${CMAKE_CURRENT_SOURCE_DIR}/config/cuda_script) # No suffix required
+    set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/config/query_gpu.cu)
+    execute_process(COMMAND nvcc -lcuda ${CUDAFILE} -o ${OUTPUTFILE})
+    execute_process(COMMAND ${OUTPUTFILE}
+      RESULT_VARIABLE CUDA_RETURN_CODE OUTPUT_VARIABLE ARCH)
+
+    if (${CUDA_RETURN_CODE} EQUAL 0)
+      message(STATUS "CUDA Architecture: ${ARCH}")
+      set(CMAKE_CUDA_FLAGS "${ARCH} -g -G" CACHE STRING
+        "CUDA flags" FORCE)
+      set(CMAKE_CUDA_FLAGS_DEBUG "-O0" CACHE STRING
+        "CUDA debug flags" FORCE)
+      set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 --generate-line-info" CACHE STRING
+        "CUDA release with debug information flags" FORCE)
+      set(CMAKE_CUDA_FLAGS_RELEASE "-O2" CACHE STRING
+        "CUDA release flags" FORCE)
+    else()
+      message(WARNING ${ARCH})
+    endif()
   endif()
 
 endmacro()
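The probe above compiles query_gpu.cu with nvcc at configure time, runs it, and captures its stdout (for example, -arch=sm_70 on a Volta-class device) into ARCH, which then seeds CMAKE_CUDA_FLAGS. One possible hardening, not part of this change, would be to skip the probe when nvcc is not on the path, e.g. when configuring on a GPU-less front end; a hedged sketch:

    find_program(NVCC_EXECUTABLE nvcc)
    if(NOT NVCC_EXECUTABLE)
      message(STATUS "nvcc not found; skipping GPU architecture probe")
    endif()

Because CUDA_DBS_STRING is cached and the new guard tests NOT DEFINED, the probe only runs on the first configure; wipe that cache entry to re-detect hardware.
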
diff --git a/src/device/GPU_Device.cc b/src/device/GPU_Device.cc
index bc2c1e82e5..2670a1b1cd 100644
--- a/src/device/GPU_Device.cc
+++ b/src/device/GPU_Device.cc
@@ -9,6 +9,7 @@
 //---------------------------------------------------------------------------//
 
 #include "GPU_Device.hh"
+#include <cuda_runtime_api.h>
 #include <sstream>
 
 namespace rtt_device {
@@ -26,12 +27,9 @@ namespace rtt_device {
  */
 GPU_Device::GPU_Device(void)
     : deviceCount(0), computeCapability(), deviceName() {
-  // Initialize the library
-  cudaError_enum err = cuInit(0); // currently must be 0.
-  checkForCudaError(err);
 
   // Get a device count, determine compute capability
-  err = cuDeviceGetCount(&deviceCount);
+  cudaError_t err = cudaGetDeviceCount(&deviceCount);
   checkForCudaError(err);
   Insist(deviceCount > 0, "No GPU devices found!");
 
@@ -39,130 +37,65 @@ GPU_Device::GPU_Device(void)
   computeCapability.resize(deviceCount);
 
   for (int device = 0; device < deviceCount; device++) {
-    CUdevice cuDevice;
-    err = cuDeviceGet(&cuDevice, device);
+    int cudaDevice;
+    err = cudaSetDevice(device);
+    checkForCudaError(err);
+    err = cudaGetDevice(&cudaDevice);
     checkForCudaError(err);
+    Check(cudaDevice == device);
 
     // Compute capability revision
     int major = 0;
     int minor = 0;
-    cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                         cuDevice);
-    cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                         cuDevice);
+    cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device);
+    cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device);
     computeCapability[device].push_back(major);
     computeCapability[device].push_back(minor);
 
     // Device name
-    char name[200];
-    err = cuDeviceGetName(name, 200, cuDevice);
+    cudaDeviceProp device_properties;
+    err = cudaGetDeviceProperties(&device_properties, device);
     checkForCudaError(err);
-    deviceName.push_back(std::string(name));
+    deviceName.push_back(std::string(device_properties.name));
 
     // Query and archive device properties.
     {
-      int tmp1(0), tmp2(0), tmp3(0);
-      err = cuDeviceGetAttribute(
-          &tmp1, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuDevice);
-      checkForCudaError(err);
-      m_maxthreadsperblock.push_back(tmp1);
-
-      err = cuDeviceGetAttribute(&tmp1, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
-                                 cuDevice);
-      checkForCudaError(err);
-      err = cuDeviceGetAttribute(&tmp2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
-                                 cuDevice);
-      checkForCudaError(err);
-      err = cuDeviceGetAttribute(&tmp3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
-                                 cuDevice);
-      checkForCudaError(err);
-      m_maxthreadsdim.push_back(std::array<int, 3>{tmp1, tmp2, tmp3});
-
-      err = cuDeviceGetAttribute(&tmp1, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
-                                 cuDevice);
-      checkForCudaError(err);
-      err = cuDeviceGetAttribute(&tmp2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
-                                 cuDevice);
-      checkForCudaError(err);
-      err = cuDeviceGetAttribute(&tmp3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
-                                 cuDevice);
-      checkForCudaError(err);
-      m_maxgridsize.push_back(std::array<int, 3>{tmp1, tmp2, tmp3});
-
-      err = cuDeviceGetAttribute(
-          &tmp1, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, cuDevice);
-      checkForCudaError(err);
-      m_sharedmemperblock.push_back(tmp1);
-
-      err = cuDeviceGetAttribute(
-          &tmp1, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, cuDevice);
-      checkForCudaError(err);
-      m_totalconstantmemory.push_back(tmp1);
+      int tmp(0);
+      m_maxthreadsperblock.push_back(device_properties.maxThreadsPerBlock);
+      m_maxthreadsdim.push_back(
+          std::array<int, 3>{device_properties.maxThreadsDim[0],
+                             device_properties.maxThreadsDim[1],
+                             device_properties.maxThreadsDim[2]});
+      m_maxgridsize.push_back(std::array<int, 3>{
+          device_properties.maxGridSize[0], device_properties.maxGridSize[1],
+          device_properties.maxGridSize[2]});
+      m_sharedmemperblock.push_back(device_properties.sharedMemPerBlock);
 
       err =
-          cuDeviceGetAttribute(&tmp1, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
+          cudaDeviceGetAttribute(&tmp, cudaDevAttrTotalConstantMemory, device);
       checkForCudaError(err);
-      
m_simdwidth.push_back(tmp1); + m_totalconstantmemory.push_back(tmp); - err = - cuDeviceGetAttribute(&tmp1, CU_DEVICE_ATTRIBUTE_MAX_PITCH, cuDevice); - checkForCudaError(err); - m_mempitch.push_back(tmp1); - - err = cuDeviceGetAttribute( - &tmp1, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, cuDevice); - checkForCudaError(err); - m_regsperblock.push_back(tmp1); + m_simdwidth.push_back(device_properties.warpSize); + m_mempitch.push_back(device_properties.memPitch); err = - cuDeviceGetAttribute(&tmp1, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cuDevice); + cudaDeviceGetAttribute(&tmp, cudaDevAttrMaxRegistersPerBlock, device); checkForCudaError(err); - m_clockrate.push_back(tmp1); + m_regsperblock.push_back(tmp); - err = cuDeviceGetAttribute(&tmp1, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - cuDevice); + err = cudaDeviceGetAttribute(&tmp, cudaDevAttrClockRate, device); checkForCudaError(err); - m_texturealign.push_back(tmp1); - } - - // CUdevprop_st properties; - // err = cuDeviceGetProperties(&properties, cuDevice); - // checkForCudaError(err); - // deviceProperties.push_back(properties); - } + m_clockrate.push_back(tmp); - // Save the handle and context for each device - device_handle.resize(2); - context.resize(2); - for (int device = 0; device < deviceCount; device++) { - // Only initialize if compute capability >= 2.0 - if (computeCapability[device][0] >= 2) { - // Save the handle for each device - err = cuDeviceGet(&device_handle[device], device); - checkForCudaError(err); - - // Save the handle for each context - err = cuCtxCreate(&context[device], device, device_handle[device]); + err = cudaDeviceGetAttribute(&tmp, cudaDevAttrTextureAlignment, device); checkForCudaError(err); + m_texturealign.push_back(tmp); } } } -/*! - * \brief destructor - * - * Free the device context and unload any modules. - */ -GPU_Device::~GPU_Device() { - // Free reserved contexts: - for (int device = 0; device < deviceCount; device++) { - // Only initialize if compute capability >= 2.0 - if (computeCapability[device][0] >= 2) { - cudaError_enum err = cuCtxDestroy(context[device]); - checkForCudaError(err); - } - } -} +GPU_Device::~GPU_Device() {} //---------------------------------------------------------------------------// // Print a summary of all GPU devices found @@ -205,11 +138,11 @@ void GPU_Device::printDeviceSummary(int const idevice, * For optimized builds with DRACO_DBC_LEVEL=0, this function will be empty * and any decent compiler will optimize this call away. */ -void GPU_Device::checkForCudaError(cudaError_enum const errorCode) { +void GPU_Device::checkForCudaError(cudaError_t const errorCode) { std::ostringstream msg; msg << "A CUDA call returned the error: \"" << getErrorMessage(errorCode) << "\""; - Insist(errorCode == CUDA_SUCCESS, msg.str()); + Insist(errorCode == cudaSuccess, msg.str()); } #else @@ -218,7 +151,7 @@ void GPU_Device::checkForCudaError(cudaError_enum const errorCode) { * \brief Convert a CUDA return enum value into a descriptive string. * \return descriptive string associated with */ -void GPU_Device::checkForCudaError(cudaError_enum const) { /* empty */ +void GPU_Device::checkForCudaError(cudaError_t const) { /* empty */ } #endif @@ -226,144 +159,133 @@ void GPU_Device::checkForCudaError(cudaError_enum const) { /* empty */ /*! * \brief Return a text string that corresponds to a CUDA error enum. 
*/ -std::string GPU_Device::getErrorMessage(cudaError_enum const err) { - std::string message; +std::string GPU_Device::getErrorMessage(cudaError_t const err) { + auto raw_message = cudaGetErrorString(err); + std::string message(raw_message); + + /* switch (err) { - case CUDA_SUCCESS: + case cudaSuccess: message = std::string("No errors."); break; - case CUDA_ERROR_INVALID_VALUE: + case cudaErrorInvalidValue: message = std::string("Invalid value."); break; - case CUDA_ERROR_OUT_OF_MEMORY: + case cudaErrorMemoryAllocation: message = std::string("Out of memory."); break; - case CUDA_ERROR_NOT_INITIALIZED: + case cudaErrorInitializationError: message = std::string("Driver not initialized."); break; - case CUDA_ERROR_DEINITIALIZED: + case cudaErrorDeviceUninitilialized: message = std::string("Driver deinitialized."); break; - case CUDA_ERROR_NO_DEVICE: + case cudaErrorNoDevice: message = std::string("No CUDA-capable device available."); break; - case CUDA_ERROR_INVALID_DEVICE: + case cudaErrorInvalidDevice: message = std::string("Invalid device."); break; - case CUDA_ERROR_INVALID_IMAGE: + case cudaErrorInvalidKernelImage: message = std::string("Invalid kernel image."); break; - case CUDA_ERROR_INVALID_CONTEXT: + case cudaErrorIncompatibleDriverContext: message = std::string("Invalid context."); break; - case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: - message = std::string("Context already current."); - break; - case CUDA_ERROR_MAP_FAILED: + case cudaErrorMapBufferObjectFailed: message = std::string("Map failed."); break; - case CUDA_ERROR_UNMAP_FAILED: + case cudaErrorUnmapBufferObjectFailed: message = std::string("Unmap failed."); break; - case CUDA_ERROR_ARRAY_IS_MAPPED: + case cudaErrorArrayIsMapped: message = std::string("Array is mapped."); break; - case CUDA_ERROR_ALREADY_MAPPED: + case cudaErrorAlreadyMapped: message = std::string("Already mapped."); break; - case CUDA_ERROR_NO_BINARY_FOR_GPU: - message = std::string("No binary for GPU."); - break; - case CUDA_ERROR_ALREADY_ACQUIRED: + case cudaErrorAlreadyAcquired: message = std::string("Already acquired."); break; - case CUDA_ERROR_NOT_MAPPED: + case cudaErrorNotMapped: message = std::string("Not mapped."); break; - case CUDA_ERROR_INVALID_SOURCE: + case cudaErrorInvalidSource: message = std::string("Invalid source."); break; - case CUDA_ERROR_FILE_NOT_FOUND: + case cudaErrorFileNotFound: message = std::string("File not found."); break; - case CUDA_ERROR_INVALID_HANDLE: + case cudaErrorInvalidResourceHandle: message = std::string("Invalid handle."); break; - case CUDA_ERROR_NOT_FOUND: - message = std::string("Not found."); - break; - case CUDA_ERROR_NOT_READY: + case cudaErrorNotReady: message = std::string("CUDA not ready."); break; - case CUDA_ERROR_LAUNCH_FAILED: + case cudaErrorLaunchFailure: message = std::string("Launch failed."); break; - case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: + case cudaErrorLaunchOutOfResources: message = std::string("Launch exceeded resources."); break; - case CUDA_ERROR_LAUNCH_TIMEOUT: + case cudaErrorLaunchTimeout: message = std::string("Launch exceeded timeout."); break; - case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: + case cudaErrorLaunchIncompatibleTexturing: message = std::string("Launch with incompatible texturing."); break; - case CUDA_ERROR_UNKNOWN: + case cudaErrorUnknown: message = std::string("Unknown error. 
"); break; default: - // CUDA_ERROR_PROFILER_DISABLED - // CUDA_ERROR_PROFILER_NOT_INITIALIZED - // CUDA_ERROR_PROFILER_ALREADY_STARTED - // CUDA_ERROR_PROFILER_ALREADY_STOPPED - // CUDA_ERROR_NOT_MAPPED_AS_ARRAY - // CUDA_ERROR_NOT_MAPPED_AS_POINTER - // CUDA_ERROR_ECC_UNCORRECTABLE - // CUDA_ERROR_UNSUPPORTED_LIMIT - // CUDA_ERROR_CONTEXT_ALREADY_IN_USE - // CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND - // CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - // CUDA_ERROR_OPERATING_SYSTEM - // CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED - // CUDA_ERROR_PEER_ACCESS_NOT_ENABLED - // CUDA_ERROR_PEER_MEMORY_ALREADY_REGISTERED - // CUDA_ERROR_PEER_MEMORY_NOT_REGISTERED - // CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE - // CUDA_ERROR_CONTEXT_IS_DESTROYED + // cudaErrorProfilerDisabled + // cudaErrorNotMappedAsArray + // cudaErrorNotMappedAsPointer + // cudaErrorECCUncorrectable + // cudaErrorUnsupportedLimit + // cudaErrorSharedObjectSymbolNotFound + // cudaErrorSharedObjectInitFailed + // cudaErrorOperatingSystem + // cudaErrorPeerAccessAlreadyEnabled + // cudaErrorPeerAccessUnsupported + // cudaErrorPeerAccessNotEnabled + // cudaErrorContextIsDestroyed message = std::string("Unknown error. "); break; } + */ return message; } //---------------------------------------------------------------------------// /*! - * \brief Wrap the cuMemAlloc funtion to include error checking + * \brief Wrap the cudaMemAlloc funtion to include error checking * * \param nbytes number of bytes to allocate (e.g.: len*sizeof(double) ). * \return GPU device pointer to allocated memory. */ -CUdeviceptr GPU_Device::MemAlloc(unsigned const nbytes) { - CUdeviceptr ptr; - cudaError_enum err = cuMemAlloc(&ptr, nbytes); +void *GPU_Device::MemAlloc(unsigned const nbytes) { + void *ptr; + cudaError_t err = cudaMalloc(&ptr, nbytes); checkForCudaError(err); return ptr; } -void GPU_Device::MemcpyHtoD(CUdeviceptr ptr, void const *loc, unsigned nbytes) { - cudaError_enum err = cuMemcpyHtoD(ptr, loc, nbytes); +void GPU_Device::MemcpyHtoD(void *ptr, void const *loc, unsigned nbytes) { + cudaError_t err = cudaMemcpy(ptr, loc, nbytes, cudaMemcpyHostToDevice); checkForCudaError(err); return; } -void GPU_Device::MemcpyDtoH(void *loc, CUdeviceptr ptr, unsigned nbytes) { - cudaError_enum err = cuMemcpyDtoH(loc, ptr, nbytes); +void GPU_Device::MemcpyDtoH(void *loc, void *ptr, unsigned nbytes) { + cudaError_t err = cudaMemcpy(loc, ptr, nbytes, cudaMemcpyDeviceToHost); checkForCudaError(err); return; } -void GPU_Device::MemFree(CUdeviceptr ptr) { - cudaError_enum err = cuMemFree(ptr); +void GPU_Device::MemFree(void *ptr) { + cudaError_t err = cudaFree(ptr); checkForCudaError(err); return; } diff --git a/src/device/GPU_Device.cmake b/src/device/GPU_Device.cmake index 372377c394..db22476f2e 100644 --- a/src/device/GPU_Device.cmake +++ b/src/device/GPU_Device.cmake @@ -15,6 +15,7 @@ set( TEST_KERNEL_BINDIR ${PROJECT_BINARY_DIR}/test CACHE PATH "GPU kernel binary install location" ) +set( CUDA_DEVICE ON) configure_file( config.h.in ${PROJECT_BINARY_DIR}/device/config.h ) # ---------------------------------------------------------------------------- # @@ -23,12 +24,12 @@ configure_file( config.h.in ${PROJECT_BINARY_DIR}/device/config.h ) set( sources GPU_Device.cc - GPU_Module.cc ) + ) set( headers GPU_Device.hh - GPU_Module.hh ${PROJECT_BINARY_DIR}/device/config.h - device_cuda.h ) + device_cuda.h + ) # ---------------------------------------------------------------------------- # # Build package library @@ -38,11 +39,14 @@ add_component_library( TARGET Lib_device TARGET_DEPS 
diff --git a/src/device/GPU_Device.cmake b/src/device/GPU_Device.cmake
index 372377c394..db22476f2e 100644
--- a/src/device/GPU_Device.cmake
+++ b/src/device/GPU_Device.cmake
@@ -15,6 +15,7 @@ set( TEST_KERNEL_BINDIR ${PROJECT_BINARY_DIR}/test CACHE PATH
   "GPU kernel binary install location" )
 
+set( CUDA_DEVICE ON)
 configure_file( config.h.in ${PROJECT_BINARY_DIR}/device/config.h )
 
 # ---------------------------------------------------------------------------- #
@@ -23,12 +24,12 @@ configure_file( config.h.in ${PROJECT_BINARY_DIR}/device/config.h )
 
 set( sources
   GPU_Device.cc
-  GPU_Module.cc )
+  )
 
 set( headers
   GPU_Device.hh
-  GPU_Module.hh
   ${PROJECT_BINARY_DIR}/device/config.h
-  device_cuda.h )
+  device_cuda.h
+  )
 
 # ---------------------------------------------------------------------------- #
 # Build package library
@@ -38,11 +39,14 @@ add_component_library(
   TARGET       Lib_device
   TARGET_DEPS  Lib_dsxx
   LIBRARY_NAME device
+  LIBRARY_TYPE STATIC
   SOURCES      "${sources}"
   HEADERS      "${headers}" )
 target_include_directories( Lib_device
   PUBLIC $
-  PUBLIC $ )
+  PUBLIC $
+  )
+set_property(TARGET Lib_device PROPERTY CUDA_SEPARABLE_COMPILATION ON)
 
 # ---------------------------------------------------------------------------- #
 # Installation instructions
diff --git a/src/device/GPU_Device.hh b/src/device/GPU_Device.hh
index 45189d76a2..f7a76e7d56 100644
--- a/src/device/GPU_Device.hh
+++ b/src/device/GPU_Device.hh
@@ -48,10 +48,7 @@ namespace rtt_device {
  * \endcode
  *
  * \example device/test/gpu_hello_rt_api.cu
- * Test of GPU_Device for CUDA runtime environment.
- *
- * \example device/test/gpu_hello_driver_api.cc
- * Test of GPU_Device for CUDA driver environment.
+ * Test of GPU_Device with the CUDA runtime API.
  */
 
//===========================================================================//
@@ -106,17 +103,6 @@ public:
    * textureAlign bytes do not need an offset applied to texture fetches */
   int textureAlign(int devId = 0) const { return m_texturealign[devId]; }
 
-  //! Return the device handle
-  CUdevice deviceHandle(int idevice) const {
-    Require(idevice < deviceCount);
-    return device_handle[idevice];
-  }
-  //! Return the context handle
-  CUcontext contextHandle(int idevice) const {
-    Require(idevice < deviceCount);
-    return context[idevice];
-  }
-
   // SERVICES
   //! Print a summary of idevice's features to ostream out.
   void printDeviceSummary(int const idevice,
@@ -127,17 +113,17 @@ public:
     return (offset + alignment - 1) & ~(alignment - 1);
   }
   //! Check cuda return code and throw an Insist on error.
-  static std::string getErrorMessage(cudaError_enum const err);
+  static std::string getErrorMessage(cudaError_t const err);
   //! Check the value of the return code for CUDA calls.
-  static void checkForCudaError(cudaError_enum const errorCode);
+  static void checkForCudaError(cudaError_t const errorCode);
   //! Wrap the cuMemAlloc call to include error checking
-  static CUdeviceptr MemAlloc(unsigned const nbytes);
+  static void *MemAlloc(unsigned const nbytes);
   //! Wrap cuMemcpyHtoD() to include error checking.
-  static void MemcpyHtoD(CUdeviceptr ptr, void const *loc, unsigned nbytes);
+  static void MemcpyHtoD(void *ptr, void const *loc, unsigned nbytes);
   //! Wrap cuMemcpyDtoH() to include error checking.
-  static void MemcpyDtoH(void *loc, CUdeviceptr ptr, unsigned nbytes);
+  static void MemcpyDtoH(void *loc, void *ptr, unsigned nbytes);
   //! Wrap cuMemFree() to include error checking.
-  static void MemFree(CUdeviceptr ptr);
+  static void MemFree(void *ptr);
 
 protected:
   // IMPLEMENTATION
@@ -163,16 +149,6 @@ private:
   std::vector<int> m_regsperblock;
   std::vector<int> m_clockrate;
   std::vector<int> m_texturealign;
-
-  //! Device handles (one per device)
-  std::vector<CUdevice> device_handle;
-  /*! Device context (one per handle)
-   *
-   * Current implementation only allows 1 context per GPU. However, the CUDA
-   * Driver API provides for the concept of pushing and poping various contexts
-   * on the GPU.
-   */
-  std::vector<CUcontext> context;
 };
 
 } // end namespace rtt_device
diff --git a/src/device/GPU_Module.cc b/src/device/GPU_Module.cc
deleted file mode 100644
index 2d53250ab6..0000000000
--- a/src/device/GPU_Module.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-//----------------------------------*-C++-*----------------------------------//
-/*!
- * \file   device/GPU_Module.cc
- * \brief
- * \note   Copyright (C) 2016-2019 Triad National Security, LLC.
- *         All rights reserved. 
*/ -//---------------------------------------------------------------------------// - -#include "GPU_Module.hh" -#include "GPU_Device.hh" -#include "device/config.h" -#include "ds++/path.hh" -#include - -namespace rtt_device { - -//---------------------------------------------------------------------------// -// Constructor -//---------------------------------------------------------------------------// -/*! - * \brief Constructor - * \arg myPtxFile - the name of a ptx or cubin file. Ptx files will be - * compiled at runtime into cubins. Prefer the use of cubin to ruduce runtime. - * - * Create a GPU_Module object. - */ -GPU_Module::GPU_Module(std::string const &myPtxFile) - : ptxFile(findPtxFile(myPtxFile)) { - // load the module - cudaError_enum err = cuModuleLoad(&cuModule, ptxFile.c_str()); - rtt_device::GPU_Device::checkForCudaError(err); -} - -//---------------------------------------------------------------------------// -/*! - * \brief destructor - * - * Free the the loaded modules. - */ -GPU_Module::~GPU_Module() { - // Unload the module - cudaError_enum err = cuModuleUnload(cuModule); - rtt_device::GPU_Device::checkForCudaError(err); -} - -//---------------------------------------------------------------------------// -/*! - * \brief findPtxFile - * - * \param myPtxFile filename or path to the ptx file that is to be loaded. - * \return fully qualified path to the disired ptx file - */ -std::string GPU_Module::findPtxFile(std::string const &myPtxFile) { - Require(myPtxFile.length() > 0); - - // Location of GPU ptx files - read from config.h - std::string const testDir(rtt_device::test_kernel_bindir); - // return value - std::string ptxFile; - - // std::cout << "Looking at:\n" - // << myPtxFile << "\n" - // << testDir + std::string("/") + myPtxFile << std::endl; - - // Find the ptx file - if (rtt_dsxx::fileExists(myPtxFile)) - ptxFile = myPtxFile; - else if (rtt_dsxx::fileExists(std::string("../") + myPtxFile)) - ptxFile = std::string("../") + myPtxFile; - else if (rtt_dsxx::fileExists(testDir + std::string("/") + myPtxFile)) - ptxFile = testDir + std::string("/") + myPtxFile; - - Insist(rtt_dsxx::fileExists(ptxFile), - (std::string("Cannot find requested file: ") + myPtxFile).c_str()); - - return ptxFile; -} - -//---------------------------------------------------------------------------// -/*! - * \brief Find a function in the current module and return a handle. - * - * \param functionName the name of the CUDA function difined in the ptx - * module. - * \return a CUfunction handle that points to the requested function. - */ -CUfunction -GPU_Module::getModuleFunction(std::string const &functionName) const { - // Load the kernel from the module - CUfunction kernel; - cudaError_enum err = - cuModuleGetFunction(&kernel, cuModule, functionName.c_str()); - GPU_Device::checkForCudaError(err); - return kernel; -} - -} // end namespace rtt_device - -//---------------------------------------------------------------------------// -// end of GPU_Module.cc -//---------------------------------------------------------------------------// diff --git a/src/device/GPU_Module.hh b/src/device/GPU_Module.hh deleted file mode 100644 index ea03f13614..0000000000 --- a/src/device/GPU_Module.hh +++ /dev/null @@ -1,82 +0,0 @@ -//----------------------------------*-C++-*----------------------------------// -/*! - * \file device/GPU_Module.hh - * \author Kelly (KT) Thompson - * \brief Define class GPU_Module - * \note Copyright (C) 2016-2019 Triad National Security, LLC. - * All rights reserved. 
*/
-//---------------------------------------------------------------------------//
-
-#ifndef device_GPU_Module_hh
-#define device_GPU_Module_hh
-
-#include "device_cuda.h"
-#include <string>
-
-namespace rtt_device {
-
-//===========================================================================//
-/*!
- * \class GPU_Module
- * \brief
- *
- * CUDA modules are dynamically loadable packages of device code and data
- * (akin to dynamic shared libraries)
- *
- * \sa GPU_Module.cc for detailed descriptions.
- *
- * \par Code Sample:
- * \code
- *     cout << "Hello, world." << endl;
- * \endcode
- *
- * \sa device/test/gpu_hello_driver_api.cc
- */
-//===========================================================================//
-
-class GPU_Module {
-public:
-  // NESTED CLASSES AND TYPEDEFS
-
-  // CREATORS
-
-  //! Default constructors.
-  GPU_Module(std::string const &myPtxFile);
-
-  //! Copy constructor (the long doxygen description is in the .cc file).
-  // GPU_Module(const GPU_Module &rhs);
-
-  //! Destructor.
-  ~GPU_Module();
-
-  // MANIPULATORS
-
-  //! Assignment operator for GPU_Module.
-  // GPU_Module& operator=(const GPU_Module &rhs);
-
-  // ACCESSORS
-  CUmodule handle(void) { return cuModule; }
-
-  // SERVICES
-  CUfunction getModuleFunction(std::string const &functionName) const;
-
-  // IMPLEMENTATION
-  static std::string findPtxFile(std::string const &myPtxFile);
-
-private:
-  // NESTED CLASSES AND TYPEDEFS
-
-  // IMPLEMENTATION
-
-  // DATA
-  std::string const ptxFile;
-  CUmodule cuModule;
-};
-
-} // end namespace rtt_device
-
-#endif // device_GPU_Module_hh
-
-//---------------------------------------------------------------------------//
-// end of device/GPU_Module.hh
-//---------------------------------------------------------------------------//
diff --git a/src/device/config.h.in b/src/device/config.h.in
index 198b0fce42..ed494b8868 100644
--- a/src/device/config.h.in
+++ b/src/device/config.h.in
@@ -12,6 +12,12 @@
 #define device_config_h
 
 #include <string>
+#cmakedefine CUDA_DEVICE
+#ifdef CUDA_DEVICE
+  #define HOST_AND_DEVICE_FUNCTION __host__ __device__
+#else
+  #define HOST_AND_DEVICE_FUNCTION
+#endif
 
 namespace rtt_device {
diff --git a/src/device/device_cuda.h b/src/device/device_cuda.h
index 69ab29ee66..e42388c93b 100644
--- a/src/device/device_cuda.h
+++ b/src/device/device_cuda.h
@@ -27,6 +27,7 @@
 #endif
 
 #include <cuda.h>
+#include <cuda_runtime_api.h>
 
 #if defined __GNUC__
 #pragma GCC system_header
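The HOST_AND_DEVICE_FUNCTION macro added to config.h.in lets one function definition serve both compilers: under nvcc with CUDA_DEVICE defined it expands to __host__ __device__, otherwise to nothing. A hedged sketch of a consumer (the function itself is hypothetical, not part of this change):

    #include "device/config.h"

    // Builds as a dual host/device function under nvcc, and as an
    // ordinary host function in a CPU-only configuration.
    HOST_AND_DEVICE_FUNCTION double celsius_to_kelvin(double c) {
      return c + 273.15;
    }
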
diff --git a/src/device/test/Dual_Call.cu b/src/device/test/Dual_Call.cu
new file mode 100644
index 0000000000..8283aac641
--- /dev/null
+++ b/src/device/test/Dual_Call.cu
@@ -0,0 +1,112 @@
+//----------------------------------*-C++-*----------------------------------//
+/*!
+ * \file   device/test/Dual_Call.cu
+ * \author Alex R. Long
+ * \date   Mon Mar 25 2019
+ * \brief  Show how code can be called from GPU and host
+ * \note   Copyright (C) 2019 Triad National Security, LLC.
+ *         All rights reserved.
+ */
+//---------------------------------------------------------------------------//
+
+#include "Dual_Call.hh"
+
+namespace rtt_device_test {
+
+//---------------------------------------------------------------------------//
+/*!
+ * \brief Calculate the number of source particles for a range of cells
+ *
+ * \param[in] part_per_e particles per unit energy for this source
+ * \param[in] max_particles_pspc max particles per species
+ * \param[in] cell_start starting cell index
+ * \param[in] cell_end ending cell index
+ * \param[in] e_field energy in a cell
+ * \param[in] src_cell_bias bias in a cell
+ * \param[in,out] n_field destination for particles in a cell
+ * \return number of particles over this cell range
+ */
+__host__ __device__ unsigned long long sub_conserve_calc_num_src_particles(
+    const double part_per_e, unsigned max_particles_pspc,
+    const size_t cell_start, const size_t cell_end, const double *e_field,
+    const double *src_cell_bias, int *n_field) {
+  unsigned long long ntot = 0;
+
+  ntot = 0;
+
+  // sweep through cells and calculate number of particles per cell
+  for (size_t cell = cell_start; cell < cell_end; cell++) {
+    // if the cell has any energy try to put some particles in it
+    if (e_field[cell] > 0.0) {
+      // get estimate of number of particles per cell to nearest
+      // integer per species, a cell-based bias can be added that simply
+      // multiplies the expected number by a user defined bias; the
+      // energy balance will still be correct because particles will
+      // simply be subtracted from other cells to compensate
+      const double d_num = e_field[cell] * part_per_e * src_cell_bias[cell];
+      //Check(d_num > 0.0);
+      // Check( d_num < static_cast<double>(max_particles_pspc) );
+
+      // We are about to cast d_num back to int. Ensure that the
+      // conversion is valid. If not, set the number of particles to
+      // the ceiling value provided in Source.hh.
+      if (d_num < static_cast<double>(max_particles_pspc - 1)) {
+        n_field[cell] = static_cast<int>(d_num + 0.5);
+
+        // try to get at least one particle per cell per species
+        if (n_field[cell] == 0)
+          n_field[cell] = 1;
+
+      } else {
+        n_field[cell] = max_particles_pspc;
+      }
+
+      // increment particle counter (uint64_t += int)
+      ntot += n_field[cell];
+    } else
+      n_field[cell] = 0;
+  }
+  return ntot;
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * \brief Launch a kernel to calculate the number of source particles
+ *
+ * \param[in] part_per_e particles per unit energy for this source
+ * \param[in] max_particles_pspc max particles per species
+ * \param[in] cont_size size of all fields
+ * \param[in] e_field energy in a cell
+ * \param[in] src_cell_bias bias in a cell
+ * \param[in,out] n_field destination for particles in a cell
+ * \param[in,out] ntot total particles per thread block
+ */
+__global__ void cuda_conserve_calc_num_src_particles(
+    const double part_per_e, unsigned max_particles_pspc, int cont_size,
+    const double *e_field, const double *src_cell_bias, int *n_field,
+    unsigned long long *ntot) {
+
+  __shared__ unsigned long long shared_data[512];
+  size_t cell_start = threadIdx.x + blockIdx.x * blockDim.x;
+  size_t cell_end = cell_start + 1;
+  if (cell_start < cont_size) {
+    shared_data[threadIdx.x] = sub_conserve_calc_num_src_particles(
+        part_per_e, max_particles_pspc, cell_start, cell_end, e_field,
+        src_cell_bias, n_field);
+  } else
+    shared_data[threadIdx.x] = 0;
+  __syncthreads();
+  for (unsigned int s = 1; s < blockDim.x; ++s) {
+    if (s == threadIdx.x)
+      shared_data[0] += shared_data[s];
+    __syncthreads();
+  }
+  __syncthreads();
+  ntot[blockIdx.x] = shared_data[0];
+}
+
+} // namespace rtt_device_test
+
+//---------------------------------------------------------------------------//
+// end of device/test/Dual_Call.cu
+//---------------------------------------------------------------------------//
diff --git a/src/device/test/Dual_Call.hh b/src/device/test/Dual_Call.hh
new file mode 100644
index 0000000000..b2b4b97860
--- /dev/null
+++ b/src/device/test/Dual_Call.hh
@@ -0,0 +1,39 @@
+//----------------------------------*-C++-*----------------------------------//
+/*!
+ * \file   device/test/Dual_Call.hh
+ * \author Alex R. Long
+ * \date   Mon Mar 25 2019
+ * \brief  Show how code can be called from GPU and host
+ * \note   Copyright (C) 2019 Triad National Security, LLC.
+ *         All rights reserved.
+ */
+//---------------------------------------------------------------------------//
+
+#ifndef rtt_device_test_Dual_Call_hh
+#define rtt_device_test_Dual_Call_hh
+
+#include "device/config.h"
+#include
+#include
+#include
+#include
+
+namespace rtt_device_test {
+
+__host__ __device__ unsigned long long sub_conserve_calc_num_src_particles(
+    const double part_per_e, unsigned max_particles_pspc,
+    const size_t cell_start, const size_t cell_end, const double *e_field,
+    const double *src_cell_bias, int *n_field);
+
+__global__ void cuda_conserve_calc_num_src_particles(
+    const double part_per_e, unsigned max_particles_pspc, int cont_size,
+    const double *e_field, const double *src_cell_bias, int *n_field,
+    unsigned long long *ntot);
+
+} // namespace rtt_device_test
+
+#endif // rtt_device_test_Dual_Call_hh
+
+//---------------------------------------------------------------------------//
+// end of device/test/Dual_Call.hh
+//---------------------------------------------------------------------------//
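Because sub_conserve_calc_num_src_particles is compiled __host__ __device__, the identical source-particle count can be produced on the CPU and inside a kernel and then compared, which is exactly what the new test below does. A minimal host-side sketch (the field sizes are hypothetical):

    #include "device/test/Dual_Call.hh"
    #include <vector>

    std::vector<double> e_field(10, 1.0), bias(10, 1.0);
    std::vector<int> n_field(10, 0);
    // host call sweeps the whole cell range [0, 10)
    unsigned long long n_tot =
        rtt_device_test::sub_conserve_calc_num_src_particles(
            1.0, 100, 0, 10, &e_field[0], &bias[0], &n_field[0]);
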
diff --git a/src/device/test/GPU_Device.cmake b/src/device/test/GPU_Device.cmake
index 490ff11f4a..587a60a6fd 100644
--- a/src/device/test/GPU_Device.cmake
+++ b/src/device/test/GPU_Device.cmake
@@ -11,37 +11,42 @@
 set( test_sources
   gpu_hello_rt_api.cu
-  # The low level Driver-API is not really supported by NVidia anymore.
-  # gpu_hello_driver_api.cc
-  )
+  gpu_device_info.cu
+  gpu_dual_call_test.cu
+)
 
-# set( cuda_sources
-#   gpu_kernel.cu
-#   vector_add.cu
-# )
+set(cuda_headers
+  basic_kernels.hh
+  Dual_Call.hh
+)
+
+set(cuda_sources
+  basic_kernels.cu
+  Dual_Call.cu
+)
 
 # ---------------------------------------------------------------------------- #
 # Build Unit tests
 # ---------------------------------------------------------------------------- #
 
 # Stuff cuda code into a test library.
-# add_library( Lib_device_test SHARED ${cuda_sources} )
-
-#add_executable( Ut_gpu_hello_driver_api_exe gpu_hello_driver_api.cc )
-#target_link_libraries( Ut_gpu_hello_driver_api_exe Lib_device )
-
-add_executable( Ut_gpu_hello_rt_api_exe gpu_hello_rt_api.cu )
-target_link_libraries( Ut_gpu_hello_rt_api_exe Lib_dsxx )
-# target_link_libraries( Ut_gpu_hello_rt_api_exe Lib_device_test Lib_dsxx )
-#target_link_libraries( Ut_gpu_hello_rt_api_exe Lib_device )
+add_component_library(
+  TARGET       Lib_device_test
+  TARGET_DEPS  Lib_dsxx Lib_device
+  LIBRARY_NAME device_test
+  LIBRARY_TYPE STATIC
+  SOURCES      "${cuda_sources}"
+  HEADERS      "${cuda_headers}" )
+set_property(TARGET Lib_device_test PROPERTY CUDA_SEPARABLE_COMPILATION ON)
 
 # We need to explicitly state that we need all CUDA files in the particle
 # library to be built with -dc as the member functions could be called by
 # other libraries and executables
-set_target_properties( Ut_gpu_hello_rt_api_exe PROPERTIES
-  CUDA_SEPARABLE_COMPILATION ON )
-target_include_directories( Ut_gpu_hello_rt_api_exe
-  PRIVATE $ )
+set_target_properties( Lib_device_test PROPERTIES
+  CUDA_SEPARABLE_COMPILATION ON )
+
+#target_include_directories( Ut_gpu_hello_rt_api_exe
+#  PRIVATE $ )
 #  $ )
 #  PUBLIC $ )
@@ -54,16 +59,17 @@
 # Register Unit tests
 # ---------------------------------------------------------------------------- #
 
-# set( test_deps
-#   Lib_device
-#   ${CUDA_CUDA_LIBRARY} )
-# add_scalar_tests(
-#   SOURCES "${test_sources}"
-#   DEPS    "${test_deps}"
-#   )
+set( test_deps
+  Lib_device
+  Lib_device_test)
+
+add_scalar_tests(
+  SOURCES "${test_sources}"
+  DEPS    "${test_deps}"
+  )
 
-add_test( NAME device_gpu_hello_rt_api
-  COMMAND $<TARGET_FILE:Ut_gpu_hello_rt_api_exe> )
+#add_test( NAME device_gpu_hello_rt_api
+#  COMMAND $<TARGET_FILE:Ut_gpu_hello_rt_api_exe> )
 set_tests_properties( device_gpu_hello_rt_api PROPERTIES
   PASS_REGULAR_EXPRESSION ".*[Tt]est: PASSED"
   FAIL_REGULAR_EXPRESSION ".*[Tt]est: FAILED" )
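Packing the kernels into a STATIC library with CUDA_SEPARABLE_COMPILATION ON means every test executable links device code that was compiled with -dc. Any future consumer would follow the same pattern; a hedged sketch for a hypothetical executable target:

    add_executable( Ut_example_exe example.cu )  # hypothetical target
    target_link_libraries( Ut_example_exe Lib_device_test Lib_device Lib_dsxx )
    set_target_properties( Ut_example_exe PROPERTIES
      CUDA_SEPARABLE_COMPILATION ON )
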
diff --git a/src/device/test/basic_kernels.cu b/src/device/test/basic_kernels.cu
new file mode 100644
index 0000000000..68fe0966a8
--- /dev/null
+++ b/src/device/test/basic_kernels.cu
@@ -0,0 +1,48 @@
+//----------------------------------*-C++-*----------------------------------//
+/*!
+ * \file   device/test/basic_kernels.cu
+ * \author Kelly Thompson
+ * \date
+ * \brief  Small kernel code for testing GPU Device framework.
+ * \note   Copyright (C) 2016-2019 Triad National Security, LLC.
+ *         All rights reserved. */
+//---------------------------------------------------------------------------//
+
+namespace rtt_device_test {
+
+//---------------------------------------------------------------------------//
+/*!
+ * \brief CUDA kernel for adding two numbers
+ *
+ * \param[in,out] dest location to store sum
+ * \param[in] a value to add
+ * \param[in] b value to add
+ */
+__global__ void sum(int *dest, int a, int b) {
+  // Assuming a single thread, 1x1x1 block, 1x1 grid
+  *dest = a + b;
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * \brief CUDA kernel for adding two vectors
+ *
+ * \param[in] A_dev vector to add
+ * \param[in] B_dev vector to add
+ * \param[in,out] C_dev location to store solution vector
+ * \param[in] N length of vectors
+ */
+__global__ void vector_add(double const *A_dev, double const *B_dev,
+                           double *C_dev, int const N) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  /* if(i%512==0)
+   *   printf("index %d\n",i); */
+  if (i < N)
+    C_dev[i] = A_dev[i] + B_dev[i];
+}
+
+} // namespace rtt_device_test
+
+//---------------------------------------------------------------------------//
+// end of basic_kernels.cu
+//---------------------------------------------------------------------------//
diff --git a/src/device/test/basic_kernels.hh b/src/device/test/basic_kernels.hh
new file mode 100644
index 0000000000..7f6996aad3
--- /dev/null
+++ b/src/device/test/basic_kernels.hh
@@ -0,0 +1,34 @@
+//----------------------------------*-C++-*----------------------------------//
+/*!
+ * \file   device/test/basic_kernels.hh
+ * \author Alex R. Long
+ * \date   Mon Mar 25 2019
+ * \brief  Simple kernels for basic GPU tests
+ * \note   Copyright (C) 2019 Triad National Security, LLC.
+ *         All rights reserved.
+ */
+//---------------------------------------------------------------------------//
+
+#ifndef rtt_device_test_basic_kernels_hh
+#define rtt_device_test_basic_kernels_hh
+
+#include "device/config.h"
+#include
+#include
+#include
+
+namespace rtt_device_test {
+
+__global__ void vector_add(double const *A_dev, double const *B_dev,
+                           double *C_dev, int const N);
+
+__global__ void sum(int *dest, int a, int b);
+
+} // namespace rtt_device_test
+
+#endif // rtt_device_test_basic_kernels_hh
+
+//---------------------------------------------------------------------------//
+// end of device/test/basic_kernels.hh
+//---------------------------------------------------------------------------//
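These kernels are now declared in a header and linked from Lib_device_test rather than textually #include'd as .cu files. A hedged sketch of launching vector_add through the runtime API (sizes and buffer names are illustrative, not from the tests):

    #include "device/test/basic_kernels.hh"

    int const N = 1024, threads = 256;
    int const blocks = (N + threads - 1) / threads;
    double *A_dev, *B_dev, *C_dev; // device buffers, filled via cudaMemcpy
    cudaMalloc((void **)&A_dev, N * sizeof(double));
    cudaMalloc((void **)&B_dev, N * sizeof(double));
    cudaMalloc((void **)&C_dev, N * sizeof(double));
    rtt_device_test::vector_add<<<blocks, threads>>>(A_dev, B_dev, C_dev, N);
    cudaDeviceSynchronize();
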
diff --git a/src/device/test/gpu_device_info.cu b/src/device/test/gpu_device_info.cu
new file mode 100644
index 0000000000..712b0ada6b
--- /dev/null
+++ b/src/device/test/gpu_device_info.cu
@@ -0,0 +1,77 @@
+//----------------------------------*-C++-*----------------------------------//
+/*!
+ * \file   device/test/gpu_device_info.cu
+ * \author Alex Long
+ * \date   Thu Mar 21 15:28:48 2019
+ * \brief  Simple test of the CUDA Runtime API through the GPU_Device object
+ * \note   Copyright (C) 2019 Triad National Security, LLC.
+ *         All rights reserved. */
+//---------------------------------------------------------------------------//
+
+#include "device/GPU_Device.hh"
+#include "device/config.h"
+#include "ds++/Assert.hh"
+#include "ds++/DracoStrings.hh"
+#include "ds++/Release.hh"
+#include "ds++/ScalarUnitTest.hh"
+#include "ds++/Soft_Equivalence.hh"
+#include "ds++/SystemCall.hh"
+#include "ds++/path.hh"
+#include <iostream>
+#include <map>
+#include <sstream>
+
+//---------------------------------------------------------------------------//
+// query_device
+//---------------------------------------------------------------------------//
+
+void query_device(rtt_dsxx::ScalarUnitTest &ut) {
+  using namespace std;
+
+  cout << "Starting gpu_device_info::query_device()...\n" << endl;
+
+  // Create a GPU_Device object.
+  // Initialize the CUDA library and sets device and context handles.
+  rtt_device::GPU_Device gpu;
+
+  // Create and then print a summary of the devices found.
+  std::ostringstream out;
+  size_t const numDev(gpu.numDevicesAvailable());
+  out << "GPU device summary:\n\n"
+      << "   Number of devices found: " << numDev << "\n"
+      << endl;
+  for (size_t device = 0; device < numDev; ++device)
+    gpu.printDeviceSummary(device, out);
+
+  // Print the message to stdout
+  cout << out.str();
+
+  // Parse the output
+  bool verbose(false);
+  std::map<std::string, unsigned> wordCount =
+      rtt_dsxx::get_word_count(out, verbose);
+
+  FAIL_IF_NOT(wordCount[string("Device")] == numDev);
+  // successful test output
+  if (ut.numFails == 0)
+    PASSMSG("gpu_device_info_test query_device test OK.");
+  return;
+}
+
+//---------------------------------------------------------------------------//
+// Main
+//---------------------------------------------------------------------------//
+
+int main(int argc, char *argv[]) {
+  using namespace std;
+
+  rtt_dsxx::ScalarUnitTest ut(argc, argv, rtt_dsxx::release);
+  try {
+    query_device(ut);
+  }
+  UT_EPILOG(ut);
+}
+
+//---------------------------------------------------------------------------//
+// end of gpu_device_info.cu
+//---------------------------------------------------------------------------//
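A quick check of the launch arithmetic used in the next test: with n_cells = 1029 and threads_per_block = 512, the ceiling division (1029 + 511) / 512 gives 3 blocks, so the grid spans 3 * 512 = 1536 threads and the kernel's bounds guard idles the final 507. In code:

    int const n_cells = 1029, threads_per_block = 512;
    int const n_blocks =
        (n_cells + threads_per_block - 1) / threads_per_block; // == 3
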
diff --git a/src/device/test/gpu_dual_call_test.cu b/src/device/test/gpu_dual_call_test.cu
new file mode 100644
index 0000000000..cf91f9070e
--- /dev/null
+++ b/src/device/test/gpu_dual_call_test.cu
@@ -0,0 +1,130 @@
+//----------------------------------*-C++-*----------------------------------//
+/*!
+ * \file   device/test/gpu_dual_call_test.cu
+ * \author Alex R. Long
+ * \date   Mon Mar 25 2019
+ * \brief  Show how code can be called from GPU and host
+ * \note   Copyright (C) 2019 Triad National Security, LLC.
+ *         All rights reserved.
+ */
+//---------------------------------------------------------------------------//
+
+#include "device/GPU_Device.hh"
+#include "device/test/Dual_Call.hh"
+#include "ds++/Release.hh"
+#include "ds++/ScalarUnitTest.hh"
+
+#include
+#include
+#include
+#include
+
+using std::cout;
+using std::endl;
+using std::string;
+using std::vector;
+using namespace rtt_device;
+using namespace rtt_device_test;
+
+int dual_call_test(rtt_dsxx::ScalarUnitTest &ut) {
+
+  int n_cells = 1029;
+  vector<double> src_cell_bias(n_cells, 1.0);
+  vector<double> e_field(n_cells, 1.0);
+  vector<int> n_field(n_cells, 0);
+  unsigned long long device_n_tot = 0;
+  unsigned long long host_n_tot = 0;
+
+  const double part_per_e = 1.0;
+  const unsigned max_particles_pspc = 100;
+
+  constexpr int threads_per_block = 512;
+  int n_blocks = (n_cells + threads_per_block - 1) / threads_per_block;
+  // setup and copy all fields
+  vector<unsigned long long> n_tot_block(n_blocks, 0);
+  int *D_n_field = NULL;
+  double *D_e_field = NULL;
+  double *D_src_cell_bias = NULL;
+  unsigned long long *D_n_tot = NULL;
+  rtt_device::GPU_Device gpu;
+  cudaError_t err = cudaMalloc((void **)&D_n_field, n_cells * sizeof(int));
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+  err = cudaMalloc((void **)&D_e_field, n_cells * sizeof(double));
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+  err = cudaMalloc((void **)&D_src_cell_bias, n_cells * sizeof(double));
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+  err = cudaMalloc((void **)&D_n_tot, n_blocks * sizeof(unsigned long long));
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+  err = cudaMemcpy(D_n_field, &n_field[0], n_cells * sizeof(int),
+                   cudaMemcpyHostToDevice);
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+  err = cudaMemcpy(D_e_field, &e_field[0], n_cells * sizeof(double),
+                   cudaMemcpyHostToDevice);
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+  err = cudaMemcpy(D_src_cell_bias, &src_cell_bias[0], n_cells * sizeof(double),
+                   cudaMemcpyHostToDevice);
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+
+  dim3 blockSize;
+  dim3 gridSize;
+  blockSize.x = threads_per_block;
+  blockSize.y = 1;
+  blockSize.z = 1;
+  gridSize.x = n_blocks;
+  gridSize.y = 1;
+  gridSize.z = 1;
+  cuda_conserve_calc_num_src_particles<<<gridSize, blockSize>>>(
+      part_per_e, max_particles_pspc, n_cells, D_e_field, D_src_cell_bias,
+      D_n_field, D_n_tot);
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+  cudaDeviceSynchronize();
+  std::cout << gpu.getErrorMessage(cudaGetLastError()) << std::endl;
+
+  vector<double> e_field_out(n_cells, 0.0);
+  err = cudaMemcpy(&n_field[0], D_n_field, n_cells * sizeof(int),
+                   cudaMemcpyDeviceToHost);
+  std::cout << gpu.getErrorMessage(err) << std::endl;
+
+  err =
+      cudaMemcpy(&n_tot_block[0], D_n_tot,
+                 n_blocks * sizeof(unsigned long long), cudaMemcpyDeviceToHost);
+  std::cout << gpu.getErrorMessage(err) << std::endl;
+
+  cudaFree(D_n_field);
+  cudaFree(D_e_field);
+  cudaFree(D_src_cell_bias);
+  cudaFree(D_n_tot);
+
+  err = cudaDeviceReset();
+  std::cout << gpu.getErrorMessage(err) << std::endl;
+
+  // reduce the n_tot over all thread blocks
+  for (int i = 0; i < n_blocks; ++i)
+    device_n_tot += n_tot_block[i];
+
+  cout << "N total: " << device_n_tot << endl;
+
+  host_n_tot = sub_conserve_calc_num_src_particles(
+      part_per_e, max_particles_pspc, 0, n_cells, &e_field[0],
+      &src_cell_bias[0], &n_field[0]);
+
+  cout << "Host N total: " << host_n_tot << endl;
+  if (host_n_tot != device_n_tot)
+    FAILMSG(string("Host and device totals don't match!"));
+  else
+    PASSMSG("Host and device calls agree on value.");
+
+  return 0;
+}
+
+//----------------------------------------------------------------------------//
+int main(int argc, char *argv[]) {
+  rtt_dsxx::ScalarUnitTest ut(argc, argv, rtt_dsxx::release);
+  try {
+    dual_call_test(ut);
+  }
+  UT_EPILOG(ut);
+}
+
+//---------------------------------------------------------------------------//
+// end of device/test/gpu_dual_call_test.cu
+//---------------------------------------------------------------------------//
diff --git a/src/device/test/gpu_hello_driver_api.cc b/src/device/test/gpu_hello_driver_api.cc
deleted file mode 100644
index 0edc1401d9..0000000000
--- a/src/device/test/gpu_hello_driver_api.cc
+++ /dev/null
@@ -1,333 +0,0 @@
-//----------------------------------*-C++-*----------------------------------//
-/*!
- * \file   device/test/gpu_hello_driver_api.cc
- * \author Kelly (KT) Thompson
- * \date   Thu Oct 25 15:28:48 2011
- * \brief  Simple test of the CUDA Driver API.
- * \note   Copyright (C) 2016-2019 Triad National Security, LLC.
- *         All rights reserved. 
*/ -//---------------------------------------------------------------------------// - -#include "device/GPU_Device.hh" -#include "device/GPU_Module.hh" -#include "device/config.h" -#include "ds++/Assert.hh" -#include "ds++/DracoStrings.hh" -#include "ds++/Release.hh" -#include "ds++/ScalarUnitTest.hh" -#include "ds++/Soft_Equivalence.hh" -#include "ds++/SystemCall.hh" -#include "ds++/path.hh" -#include // RAND_MAX -#include -#include -#include -#include - -//---------------------------------------------------------------------------// -// Helpers -//---------------------------------------------------------------------------// - -void genTestData(std::vector &a, std::vector &b, - std::vector &ref) { - // Initialize the random seed - srand(time(NULL)); - - // Fill arrays - for (size_t i = 0; i < a.size(); ++i) { - a[i] = static_cast(rand() % 1000); - b[i] = static_cast(rand() % 1000); - ref[i] = a[i] + b[i]; - } - - return; -} - -//---------------------------------------------------------------------------// -// query_device -//---------------------------------------------------------------------------// - -void query_device(rtt_dsxx::ScalarUnitTest &ut) { - using namespace std; - - cout << "Starting gpu_hello_driver_api::query_device()...\n" << endl; - - // Create a GPU_Device object. - // Initialize the CUDA library and sets device and context handles. - rtt_device::GPU_Device gpu; - - // Create and then print a summary of the devices found. - std::ostringstream out; - size_t const numDev(gpu.numDevicesAvailable()); - out << "GPU device summary:\n\n" - << " Number of devices found: " << numDev << "\n" - << endl; - for (size_t device = 0; device < numDev; ++device) - gpu.printDeviceSummary(device, out); - - // Print the message to stdout - cout << out.str(); - - // Parse the output - bool verbose(false); - std::map wordCount = - rtt_dsxx::get_word_count(out, verbose); - - if (wordCount[string("Device")] == numDev) - ut.passes("Found a report for each available device."); - else - ut.failure("Did not find a report for each available device."); - - return; -} - -//---------------------------------------------------------------------------// -// Test: simple_add -//---------------------------------------------------------------------------// - -void simple_add(rtt_dsxx::ScalarUnitTest &ut) { - using namespace std; - - cout << "\nStarting gpu_hello_driver_api::simple_add()...\n" << endl; - - // Where are we? - cout << "Paths:" - << "\n Current working dir = " << rtt_dsxx::draco_getcwd() - << "\n GPU kernel files at = " << rtt_device::test_kernel_bindir - << endl; - - // Create a GPU_Device object. - // Initialize the CUDA library and sets device and context handles. - rtt_device::GPU_Device gpu; - - // Load the module, must compile the kernel with nvcc -ptx -m32 kernel.cu - rtt_device::GPU_Module myModule("gpu_kernel.cubin"); - - // Load the kernel from the module - cout << "Load kernel \"sum\" from the module." << endl; - CUfunction kernel; - cudaError_enum err = cuModuleGetFunction(&kernel, myModule.handle(), "sum"); - gpu.checkForCudaError(err); - - // Allocate some memory for the result - cout << "Allocate memory on the device." << endl; - CUdeviceptr dest; - err = cuMemAlloc(&dest, sizeof(int)); - gpu.checkForCudaError(err); - - // Setup kernel parameters - int offset(0); - offset = gpu.align(offset, __alignof(CUdeviceptr)); - - // cuParamSetv is used for pointers... 
- err = cuParamSetv(kernel, offset, &dest, sizeof(CUdeviceptr)); - gpu.checkForCudaError(err); - offset += sizeof(CUdeviceptr); - - offset = gpu.align(offset, __alignof(int)); - err = cuParamSeti(kernel, offset, 4); // cuParamSeti is used for integers. - gpu.checkForCudaError(err); - offset += sizeof(int); - offset = gpu.align(offset, __alignof(int)); - err = cuParamSeti(kernel, offset, 34); - gpu.checkForCudaError(err); - offset += sizeof(int); - err = cuParamSetSize(kernel, offset); - gpu.checkForCudaError(err); - - // Launch the grid - cout << "Launch the grid" << endl; - err = cuFuncSetBlockShape(kernel, 1, 1, 1); - gpu.checkForCudaError(err); - err = cuLaunchGrid(kernel, 1, 1); - gpu.checkForCudaError(err); - - // Read the result off of the GPU - cout << "Read the result" << endl; - int result = 0; - err = cuMemcpyDtoH(&result, dest, sizeof(int)); - gpu.checkForCudaError(err); - - cout << "Sum of 4 and 34 is " << result << endl; - - if (result == 38) - ut.passes("Sum of 4 and 34 is 38."); - else - ut.failure("Sum of 4 and 34 was incorrect."); - - // deallocate memory, free the context. - cout << "deallocate device memory." << endl; - err = cuMemFree(dest); - gpu.checkForCudaError(err); - - return; -} - -//---------------------------------------------------------------------------// -// vector_add -//---------------------------------------------------------------------------// - -void vector_add(rtt_dsxx::ScalarUnitTest &ut) { - using namespace std; - - cout << "\nStarting gpu_hello_driver_api::vector_add()...\n" << endl; - - // Create a GPU_Device object. - // Initialize the CUDA library and sets device and context handles. - rtt_device::GPU_Device gpu; - - // Load the module, must compile the kernel with nvcc -ptx -m32 kernel.cu - rtt_device::GPU_Module myModule("vector_add.cubin"); - - // Host data - size_t len(1024); - size_t const threadsPerBlock(gpu.maxThreadsPerBlock()); - size_t const blocksPerGrid = (len + threadsPerBlock - 1) / threadsPerBlock; - vector aH(len); - vector bH(len); - vector cH(len, 0.0); - vector refH(len); - genTestData(aH, bH, refH); - - // Load the kernel from the module - CUfunction kernel; - cudaError_enum err = - cuModuleGetFunction(&kernel, myModule.handle(), "vector_add"); - gpu.checkForCudaError(err); - - // Allocate some memory for the result - CUdeviceptr d_A, d_B, d_C; - err = cuMemAlloc(&d_A, len * sizeof(double)); - gpu.checkForCudaError(err); - err = cuMemAlloc(&d_B, len * sizeof(double)); - gpu.checkForCudaError(err); - err = cuMemAlloc(&d_C, len * sizeof(double)); - gpu.checkForCudaError(err); - - // Copy host data to device - err = cuMemcpyHtoD(d_A, &aH[0], len * sizeof(double)); - gpu.checkForCudaError(err); - err = cuMemcpyHtoD(d_B, &bH[0], len * sizeof(double)); - gpu.checkForCudaError(err); - - // This is the function signature - void *args[] = {&d_A, &d_B, &d_C, &len}; - - // Execute the kernel - err = cuLaunchKernel(kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, 0, - args, 0); - gpu.checkForCudaError(err); - - // Copy result from device to host - err = cuMemcpyDtoH((void *)(&cH[0]), d_C, len * sizeof(double)); - gpu.checkForCudaError(err); - - // Free device memory - err = cuMemFree(d_A); - gpu.checkForCudaError(err); - err = cuMemFree(d_B); - gpu.checkForCudaError(err); - err = cuMemFree(d_C); - gpu.checkForCudaError(err); - - // Check the result - if (rtt_dsxx::soft_equiv(cH.begin(), cH.end(), refH.begin(), refH.end())) - ut.passes("vector_add worked!"); - else - ut.failure("vector_add failed."); - - return; -} - 
-//---------------------------------------------------------------------------// -// vector_add_using_wrappers -//---------------------------------------------------------------------------// - -void vector_add_using_wrappers(rtt_dsxx::ScalarUnitTest &ut) { - using namespace std; - - cout << "\nStarting gpu_hello_driver_api::vector_add_using_wrappers()...\n" - << endl; - - // Create a GPU_Device object. - // Initialize the CUDA library and sets device and context handles. - rtt_device::GPU_Device gpu; - - // Load the module, must compile the kernel with nvcc -ptx -m32 kernel.cu - rtt_device::GPU_Module myModule("vector_add.cubin"); - - // Host data - size_t len(1024); - size_t const threadsPerBlock(gpu.maxThreadsPerBlock()); - size_t const blocksPerGrid = (len + threadsPerBlock - 1) / threadsPerBlock; - vector aH(len); - vector bH(len); - vector cH(len, 0.0); - vector refH(len); - genTestData(aH, bH, refH); - - // Load the kernel from the module - CUfunction kernel = myModule.getModuleFunction("vector_add"); - - // Allocate some memory for the result - unsigned const nbytes = len * sizeof(double); - CUdeviceptr d_A = gpu.MemAlloc(nbytes); - CUdeviceptr d_B = gpu.MemAlloc(nbytes); - CUdeviceptr d_C = gpu.MemAlloc(nbytes); - - // Copy host data to device - gpu.MemcpyHtoD(d_A, &aH[0], nbytes); - gpu.MemcpyHtoD(d_B, &bH[0], nbytes); - - // This is the function signature - void *args[] = {&d_A, &d_B, &d_C, &len}; - - // Execute the kernel - cudaError_enum err = cuLaunchKernel(kernel, blocksPerGrid, 1, 1, - threadsPerBlock, 1, 1, 0, 0, args, 0); - gpu.checkForCudaError(err); - - // Copy result from device to host - gpu.MemcpyDtoH((void *)(&cH[0]), d_C, nbytes); - - // Free device memory - gpu.MemFree(d_A); - gpu.MemFree(d_B); - gpu.MemFree(d_C); - - // Check the result - if (rtt_dsxx::soft_equiv(cH.begin(), cH.end(), refH.begin(), refH.end())) - ut.passes("vector_add worked!"); - else - ut.failure("vector_add failed."); - - return; -} - -//---------------------------------------------------------------------------// -// Main -//---------------------------------------------------------------------------// - -int main(int argc, char *argv[]) { - using namespace std; - - rtt_dsxx::ScalarUnitTest ut(argc, argv, rtt_dsxx::release); - try { - query_device(ut); - simple_add(ut); - vector_add(ut); - vector_add_using_wrappers(ut); - } catch (exception &err) { - cout << "ERROR: While testing gpu_hello_driver_api, " << err.what() << endl; - ut.numFails++; - } catch (...) { - cout << "ERROR: While testing gpu_hello_driver_api, " - << "An unknown exception was thrown." 
<< endl; - ut.numFails++; - } - return ut.numFails; -} - -//---------------------------------------------------------------------------// -// end of gpu_hello_driver_api.cc -//---------------------------------------------------------------------------// diff --git a/src/device/test/gpu_hello_rt_api.cu b/src/device/test/gpu_hello_rt_api.cu index c373cc554b..81343c8e1f 100644 --- a/src/device/test/gpu_hello_rt_api.cu +++ b/src/device/test/gpu_hello_rt_api.cu @@ -17,7 +17,9 @@ #include // GPU kernels -#include "device/test/vector_add.cu" +#include "device/test/basic_kernels.hh" + +using namespace rtt_device_test; //----------------------------------------------------------------------------// // Tests diff --git a/src/device/test/gpu_hello_rt_api.hh b/src/device/test/gpu_hello_rt_api.hh deleted file mode 100644 index bee51c406a..0000000000 --- a/src/device/test/gpu_hello_rt_api.hh +++ /dev/null @@ -1,43 +0,0 @@ -//----------------------------------*-C++-*----------------------------------// -/*! - * \file device/test/gpu_hello_rt_api.hh - * \author Kelly (KT) Thompson - * \date Thu Oct 25 15:28:48 2011 - * \brief Wrap the cuda_runtime_api.h header while preventing comiler - * warnings about vendor code. - * \note Copyright (C) 2016-2019 Triad National Security, LLC. - * All rights reserved. */ -//---------------------------------------------------------------------------// - -// All this garbage suppresses warnings found in "cuda.h". -// http://wiki.services.openoffice.org/wiki/Writing_warning-free_code#When_all_else_fails -#if defined __GNUC__ -#pragma GCC system_header -// Intel defines __GNUC__ by default -#ifdef __INTEL_COMPILER -#pragma warning push -#endif -#elif defined __SUNPRO_CC -#pragma disable_warn -#elif defined _MSC_VER -#pragma warning(push, 1) -#endif - -#include - -#if defined __GNUC__ -#pragma GCC system_header -#ifdef __INTEL_COMPILER -#pragma warning pop -#endif -#elif defined __SUNPRO_CC -#pragma enable_warn -#elif defined _MSC_VER -#pragma warning(pop) -#endif - -// #define Error(format, args...) (error_and_exit)("%s:%d: " format, __FILE__, __LINE__, ##args) - -//---------------------------------------------------------------------------// -// end of gpu_hello_rt_api.hh -//---------------------------------------------------------------------------// diff --git a/src/device/test/gpu_kernel.cu b/src/device/test/gpu_kernel.cu deleted file mode 100644 index 7793b837a3..0000000000 --- a/src/device/test/gpu_kernel.cu +++ /dev/null @@ -1,20 +0,0 @@ -//----------------------------------*-C++-*----------------------------------// -/*! - * \file device/test/gpu_kernel.cu - * \author Kelly Thompson - * \date - * \brief Small kernel code for testing GPU Device framework. - * \note Copyright (C) 2016-2019 Triad National Security, LLC. - * All rights reserved. */ -//---------------------------------------------------------------------------// - -extern "C" { -__global__ void sum(int *dest, int a, int b) { - // Assuming a single thread, 1x1x1 block, 1x1 grid - *dest = a + b; -} -} - -//---------------------------------------------------------------------------// -// end of gpu_kernel.cu -//---------------------------------------------------------------------------// diff --git a/src/device/test/vector_add.cu b/src/device/test/vector_add.cu deleted file mode 100644 index 477e2e4c4f..0000000000 --- a/src/device/test/vector_add.cu +++ /dev/null @@ -1,22 +0,0 @@ -//-----------------------------------*-C++-*----------------------------------// -/*! 
- * \file device/test/vector_add.cu - * \author Kelly Thompson - * \date - * \brief Small kernel code for testing GPU Device framework. - * \note Copyright (C) 2016-2019 Triad National Security, LLC. - * All rights reserved. */ -//----------------------------------------------------------------------------// - -__global__ void vector_add(double const *A_dev, double const *B_dev, - double *C_dev, int const N) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - /* if(i%512==0) - * printf("index %d\n",i); */ - if (i < N) - C_dev[i] = A_dev[i] + B_dev[i]; -} - -//----------------------------------------------------------------------------// -// end of vector_add.cu -//----------------------------------------------------------------------------//
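For reference, the sum kernel that used to be exercised through cuModuleGetFunction and cuLaunchGrid is reachable with an ordinary runtime-API launch now that basic_kernels.hh declares it. A hedged sketch mirroring the deleted driver-API test (the buffer name is hypothetical):

    #include "device/test/basic_kernels.hh"

    int result = 0;
    int *dest;
    cudaMalloc((void **)&dest, sizeof(int));
    rtt_device_test::sum<<<1, 1>>>(dest, 4, 34); // single thread: *dest = 38
    cudaMemcpy(&result, dest, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dest);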