From d94bdfd060c8c54379d01c21b8386492f36c9fd1 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Wed, 19 Jan 2022 10:28:10 -0500 Subject: [PATCH] Replace use of custom CUDA bindings with CUDA-Python (#930) This PR removes many of the custom CUDA bindings we wrote in RMM to support calls to the driver/runtime APIs from Python in downstream libraries (cudf, cuml, cugraph). We should now use [CUDA Python](https://github.com/NVIDIA/CUDA-Python) instead. However, the module `rmm._cuda.gpu` is not being removed. It _has_ been converted from an extension module (`.pyx`) to a regular `.py` module. This module contains high-level wrappers around raw CUDA bindings, with some niceties like converting errors to exceptions with the appropriate error message. Reimplementing that functionality in each downstream library would be a bad idea. When CUDA Python rolls its own higher-level API, we can remove the `gpu` module as well. One API change worth mentioning here is to the function `rmm._cuda.gpu.getDeviceAttribute`. Previously, the API accepted a `cudaDeviceAttr`, a type defined as part of RMM's custom CUDA bindings. The API has now changed to accept a `cudaDeviceAttr` defined in CUDA-Python. This requires changes in downstream libraries that use this API. I am marking this PR _non-breaking_ as it does not affect the user-facing API. It does cause breakages in downstream libraries that are currently relying on internal APIs (from the `rmm._cuda` module). Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Bradley Dice (https://github.com/bdice) - https://github.com/jakirkham URL: https://github.com/rapidsai/rmm/pull/930 --- conda/environments/rmm_dev_cuda10.2.yml | 21 - conda/environments/rmm_dev_cuda11.0.yml | 21 - ..._dev_cuda10.1.yml => rmm_dev_cuda11.5.yml} | 3 +- conda/recipes/rmm/meta.yaml | 2 + python/rmm/_cuda/10.1/gpu.pxi | 396 ----------------- python/rmm/_cuda/10.2/gpu.pxi | 400 ----------------- python/rmm/_cuda/11.x/gpu.pxi | 406 ------------------ python/rmm/_cuda/{gpu.pyx => gpu.py} | 111 ++--- python/rmm/_cuda/stream.pxd | 2 +- python/rmm/_cuda/stream.pyx | 2 +- python/rmm/_lib/cuda_stream.pxd | 2 +- python/rmm/_lib/cuda_stream.pyx | 1 + python/rmm/_lib/cuda_stream_view.pxd | 3 +- python/rmm/_lib/device_buffer.pyx | 19 +- python/rmm/_lib/lib.pxd | 20 - python/rmm/_lib/memory_resource.pyx | 6 +- python/setup.py | 40 +- 17 files changed, 57 insertions(+), 1398 deletions(-) delete mode 100644 conda/environments/rmm_dev_cuda10.2.yml delete mode 100644 conda/environments/rmm_dev_cuda11.0.yml rename conda/environments/{rmm_dev_cuda10.1.yml => rmm_dev_cuda11.5.yml} (86%) delete mode 100644 python/rmm/_cuda/10.1/gpu.pxi delete mode 100644 python/rmm/_cuda/10.2/gpu.pxi delete mode 100644 python/rmm/_cuda/11.x/gpu.pxi rename python/rmm/_cuda/{gpu.pyx => gpu.py} (55%) diff --git a/conda/environments/rmm_dev_cuda10.2.yml b/conda/environments/rmm_dev_cuda10.2.yml deleted file mode 100644 index 348418014..000000000 --- a/conda/environments/rmm_dev_cuda10.2.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: rmm_dev -channels: -- rapidsai -- conda-forge -dependencies: -- clang=11.1.0 -- clang-tools=11.1.0 -- cmake>=3.20.1 -- cmake-format=0.6.11 -- flake8=3.8.3 -- black=19.10 -- isort=5.6.4 -- python>=3.7,<3.9 -- numba>=0.49 -- numpy -- cffi>=1.10.0 -- pytest -- cudatoolkit=10.2 -- spdlog>=1.8.5,<1.9 -- cython>=0.29,<0.30 -- gcovr>=5.0 diff --git a/conda/environments/rmm_dev_cuda11.0.yml b/conda/environments/rmm_dev_cuda11.0.yml deleted file mode 100644 index 8fbe42e4b..000000000 --- a/conda/environments/rmm_dev_cuda11.0.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: rmm_dev -channels: -- rapidsai -- conda-forge -dependencies: -- clang=11.1.0 -- clang-tools=11.1.0 -- cmake>=3.20.1 -- cmake-format=0.6.11 -- flake8=3.8.3 -- black=19.10 -- isort=5.6.4 -- python>=3.7,<3.9 -- numba>=0.49 -- numpy -- cffi>=1.10.0 -- pytest -- cudatoolkit=11.0 -- spdlog>=1.8.5,<1.9 -- cython>=0.29,<0.30 -- gcovr>=5.0 diff --git a/conda/environments/rmm_dev_cuda10.1.yml b/conda/environments/rmm_dev_cuda11.5.yml similarity index 86% rename from conda/environments/rmm_dev_cuda10.1.yml rename to conda/environments/rmm_dev_cuda11.5.yml index caccd9541..549aca8d5 100644 --- a/conda/environments/rmm_dev_cuda10.1.yml +++ b/conda/environments/rmm_dev_cuda11.5.yml @@ -15,7 +15,8 @@ dependencies: - numpy - cffi>=1.10.0 - pytest -- cudatoolkit=10.1 +- cudatoolkit=11.5 - spdlog>=1.8.5,<1.9 - cython>=0.29,<0.30 - gcovr>=5.0 +- cuda-python>=11.5,<12.0 diff --git a/conda/recipes/rmm/meta.yaml b/conda/recipes/rmm/meta.yaml index a38da35fd..ce1ecec07 100644 --- a/conda/recipes/rmm/meta.yaml +++ b/conda/recipes/rmm/meta.yaml @@ -30,6 +30,7 @@ requirements: - cython >=0.29,<0.30 - spdlog>=1.8.5,<2.0.0a0 - cudatoolkit {{ cuda_version }}.* + - cuda-python >=11.5,<12.0 run: - python - numba >=0.49 @@ -39,6 +40,7 @@ requirements: {% else %} - {{ pin_compatible('cudatoolkit', upper_bound='11.2', lower_bound='11.0') }} # cudatoolkit >=11.0,<11.2 {% endif %} + - cuda-python >=11.5,<12.0 test: imports: diff --git a/python/rmm/_cuda/10.1/gpu.pxi b/python/rmm/_cuda/10.1/gpu.pxi deleted file mode 100644 index ab6569711..000000000 --- a/python/rmm/_cuda/10.1/gpu.pxi +++ /dev/null @@ -1,396 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# For CUDA 10.1 - -cdef extern from "cuda.h" nogil: - cpdef enum cudaDeviceAttr: - cudaDevAttrMaxThreadsPerBlock = 1 - cudaDevAttrMaxBlockDimX = 2 - cudaDevAttrMaxBlockDimY = 3 - cudaDevAttrMaxBlockDimZ = 4 - cudaDevAttrMaxGridDimX = 5 - cudaDevAttrMaxGridDimY = 6 - cudaDevAttrMaxGridDimZ = 7 - cudaDevAttrMaxSharedMemoryPerBlock = 8 - cudaDevAttrTotalConstantMemory = 9 - cudaDevAttrWarpSize = 10 - cudaDevAttrMaxPitch = 11 - cudaDevAttrMaxRegistersPerBlock = 12 - cudaDevAttrClockRate = 13 - cudaDevAttrTextureAlignment = 14 - cudaDevAttrGpuOverlap = 15 - cudaDevAttrMultiProcessorCount = 16 - cudaDevAttrKernelExecTimeout = 17 - cudaDevAttrIntegrated = 18 - cudaDevAttrCanMapHostMemory = 19 - cudaDevAttrComputeMode = 20 - cudaDevAttrMaxTexture1DWidth = 21 - cudaDevAttrMaxTexture2DWidth = 22 - cudaDevAttrMaxTexture2DHeight = 23 - cudaDevAttrMaxTexture3DWidth = 24 - cudaDevAttrMaxTexture3DHeight = 25 - cudaDevAttrMaxTexture3DDepth = 26 - cudaDevAttrMaxTexture2DLayeredWidth = 27 - cudaDevAttrMaxTexture2DLayeredHeight = 28 - cudaDevAttrMaxTexture2DLayeredLayers = 29 - cudaDevAttrSurfaceAlignment = 30 - cudaDevAttrConcurrentKernels = 31 - cudaDevAttrEccEnabled = 32 - cudaDevAttrPciBusId = 33 - cudaDevAttrPciDeviceId = 34 - cudaDevAttrTccDriver = 35 - cudaDevAttrMemoryClockRate = 36 - cudaDevAttrGlobalMemoryBusWidth = 37 - cudaDevAttrL2CacheSize = 38 - cudaDevAttrMaxThreadsPerMultiProcessor = 39 - cudaDevAttrAsyncEngineCount = 40 - cudaDevAttrUnifiedAddressing = 41 - cudaDevAttrMaxTexture1DLayeredWidth = 42 - cudaDevAttrMaxTexture1DLayeredLayers = 43 - cudaDevAttrMaxTexture2DGatherWidth = 45 - cudaDevAttrMaxTexture2DGatherHeight = 46 - cudaDevAttrMaxTexture3DWidthAlt = 47 - cudaDevAttrMaxTexture3DHeightAlt = 48 - cudaDevAttrMaxTexture3DDepthAlt = 49 - cudaDevAttrPciDomainId = 50 - cudaDevAttrTexturePitchAlignment = 51 - cudaDevAttrMaxTextureCubemapWidth = 52 - cudaDevAttrMaxTextureCubemapLayeredWidth = 53 - cudaDevAttrMaxTextureCubemapLayeredLayers = 54 - cudaDevAttrMaxSurface1DWidth = 55 - cudaDevAttrMaxSurface2DWidth = 56 - cudaDevAttrMaxSurface2DHeight = 57 - cudaDevAttrMaxSurface3DWidth = 58 - cudaDevAttrMaxSurface3DHeight = 59 - cudaDevAttrMaxSurface3DDepth = 60 - cudaDevAttrMaxSurface1DLayeredWidth = 61 - cudaDevAttrMaxSurface1DLayeredLayers = 62 - cudaDevAttrMaxSurface2DLayeredWidth = 63 - cudaDevAttrMaxSurface2DLayeredHeight = 64 - cudaDevAttrMaxSurface2DLayeredLayers = 65 - cudaDevAttrMaxSurfaceCubemapWidth = 66 - cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67 - cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68 - cudaDevAttrMaxTexture1DLinearWidth = 69 - cudaDevAttrMaxTexture2DLinearWidth = 70 - cudaDevAttrMaxTexture2DLinearHeight = 71 - cudaDevAttrMaxTexture2DLinearPitch = 72 - cudaDevAttrMaxTexture2DMipmappedWidth = 73 - cudaDevAttrMaxTexture2DMipmappedHeight = 74 - cudaDevAttrComputeCapabilityMajor = 75 - cudaDevAttrComputeCapabilityMinor = 76 - cudaDevAttrMaxTexture1DMipmappedWidth = 77 - cudaDevAttrStreamPrioritiesSupported = 78 - cudaDevAttrGlobalL1CacheSupported = 79 - cudaDevAttrLocalL1CacheSupported = 80 - cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81 - cudaDevAttrMaxRegistersPerMultiprocessor = 82 - cudaDevAttrManagedMemory = 83 - cudaDevAttrIsMultiGpuBoard = 84 - cudaDevAttrMultiGpuBoardGroupID = 85 - cudaDevAttrHostNativeAtomicSupported = 86 - cudaDevAttrSingleToDoublePrecisionPerfRatio = 87 - cudaDevAttrPageableMemoryAccess = 88 - cudaDevAttrConcurrentManagedAccess = 89 - cudaDevAttrComputePreemptionSupported = 90 - cudaDevAttrCanUseHostPointerForRegisteredMem = 91 - cudaDevAttrReserved92 = 92 - cudaDevAttrReserved93 = 93 - cudaDevAttrReserved94 = 94 - cudaDevAttrCooperativeLaunch = 95 - cudaDevAttrCooperativeMultiDeviceLaunch = 96 - cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 - cudaDevAttrCanFlushRemoteWrites = 98 - cudaDevAttrHostRegisterSupported = 99 - cudaDevAttrPageableMemoryAccessUsesHostPageTables = 100 - cudaDevAttrDirectManagedMemAccessFromHost = 101 - - cpdef enum cudaError: - cudaSuccess = 0 - cudaErrorInvalidValue = 1 - cudaErrorMemoryAllocation = 2 - cudaErrorInitializationError = 3 - cudaErrorCudartUnloading = 4 - cudaErrorProfilerDisabled = 5 - cudaErrorProfilerNotInitialized = 6 - cudaErrorProfilerAlreadyStarted = 7 - cudaErrorProfilerAlreadyStopped = 8 - cudaErrorInvalidConfiguration = 9 - cudaErrorInvalidPitchValue = 12 - cudaErrorInvalidSymbol = 13 - cudaErrorInvalidHostPointer = 16 - cudaErrorInvalidDevicePointer = 17 - cudaErrorInvalidTexture = 18 - cudaErrorInvalidTextureBinding = 19 - cudaErrorInvalidChannelDescriptor = 20 - cudaErrorInvalidMemcpyDirection = 21 - cudaErrorAddressOfConstant = 22 - cudaErrorTextureFetchFailed = 23 - cudaErrorTextureNotBound = 24 - cudaErrorSynchronizationError = 25 - cudaErrorInvalidFilterSetting = 26 - cudaErrorInvalidNormSetting = 27 - cudaErrorMixedDeviceExecution = 28 - cudaErrorNotYetImplemented = 31 - cudaErrorMemoryValueTooLarge = 32 - cudaErrorInsufficientDriver = 35 - cudaErrorInvalidSurface = 37 - cudaErrorDuplicateVariableName = 43 - cudaErrorDuplicateTextureName = 44 - cudaErrorDuplicateSurfaceName = 45 - cudaErrorDevicesUnavailable = 46 - cudaErrorIncompatibleDriverContext = 49 - cudaErrorMissingConfiguration = 52 - cudaErrorPriorLaunchFailure = 53 - cudaErrorLaunchMaxDepthExceeded = 65 - cudaErrorLaunchFileScopedTex = 66 - cudaErrorLaunchFileScopedSurf = 67 - cudaErrorSyncDepthExceeded = 68 - cudaErrorLaunchPendingCountExceeded = 69 - cudaErrorInvalidDeviceFunction = 98 - cudaErrorNoDevice = 100 - cudaErrorInvalidDevice = 101 - cudaErrorStartupFailure = 127 - cudaErrorInvalidKernelImage = 200 - cudaErrorDeviceUninitilialized = 201 - cudaErrorMapBufferObjectFailed = 205 - cudaErrorUnmapBufferObjectFailed = 206 - cudaErrorArrayIsMapped = 207 - cudaErrorAlreadyMapped = 208 - cudaErrorNoKernelImageForDevice = 209 - cudaErrorAlreadyAcquired = 210 - cudaErrorNotMapped = 211 - cudaErrorNotMappedAsArray = 212 - cudaErrorNotMappedAsPointer = 213 - cudaErrorECCUncorrectable = 214 - cudaErrorUnsupportedLimit = 215 - cudaErrorDeviceAlreadyInUse = 216 - cudaErrorPeerAccessUnsupported = 217 - cudaErrorInvalidPtx = 218 - cudaErrorInvalidGraphicsContext = 219 - cudaErrorNvlinkUncorrectable = 220 - cudaErrorJitCompilerNotFound = 221 - cudaErrorInvalidSource = 300 - cudaErrorFileNotFound = 301 - cudaErrorSharedObjectSymbolNotFound = 302 - cudaErrorSharedObjectInitFailed = 303 - cudaErrorOperatingSystem = 304 - cudaErrorInvalidResourceHandle = 400 - cudaErrorIllegalState = 401 - cudaErrorSymbolNotFound = 500 - cudaErrorNotReady = 600 - cudaErrorIllegalAddress = 700 - cudaErrorLaunchOutOfResources = 701 - cudaErrorLaunchTimeout = 702 - cudaErrorLaunchIncompatibleTexturing = 703 - cudaErrorPeerAccessAlreadyEnabled = 704 - cudaErrorPeerAccessNotEnabled = 705 - cudaErrorSetOnActiveProcess = 708 - cudaErrorContextIsDestroyed = 709 - cudaErrorAssert = 710 - cudaErrorTooManyPeers = 711 - cudaErrorHostMemoryAlreadyRegistered = 712 - cudaErrorHostMemoryNotRegistered = 713 - cudaErrorHardwareStackError = 714 - cudaErrorIllegalInstruction = 715 - cudaErrorMisalignedAddress = 716 - cudaErrorInvalidAddressSpace = 717 - cudaErrorInvalidPc = 718 - cudaErrorLaunchFailure = 719 - cudaErrorCooperativeLaunchTooLarge = 720 - cudaErrorNotPermitted = 800 - cudaErrorNotSupported = 801 - cudaErrorSystemNotReady = 802 - cudaErrorSystemDriverMismatch = 803 - cudaErrorCompatNotSupportedOnDevice = 804 - cudaErrorStreamCaptureUnsupported = 900 - cudaErrorStreamCaptureInvalidated = 901 - cudaErrorStreamCaptureMerge = 902 - cudaErrorStreamCaptureUnmatched = 903 - cudaErrorStreamCaptureUnjoined = 904 - cudaErrorStreamCaptureIsolation = 905 - cudaErrorStreamCaptureImplicit = 906 - cudaErrorCapturedEvent = 907 - cudaErrorStreamCaptureWrongThread = 908 - cudaErrorUnknown = 999 - cudaErrorApiFailureBase = 10000 - - ctypedef cudaError cudaError_t - - ctypedef enum CUresult: - CUDA_SUCCESS = 0 - CUDA_ERROR_INVALID_VALUE = 1 - CUDA_ERROR_OUT_OF_MEMORY = 2 - CUDA_ERROR_NOT_INITIALIZED = 3 - CUDA_ERROR_DEINITIALIZED = 4 - CUDA_ERROR_PROFILER_DISABLED = 5 - CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6 - CUDA_ERROR_PROFILER_ALREADY_STARTED = 7 - CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8 - CUDA_ERROR_NO_DEVICE = 100 - CUDA_ERROR_INVALID_DEVICE = 101 - CUDA_ERROR_INVALID_IMAGE = 200 - CUDA_ERROR_INVALID_CONTEXT = 201 - CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202 - CUDA_ERROR_MAP_FAILED = 205 - CUDA_ERROR_UNMAP_FAILED = 206 - CUDA_ERROR_ARRAY_IS_MAPPED = 207 - CUDA_ERROR_ALREADY_MAPPED = 208 - CUDA_ERROR_NO_BINARY_FOR_GPU = 209 - CUDA_ERROR_ALREADY_ACQUIRED = 210 - CUDA_ERROR_NOT_MAPPED = 211 - CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212 - CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213 - CUDA_ERROR_ECC_UNCORRECTABLE = 214 - CUDA_ERROR_UNSUPPORTED_LIMIT = 215 - CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216 - CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217 - CUDA_ERROR_INVALID_PTX = 218 - CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219 - CUDA_ERROR_NVLINK_UNCORRECTABLE = 220 - CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221 - CUDA_ERROR_INVALID_SOURCE = 300 - CUDA_ERROR_FILE_NOT_FOUND = 301 - CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302 - CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303 - CUDA_ERROR_OPERATING_SYSTEM = 304 - CUDA_ERROR_INVALID_HANDLE = 400 - CUDA_ERROR_ILLEGAL_STATE = 401 - CUDA_ERROR_NOT_FOUND = 500 - CUDA_ERROR_NOT_READY = 600 - CUDA_ERROR_ILLEGAL_ADDRESS = 700 - CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701 - CUDA_ERROR_LAUNCH_TIMEOUT = 702 - CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703 - CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704 - CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705 - CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708 - CUDA_ERROR_CONTEXT_IS_DESTROYED = 709 - CUDA_ERROR_ASSERT = 710 - CUDA_ERROR_TOO_MANY_PEERS = 711 - CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712 - CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713 - CUDA_ERROR_HARDWARE_STACK_ERROR = 714 - CUDA_ERROR_ILLEGAL_INSTRUCTION = 715 - CUDA_ERROR_MISALIGNED_ADDRESS = 716 - CUDA_ERROR_INVALID_ADDRESS_SPACE = 717 - CUDA_ERROR_INVALID_PC = 718 - CUDA_ERROR_LAUNCH_FAILED = 719 - CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720 - CUDA_ERROR_NOT_PERMITTED = 800 - CUDA_ERROR_NOT_SUPPORTED = 801 - CUDA_ERROR_SYSTEM_NOT_READY = 802 - CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803 - CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804 - CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900 - CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901 - CUDA_ERROR_STREAM_CAPTURE_MERGE = 902 - CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903 - CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904 - CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905 - CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906 - CUDA_ERROR_CAPTURED_EVENT = 907 - CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908 - CUDA_ERROR_UNKNOWN = 999 - - ctypedef struct CUuuid_st: - char bytes[16] - - ctypedef CUuuid_st cudaUUID_t - - ctypedef struct cudaDeviceProp: - int ECCEnabled - int asyncEngineCount - int canMapHostMemory - int canUseHostPointerForRegisteredMem - int clockRate - int computeMode - int computePreemptionSupported - int concurrentKernels - int concurrentManagedAccess - int cooperativeLaunch - int cooperativeMultiDeviceLaunch - int deviceOverlap - int directManagedMemAccessFromHost - int globalL1CacheSupported - int hostNativeAtomicSupported - int integrated - int isMultiGpuBoard - int kernelExecTimeoutEnabled - int l2CacheSize - int localL1CacheSupported - char luid[8] - unsigned int luidDeviceNodeMask - int major - int managedMemory - int maxGridSize[3] - int maxSurface1D - int maxSurface1DLayered[2] - int maxSurface2D[2] - int maxSurface2DLayered[3] - int maxSurface3D[3] - int maxSurfaceCubemap - int maxSurfaceCubemapLayered[2] - int maxTexture1D - int maxTexture1DLayered[2] - int maxTexture1DLinear - int maxTexture1DMipmap - int maxTexture2D[2] - int maxTexture2DGather[2] - int maxTexture2DLayered[3] - int maxTexture2DLinear[3] - int maxTexture2DMipmap[2] - int maxTexture3D[3] - int maxTexture3DAlt[3] - int maxTextureCubemap - int maxTextureCubemapLayered[2] - int maxThreadsDim[3] - int maxThreadsPerBlock - int maxThreadsPerMultiProcessor - size_t memPitch - int memoryBusWidth - int memoryClockRate - int minor - int multiGpuBoardGroupID - int multiProcessorCount - char name[256] - int pageableMemoryAccess - int pageableMemoryAccessUsesHostPageTables - int pciBusID - int pciDeviceID - int pciDomainID - int regsPerBlock - int regsPerMultiprocessor - size_t sharedMemPerBlock - size_t sharedMemPerBlockOptin - size_t sharedMemPerMultiprocessor - int singleToDoublePrecisionPerfRatio - int streamPrioritiesSupported - size_t surfaceAlignment - int tccDriver - size_t textureAlignment - size_t texturePitchAlignment - size_t totalConstMem - size_t totalGlobalMem - int unifiedAddressing - cudaUUID_t uuid - int warpSize - - CUresult cuDeviceGetName(char* name, int length, int device) - - CUresult cuGetErrorName(CUresult error, const char** pStr) - CUresult cuGetErrorString(CUresult error, const char** pStr) - -cdef extern from "cuda_runtime_api.h" nogil: - - cudaError_t cudaDriverGetVersion(int* driverVersion) - cudaError_t cudaRuntimeGetVersion(int* runtimeVersion) - cudaError_t cudaGetDeviceCount(int* count) - cudaError_t cudaGetDevice(int* device) - cudaError_t cudaDeviceGetAttribute(int* value, - cudaDeviceAttr attr, - int device) - cudaError_t cudaGetDeviceProperties(cudaDeviceProp* prop, int device) - cudaError_t cudaSetDevice(int device) - - const char* cudaGetErrorString(cudaError_t error) - const char* cudaGetErrorName(cudaError_t error) diff --git a/python/rmm/_cuda/10.2/gpu.pxi b/python/rmm/_cuda/10.2/gpu.pxi deleted file mode 100644 index a3a19b642..000000000 --- a/python/rmm/_cuda/10.2/gpu.pxi +++ /dev/null @@ -1,400 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# For CUDA 10.2 - -cdef extern from "cuda.h" nogil: - cpdef enum cudaDeviceAttr: - cudaDevAttrMaxThreadsPerBlock = 1 - cudaDevAttrMaxBlockDimX = 2 - cudaDevAttrMaxBlockDimY = 3 - cudaDevAttrMaxBlockDimZ = 4 - cudaDevAttrMaxGridDimX = 5 - cudaDevAttrMaxGridDimY = 6 - cudaDevAttrMaxGridDimZ = 7 - cudaDevAttrMaxSharedMemoryPerBlock = 8 - cudaDevAttrTotalConstantMemory = 9 - cudaDevAttrWarpSize = 10 - cudaDevAttrMaxPitch = 11 - cudaDevAttrMaxRegistersPerBlock = 12 - cudaDevAttrClockRate = 13 - cudaDevAttrTextureAlignment = 14 - cudaDevAttrGpuOverlap = 15 - cudaDevAttrMultiProcessorCount = 16 - cudaDevAttrKernelExecTimeout = 17 - cudaDevAttrIntegrated = 18 - cudaDevAttrCanMapHostMemory = 19 - cudaDevAttrComputeMode = 20 - cudaDevAttrMaxTexture1DWidth = 21 - cudaDevAttrMaxTexture2DWidth = 22 - cudaDevAttrMaxTexture2DHeight = 23 - cudaDevAttrMaxTexture3DWidth = 24 - cudaDevAttrMaxTexture3DHeight = 25 - cudaDevAttrMaxTexture3DDepth = 26 - cudaDevAttrMaxTexture2DLayeredWidth = 27 - cudaDevAttrMaxTexture2DLayeredHeight = 28 - cudaDevAttrMaxTexture2DLayeredLayers = 29 - cudaDevAttrSurfaceAlignment = 30 - cudaDevAttrConcurrentKernels = 31 - cudaDevAttrEccEnabled = 32 - cudaDevAttrPciBusId = 33 - cudaDevAttrPciDeviceId = 34 - cudaDevAttrTccDriver = 35 - cudaDevAttrMemoryClockRate = 36 - cudaDevAttrGlobalMemoryBusWidth = 37 - cudaDevAttrL2CacheSize = 38 - cudaDevAttrMaxThreadsPerMultiProcessor = 39 - cudaDevAttrAsyncEngineCount = 40 - cudaDevAttrUnifiedAddressing = 41 - cudaDevAttrMaxTexture1DLayeredWidth = 42 - cudaDevAttrMaxTexture1DLayeredLayers = 43 - cudaDevAttrMaxTexture2DGatherWidth = 45 - cudaDevAttrMaxTexture2DGatherHeight = 46 - cudaDevAttrMaxTexture3DWidthAlt = 47 - cudaDevAttrMaxTexture3DHeightAlt = 48 - cudaDevAttrMaxTexture3DDepthAlt = 49 - cudaDevAttrPciDomainId = 50 - cudaDevAttrTexturePitchAlignment = 51 - cudaDevAttrMaxTextureCubemapWidth = 52 - cudaDevAttrMaxTextureCubemapLayeredWidth = 53 - cudaDevAttrMaxTextureCubemapLayeredLayers = 54 - cudaDevAttrMaxSurface1DWidth = 55 - cudaDevAttrMaxSurface2DWidth = 56 - cudaDevAttrMaxSurface2DHeight = 57 - cudaDevAttrMaxSurface3DWidth = 58 - cudaDevAttrMaxSurface3DHeight = 59 - cudaDevAttrMaxSurface3DDepth = 60 - cudaDevAttrMaxSurface1DLayeredWidth = 61 - cudaDevAttrMaxSurface1DLayeredLayers = 62 - cudaDevAttrMaxSurface2DLayeredWidth = 63 - cudaDevAttrMaxSurface2DLayeredHeight = 64 - cudaDevAttrMaxSurface2DLayeredLayers = 65 - cudaDevAttrMaxSurfaceCubemapWidth = 66 - cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67 - cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68 - cudaDevAttrMaxTexture1DLinearWidth = 69 - cudaDevAttrMaxTexture2DLinearWidth = 70 - cudaDevAttrMaxTexture2DLinearHeight = 71 - cudaDevAttrMaxTexture2DLinearPitch = 72 - cudaDevAttrMaxTexture2DMipmappedWidth = 73 - cudaDevAttrMaxTexture2DMipmappedHeight = 74 - cudaDevAttrComputeCapabilityMajor = 75 - cudaDevAttrComputeCapabilityMinor = 76 - cudaDevAttrMaxTexture1DMipmappedWidth = 77 - cudaDevAttrStreamPrioritiesSupported = 78 - cudaDevAttrGlobalL1CacheSupported = 79 - cudaDevAttrLocalL1CacheSupported = 80 - cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81 - cudaDevAttrMaxRegistersPerMultiprocessor = 82 - cudaDevAttrManagedMemory = 83 - cudaDevAttrIsMultiGpuBoard = 84 - cudaDevAttrMultiGpuBoardGroupID = 85 - cudaDevAttrHostNativeAtomicSupported = 86 - cudaDevAttrSingleToDoublePrecisionPerfRatio = 87 - cudaDevAttrPageableMemoryAccess = 88 - cudaDevAttrConcurrentManagedAccess = 89 - cudaDevAttrComputePreemptionSupported = 90 - cudaDevAttrCanUseHostPointerForRegisteredMem = 91 - cudaDevAttrReserved92 = 92 - cudaDevAttrReserved93 = 93 - cudaDevAttrReserved94 = 94 - cudaDevAttrCooperativeLaunch = 95 - cudaDevAttrCooperativeMultiDeviceLaunch = 96 - cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 - cudaDevAttrCanFlushRemoteWrites = 98 - cudaDevAttrHostRegisterSupported = 99 - cudaDevAttrPageableMemoryAccessUsesHostPageTables = 100 - cudaDevAttrDirectManagedMemAccessFromHost = 101 - - cpdef enum cudaError: - cudaSuccess = 0 - cudaErrorInvalidValue = 1 - cudaErrorMemoryAllocation = 2 - cudaErrorInitializationError = 3 - cudaErrorCudartUnloading = 4 - cudaErrorProfilerDisabled = 5 - cudaErrorProfilerNotInitialized = 6 - cudaErrorProfilerAlreadyStarted = 7 - cudaErrorProfilerAlreadyStopped = 8 - cudaErrorInvalidConfiguration = 9 - cudaErrorInvalidPitchValue = 12 - cudaErrorInvalidSymbol = 13 - cudaErrorInvalidHostPointer = 16 - cudaErrorInvalidDevicePointer = 17 - cudaErrorInvalidTexture = 18 - cudaErrorInvalidTextureBinding = 19 - cudaErrorInvalidChannelDescriptor = 20 - cudaErrorInvalidMemcpyDirection = 21 - cudaErrorAddressOfConstant = 22 - cudaErrorTextureFetchFailed = 23 - cudaErrorTextureNotBound = 24 - cudaErrorSynchronizationError = 25 - cudaErrorInvalidFilterSetting = 26 - cudaErrorInvalidNormSetting = 27 - cudaErrorMixedDeviceExecution = 28 - cudaErrorNotYetImplemented = 31 - cudaErrorMemoryValueTooLarge = 32 - cudaErrorInsufficientDriver = 35 - cudaErrorInvalidSurface = 37 - cudaErrorDuplicateVariableName = 43 - cudaErrorDuplicateTextureName = 44 - cudaErrorDuplicateSurfaceName = 45 - cudaErrorDevicesUnavailable = 46 - cudaErrorIncompatibleDriverContext = 49 - cudaErrorMissingConfiguration = 52 - cudaErrorPriorLaunchFailure = 53 - cudaErrorLaunchMaxDepthExceeded = 65 - cudaErrorLaunchFileScopedTex = 66 - cudaErrorLaunchFileScopedSurf = 67 - cudaErrorSyncDepthExceeded = 68 - cudaErrorLaunchPendingCountExceeded = 69 - cudaErrorInvalidDeviceFunction = 98 - cudaErrorNoDevice = 100 - cudaErrorInvalidDevice = 101 - cudaErrorStartupFailure = 127 - cudaErrorInvalidKernelImage = 200 - cudaErrorDeviceUninitialized = 201 - cudaErrorMapBufferObjectFailed = 205 - cudaErrorUnmapBufferObjectFailed = 206 - cudaErrorArrayIsMapped = 207 - cudaErrorAlreadyMapped = 208 - cudaErrorNoKernelImageForDevice = 209 - cudaErrorAlreadyAcquired = 210 - cudaErrorNotMapped = 211 - cudaErrorNotMappedAsArray = 212 - cudaErrorNotMappedAsPointer = 213 - cudaErrorECCUncorrectable = 214 - cudaErrorUnsupportedLimit = 215 - cudaErrorDeviceAlreadyInUse = 216 - cudaErrorPeerAccessUnsupported = 217 - cudaErrorInvalidPtx = 218 - cudaErrorInvalidGraphicsContext = 219 - cudaErrorNvlinkUncorrectable = 220 - cudaErrorJitCompilerNotFound = 221 - cudaErrorInvalidSource = 300 - cudaErrorFileNotFound = 301 - cudaErrorSharedObjectSymbolNotFound = 302 - cudaErrorSharedObjectInitFailed = 303 - cudaErrorOperatingSystem = 304 - cudaErrorInvalidResourceHandle = 400 - cudaErrorIllegalState = 401 - cudaErrorSymbolNotFound = 500 - cudaErrorNotReady = 600 - cudaErrorIllegalAddress = 700 - cudaErrorLaunchOutOfResources = 701 - cudaErrorLaunchTimeout = 702 - cudaErrorLaunchIncompatibleTexturing = 703 - cudaErrorPeerAccessAlreadyEnabled = 704 - cudaErrorPeerAccessNotEnabled = 705 - cudaErrorSetOnActiveProcess = 708 - cudaErrorContextIsDestroyed = 709 - cudaErrorAssert = 710 - cudaErrorTooManyPeers = 711 - cudaErrorHostMemoryAlreadyRegistered = 712 - cudaErrorHostMemoryNotRegistered = 713 - cudaErrorHardwareStackError = 714 - cudaErrorIllegalInstruction = 715 - cudaErrorMisalignedAddress = 716 - cudaErrorInvalidAddressSpace = 717 - cudaErrorInvalidPc = 718 - cudaErrorLaunchFailure = 719 - cudaErrorCooperativeLaunchTooLarge = 720 - cudaErrorNotPermitted = 800 - cudaErrorNotSupported = 801 - cudaErrorSystemNotReady = 802 - cudaErrorSystemDriverMismatch = 803 - cudaErrorCompatNotSupportedOnDevice = 804 - cudaErrorStreamCaptureUnsupported = 900 - cudaErrorStreamCaptureInvalidated = 901 - cudaErrorStreamCaptureMerge = 902 - cudaErrorStreamCaptureUnmatched = 903 - cudaErrorStreamCaptureUnjoined = 904 - cudaErrorStreamCaptureIsolation = 905 - cudaErrorStreamCaptureImplicit = 906 - cudaErrorCapturedEvent = 907 - cudaErrorStreamCaptureWrongThread = 908 - cudaErrorTimeout = 909 - cudaErrorGraphExecUpdateFailure = 910 - cudaErrorUnknown = 999 - cudaErrorApiFailureBase = 10000 - - ctypedef cudaError cudaError_t - - ctypedef enum CUresult: - CUDA_SUCCESS = 0 - CUDA_ERROR_INVALID_VALUE = 1 - CUDA_ERROR_OUT_OF_MEMORY = 2 - CUDA_ERROR_NOT_INITIALIZED = 3 - CUDA_ERROR_DEINITIALIZED = 4 - CUDA_ERROR_PROFILER_DISABLED = 5 - CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6 - CUDA_ERROR_PROFILER_ALREADY_STARTED = 7 - CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8 - CUDA_ERROR_NO_DEVICE = 100 - CUDA_ERROR_INVALID_DEVICE = 101 - CUDA_ERROR_INVALID_IMAGE = 200 - CUDA_ERROR_INVALID_CONTEXT = 201 - CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202 - CUDA_ERROR_MAP_FAILED = 205 - CUDA_ERROR_UNMAP_FAILED = 206 - CUDA_ERROR_ARRAY_IS_MAPPED = 207 - CUDA_ERROR_ALREADY_MAPPED = 208 - CUDA_ERROR_NO_BINARY_FOR_GPU = 209 - CUDA_ERROR_ALREADY_ACQUIRED = 210 - CUDA_ERROR_NOT_MAPPED = 211 - CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212 - CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213 - CUDA_ERROR_ECC_UNCORRECTABLE = 214 - CUDA_ERROR_UNSUPPORTED_LIMIT = 215 - CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216 - CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217 - CUDA_ERROR_INVALID_PTX = 218 - CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219 - CUDA_ERROR_NVLINK_UNCORRECTABLE = 220 - CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221 - CUDA_ERROR_INVALID_SOURCE = 300 - CUDA_ERROR_FILE_NOT_FOUND = 301 - CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302 - CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303 - CUDA_ERROR_OPERATING_SYSTEM = 304 - CUDA_ERROR_INVALID_HANDLE = 400 - CUDA_ERROR_ILLEGAL_STATE = 401 - CUDA_ERROR_NOT_FOUND = 500 - CUDA_ERROR_NOT_READY = 600 - CUDA_ERROR_ILLEGAL_ADDRESS = 700 - CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701 - CUDA_ERROR_LAUNCH_TIMEOUT = 702 - CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703 - CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704 - CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705 - CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708 - CUDA_ERROR_CONTEXT_IS_DESTROYED = 709 - CUDA_ERROR_ASSERT = 710 - CUDA_ERROR_TOO_MANY_PEERS = 711 - CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712 - CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713 - CUDA_ERROR_HARDWARE_STACK_ERROR = 714 - CUDA_ERROR_ILLEGAL_INSTRUCTION = 715 - CUDA_ERROR_MISALIGNED_ADDRESS = 716 - CUDA_ERROR_INVALID_ADDRESS_SPACE = 717 - CUDA_ERROR_INVALID_PC = 718 - CUDA_ERROR_LAUNCH_FAILED = 719 - CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720 - CUDA_ERROR_NOT_PERMITTED = 800 - CUDA_ERROR_NOT_SUPPORTED = 801 - CUDA_ERROR_SYSTEM_NOT_READY = 802 - CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803 - CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804 - CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900 - CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901 - CUDA_ERROR_STREAM_CAPTURE_MERGE = 902 - CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903 - CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904 - CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905 - CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906 - CUDA_ERROR_CAPTURED_EVENT = 907 - CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908 - CUDA_ERROR_TIMEOUT = 909 - CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910 - CUDA_ERROR_UNKNOWN = 999 - - ctypedef struct CUuuid_st: - char bytes[16] - - ctypedef CUuuid_st cudaUUID_t - - ctypedef struct cudaDeviceProp: - int ECCEnabled - int asyncEngineCount - int canMapHostMemory - int canUseHostPointerForRegisteredMem - int clockRate - int computeMode - int computePreemptionSupported - int concurrentKernels - int concurrentManagedAccess - int cooperativeLaunch - int cooperativeMultiDeviceLaunch - int deviceOverlap - int directManagedMemAccessFromHost - int globalL1CacheSupported - int hostNativeAtomicSupported - int integrated - int isMultiGpuBoard - int kernelExecTimeoutEnabled - int l2CacheSize - int localL1CacheSupported - char luid[8] - unsigned int luidDeviceNodeMask - int major - int managedMemory - int maxGridSize[3] - int maxSurface1D - int maxSurface1DLayered[2] - int maxSurface2D[2] - int maxSurface2DLayered[3] - int maxSurface3D[3] - int maxSurfaceCubemap - int maxSurfaceCubemapLayered[2] - int maxTexture1D - int maxTexture1DLayered[2] - int maxTexture1DLinear - int maxTexture1DMipmap - int maxTexture2D[2] - int maxTexture2DGather[2] - int maxTexture2DLayered[3] - int maxTexture2DLinear[3] - int maxTexture2DMipmap[2] - int maxTexture3D[3] - int maxTexture3DAlt[3] - int maxTextureCubemap - int maxTextureCubemapLayered[2] - int maxThreadsDim[3] - int maxThreadsPerBlock - int maxThreadsPerMultiProcessor - size_t memPitch - int memoryBusWidth - int memoryClockRate - int minor - int multiGpuBoardGroupID - int multiProcessorCount - char name[256] - int pageableMemoryAccess - int pageableMemoryAccessUsesHostPageTables - int pciBusID - int pciDeviceID - int pciDomainID - int regsPerBlock - int regsPerMultiprocessor - size_t sharedMemPerBlock - size_t sharedMemPerBlockOptin - size_t sharedMemPerMultiprocessor - int singleToDoublePrecisionPerfRatio - int streamPrioritiesSupported - size_t surfaceAlignment - int tccDriver - size_t textureAlignment - size_t texturePitchAlignment - size_t totalConstMem - size_t totalGlobalMem - int unifiedAddressing - cudaUUID_t uuid - int warpSize - - CUresult cuDeviceGetName(char* name, int length, int device) - - CUresult cuGetErrorName(CUresult error, const char** pStr) - CUresult cuGetErrorString(CUresult error, const char** pStr) - -cdef extern from "cuda_runtime_api.h" nogil: - - cudaError_t cudaDriverGetVersion(int* driverVersion) - cudaError_t cudaRuntimeGetVersion(int* runtimeVersion) - cudaError_t cudaGetDeviceCount(int* count) - cudaError_t cudaGetDevice(int* device) - cudaError_t cudaDeviceGetAttribute(int* value, - cudaDeviceAttr attr, - int device) - cudaError_t cudaGetDeviceProperties(cudaDeviceProp* prop, int device) - cudaError_t cudaSetDevice(int device) - - const char* cudaGetErrorString(cudaError_t error) - const char* cudaGetErrorName(cudaError_t error) diff --git a/python/rmm/_cuda/11.x/gpu.pxi b/python/rmm/_cuda/11.x/gpu.pxi deleted file mode 100644 index 856ff04e9..000000000 --- a/python/rmm/_cuda/11.x/gpu.pxi +++ /dev/null @@ -1,406 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# For CUDA 11.0, 11.1 and 11.2 - -cdef extern from "cuda.h" nogil: - cpdef enum cudaDeviceAttr: - cudaDevAttrMaxThreadsPerBlock = 1 - cudaDevAttrMaxBlockDimX = 2 - cudaDevAttrMaxBlockDimY = 3 - cudaDevAttrMaxBlockDimZ = 4 - cudaDevAttrMaxGridDimX = 5 - cudaDevAttrMaxGridDimY = 6 - cudaDevAttrMaxGridDimZ = 7 - cudaDevAttrMaxSharedMemoryPerBlock = 8 - cudaDevAttrTotalConstantMemory = 9 - cudaDevAttrWarpSize = 10 - cudaDevAttrMaxPitch = 11 - cudaDevAttrMaxRegistersPerBlock = 12 - cudaDevAttrClockRate = 13 - cudaDevAttrTextureAlignment = 14 - cudaDevAttrGpuOverlap = 15 - cudaDevAttrMultiProcessorCount = 16 - cudaDevAttrKernelExecTimeout = 17 - cudaDevAttrIntegrated = 18 - cudaDevAttrCanMapHostMemory = 19 - cudaDevAttrComputeMode = 20 - cudaDevAttrMaxTexture1DWidth = 21 - cudaDevAttrMaxTexture2DWidth = 22 - cudaDevAttrMaxTexture2DHeight = 23 - cudaDevAttrMaxTexture3DWidth = 24 - cudaDevAttrMaxTexture3DHeight = 25 - cudaDevAttrMaxTexture3DDepth = 26 - cudaDevAttrMaxTexture2DLayeredWidth = 27 - cudaDevAttrMaxTexture2DLayeredHeight = 28 - cudaDevAttrMaxTexture2DLayeredLayers = 29 - cudaDevAttrSurfaceAlignment = 30 - cudaDevAttrConcurrentKernels = 31 - cudaDevAttrEccEnabled = 32 - cudaDevAttrPciBusId = 33 - cudaDevAttrPciDeviceId = 34 - cudaDevAttrTccDriver = 35 - cudaDevAttrMemoryClockRate = 36 - cudaDevAttrGlobalMemoryBusWidth = 37 - cudaDevAttrL2CacheSize = 38 - cudaDevAttrMaxThreadsPerMultiProcessor = 39 - cudaDevAttrAsyncEngineCount = 40 - cudaDevAttrUnifiedAddressing = 41 - cudaDevAttrMaxTexture1DLayeredWidth = 42 - cudaDevAttrMaxTexture1DLayeredLayers = 43 - cudaDevAttrMaxTexture2DGatherWidth = 45 - cudaDevAttrMaxTexture2DGatherHeight = 46 - cudaDevAttrMaxTexture3DWidthAlt = 47 - cudaDevAttrMaxTexture3DHeightAlt = 48 - cudaDevAttrMaxTexture3DDepthAlt = 49 - cudaDevAttrPciDomainId = 50 - cudaDevAttrTexturePitchAlignment = 51 - cudaDevAttrMaxTextureCubemapWidth = 52 - cudaDevAttrMaxTextureCubemapLayeredWidth = 53 - cudaDevAttrMaxTextureCubemapLayeredLayers = 54 - cudaDevAttrMaxSurface1DWidth = 55 - cudaDevAttrMaxSurface2DWidth = 56 - cudaDevAttrMaxSurface2DHeight = 57 - cudaDevAttrMaxSurface3DWidth = 58 - cudaDevAttrMaxSurface3DHeight = 59 - cudaDevAttrMaxSurface3DDepth = 60 - cudaDevAttrMaxSurface1DLayeredWidth = 61 - cudaDevAttrMaxSurface1DLayeredLayers = 62 - cudaDevAttrMaxSurface2DLayeredWidth = 63 - cudaDevAttrMaxSurface2DLayeredHeight = 64 - cudaDevAttrMaxSurface2DLayeredLayers = 65 - cudaDevAttrMaxSurfaceCubemapWidth = 66 - cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67 - cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68 - cudaDevAttrMaxTexture1DLinearWidth = 69 - cudaDevAttrMaxTexture2DLinearWidth = 70 - cudaDevAttrMaxTexture2DLinearHeight = 71 - cudaDevAttrMaxTexture2DLinearPitch = 72 - cudaDevAttrMaxTexture2DMipmappedWidth = 73 - cudaDevAttrMaxTexture2DMipmappedHeight = 74 - cudaDevAttrComputeCapabilityMajor = 75 - cudaDevAttrComputeCapabilityMinor = 76 - cudaDevAttrMaxTexture1DMipmappedWidth = 77 - cudaDevAttrStreamPrioritiesSupported = 78 - cudaDevAttrGlobalL1CacheSupported = 79 - cudaDevAttrLocalL1CacheSupported = 80 - cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81 - cudaDevAttrMaxRegistersPerMultiprocessor = 82 - cudaDevAttrManagedMemory = 83 - cudaDevAttrIsMultiGpuBoard = 84 - cudaDevAttrMultiGpuBoardGroupID = 85 - cudaDevAttrHostNativeAtomicSupported = 86 - cudaDevAttrSingleToDoublePrecisionPerfRatio = 87 - cudaDevAttrPageableMemoryAccess = 88 - cudaDevAttrConcurrentManagedAccess = 89 - cudaDevAttrComputePreemptionSupported = 90 - cudaDevAttrCanUseHostPointerForRegisteredMem = 91 - cudaDevAttrReserved92 = 92 - cudaDevAttrReserved93 = 93 - cudaDevAttrReserved94 = 94 - cudaDevAttrCooperativeLaunch = 95 - cudaDevAttrCooperativeMultiDeviceLaunch = 96 - cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 - cudaDevAttrCanFlushRemoteWrites = 98 - cudaDevAttrHostRegisterSupported = 99 - cudaDevAttrPageableMemoryAccessUsesHostPageTables = 100 - cudaDevAttrDirectManagedMemAccessFromHost = 101 - cudaDevAttrMaxBlocksPerMultiprocessor = 106 - cudaDevAttrReservedSharedMemoryPerBlock = 111 - - cpdef enum cudaError: - cudaSuccess = 0 - cudaErrorInvalidValue = 1 - cudaErrorMemoryAllocation = 2 - cudaErrorInitializationError = 3 - cudaErrorCudartUnloading = 4 - cudaErrorProfilerDisabled = 5 - cudaErrorProfilerNotInitialized = 6 - cudaErrorProfilerAlreadyStarted = 7 - cudaErrorProfilerAlreadyStopped = 8 - cudaErrorInvalidConfiguration = 9 - cudaErrorInvalidPitchValue = 12 - cudaErrorInvalidSymbol = 13 - cudaErrorInvalidHostPointer = 16 - cudaErrorInvalidDevicePointer = 17 - cudaErrorInvalidTexture = 18 - cudaErrorInvalidTextureBinding = 19 - cudaErrorInvalidChannelDescriptor = 20 - cudaErrorInvalidMemcpyDirection = 21 - cudaErrorAddressOfConstant = 22 - cudaErrorTextureFetchFailed = 23 - cudaErrorTextureNotBound = 24 - cudaErrorSynchronizationError = 25 - cudaErrorInvalidFilterSetting = 26 - cudaErrorInvalidNormSetting = 27 - cudaErrorMixedDeviceExecution = 28 - cudaErrorNotYetImplemented = 31 - cudaErrorMemoryValueTooLarge = 32 - cudaErrorInsufficientDriver = 35 - cudaErrorInvalidSurface = 37 - cudaErrorDuplicateVariableName = 43 - cudaErrorDuplicateTextureName = 44 - cudaErrorDuplicateSurfaceName = 45 - cudaErrorDevicesUnavailable = 46 - cudaErrorIncompatibleDriverContext = 49 - cudaErrorMissingConfiguration = 52 - cudaErrorPriorLaunchFailure = 53 - cudaErrorLaunchMaxDepthExceeded = 65 - cudaErrorLaunchFileScopedTex = 66 - cudaErrorLaunchFileScopedSurf = 67 - cudaErrorSyncDepthExceeded = 68 - cudaErrorLaunchPendingCountExceeded = 69 - cudaErrorInvalidDeviceFunction = 98 - cudaErrorNoDevice = 100 - cudaErrorInvalidDevice = 101 - cudaErrorStartupFailure = 127 - cudaErrorInvalidKernelImage = 200 - cudaErrorDeviceUninitialized = 201 - cudaErrorMapBufferObjectFailed = 205 - cudaErrorUnmapBufferObjectFailed = 206 - cudaErrorArrayIsMapped = 207 - cudaErrorAlreadyMapped = 208 - cudaErrorNoKernelImageForDevice = 209 - cudaErrorAlreadyAcquired = 210 - cudaErrorNotMapped = 211 - cudaErrorNotMappedAsArray = 212 - cudaErrorNotMappedAsPointer = 213 - cudaErrorECCUncorrectable = 214 - cudaErrorUnsupportedLimit = 215 - cudaErrorDeviceAlreadyInUse = 216 - cudaErrorPeerAccessUnsupported = 217 - cudaErrorInvalidPtx = 218 - cudaErrorInvalidGraphicsContext = 219 - cudaErrorNvlinkUncorrectable = 220 - cudaErrorJitCompilerNotFound = 221 - cudaErrorInvalidSource = 300 - cudaErrorFileNotFound = 301 - cudaErrorSharedObjectSymbolNotFound = 302 - cudaErrorSharedObjectInitFailed = 303 - cudaErrorOperatingSystem = 304 - cudaErrorInvalidResourceHandle = 400 - cudaErrorIllegalState = 401 - cudaErrorSymbolNotFound = 500 - cudaErrorNotReady = 600 - cudaErrorIllegalAddress = 700 - cudaErrorLaunchOutOfResources = 701 - cudaErrorLaunchTimeout = 702 - cudaErrorLaunchIncompatibleTexturing = 703 - cudaErrorPeerAccessAlreadyEnabled = 704 - cudaErrorPeerAccessNotEnabled = 705 - cudaErrorSetOnActiveProcess = 708 - cudaErrorContextIsDestroyed = 709 - cudaErrorAssert = 710 - cudaErrorTooManyPeers = 711 - cudaErrorHostMemoryAlreadyRegistered = 712 - cudaErrorHostMemoryNotRegistered = 713 - cudaErrorHardwareStackError = 714 - cudaErrorIllegalInstruction = 715 - cudaErrorMisalignedAddress = 716 - cudaErrorInvalidAddressSpace = 717 - cudaErrorInvalidPc = 718 - cudaErrorLaunchFailure = 719 - cudaErrorCooperativeLaunchTooLarge = 720 - cudaErrorNotPermitted = 800 - cudaErrorNotSupported = 801 - cudaErrorSystemNotReady = 802 - cudaErrorSystemDriverMismatch = 803 - cudaErrorCompatNotSupportedOnDevice = 804 - cudaErrorStreamCaptureUnsupported = 900 - cudaErrorStreamCaptureInvalidated = 901 - cudaErrorStreamCaptureMerge = 902 - cudaErrorStreamCaptureUnmatched = 903 - cudaErrorStreamCaptureUnjoined = 904 - cudaErrorStreamCaptureIsolation = 905 - cudaErrorStreamCaptureImplicit = 906 - cudaErrorCapturedEvent = 907 - cudaErrorStreamCaptureWrongThread = 908 - cudaErrorTimeout = 909 - cudaErrorGraphExecUpdateFailure = 910 - cudaErrorUnknown = 999 - cudaErrorApiFailureBase = 10000 - - ctypedef cudaError cudaError_t - - ctypedef enum CUresult: - CUDA_SUCCESS = 0 - CUDA_ERROR_INVALID_VALUE = 1 - CUDA_ERROR_OUT_OF_MEMORY = 2 - CUDA_ERROR_NOT_INITIALIZED = 3 - CUDA_ERROR_DEINITIALIZED = 4 - CUDA_ERROR_PROFILER_DISABLED = 5 - CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6 - CUDA_ERROR_PROFILER_ALREADY_STARTED = 7 - CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8 - CUDA_ERROR_NO_DEVICE = 100 - CUDA_ERROR_INVALID_DEVICE = 101 - CUDA_ERROR_INVALID_IMAGE = 200 - CUDA_ERROR_INVALID_CONTEXT = 201 - CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202 - CUDA_ERROR_MAP_FAILED = 205 - CUDA_ERROR_UNMAP_FAILED = 206 - CUDA_ERROR_ARRAY_IS_MAPPED = 207 - CUDA_ERROR_ALREADY_MAPPED = 208 - CUDA_ERROR_NO_BINARY_FOR_GPU = 209 - CUDA_ERROR_ALREADY_ACQUIRED = 210 - CUDA_ERROR_NOT_MAPPED = 211 - CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212 - CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213 - CUDA_ERROR_ECC_UNCORRECTABLE = 214 - CUDA_ERROR_UNSUPPORTED_LIMIT = 215 - CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216 - CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217 - CUDA_ERROR_INVALID_PTX = 218 - CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219 - CUDA_ERROR_NVLINK_UNCORRECTABLE = 220 - CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221 - CUDA_ERROR_INVALID_SOURCE = 300 - CUDA_ERROR_FILE_NOT_FOUND = 301 - CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302 - CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303 - CUDA_ERROR_OPERATING_SYSTEM = 304 - CUDA_ERROR_INVALID_HANDLE = 400 - CUDA_ERROR_ILLEGAL_STATE = 401 - CUDA_ERROR_NOT_FOUND = 500 - CUDA_ERROR_NOT_READY = 600 - CUDA_ERROR_ILLEGAL_ADDRESS = 700 - CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701 - CUDA_ERROR_LAUNCH_TIMEOUT = 702 - CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703 - CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704 - CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705 - CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708 - CUDA_ERROR_CONTEXT_IS_DESTROYED = 709 - CUDA_ERROR_ASSERT = 710 - CUDA_ERROR_TOO_MANY_PEERS = 711 - CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712 - CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713 - CUDA_ERROR_HARDWARE_STACK_ERROR = 714 - CUDA_ERROR_ILLEGAL_INSTRUCTION = 715 - CUDA_ERROR_MISALIGNED_ADDRESS = 716 - CUDA_ERROR_INVALID_ADDRESS_SPACE = 717 - CUDA_ERROR_INVALID_PC = 718 - CUDA_ERROR_LAUNCH_FAILED = 719 - CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720 - CUDA_ERROR_NOT_PERMITTED = 800 - CUDA_ERROR_NOT_SUPPORTED = 801 - CUDA_ERROR_SYSTEM_NOT_READY = 802 - CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803 - CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804 - CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900 - CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901 - CUDA_ERROR_STREAM_CAPTURE_MERGE = 902 - CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903 - CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904 - CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905 - CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906 - CUDA_ERROR_CAPTURED_EVENT = 907 - CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908 - CUDA_ERROR_TIMEOUT = 909 - CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910 - CUDA_ERROR_UNKNOWN = 999 - - ctypedef struct CUuuid_st: - char bytes[16] - - ctypedef CUuuid_st cudaUUID_t - - ctypedef struct cudaDeviceProp: - int ECCEnabled - int asyncEngineCount - int canMapHostMemory - int canUseHostPointerForRegisteredMem - int clockRate - int computeMode - int computePreemptionSupported - int concurrentKernels - int concurrentManagedAccess - int cooperativeLaunch - int cooperativeMultiDeviceLaunch - int deviceOverlap - int directManagedMemAccessFromHost - int globalL1CacheSupported - int hostNativeAtomicSupported - int integrated - int isMultiGpuBoard - int kernelExecTimeoutEnabled - int l2CacheSize - int localL1CacheSupported - char luid[8] - unsigned int luidDeviceNodeMask - int major - int managedMemory - int maxGridSize[3] - int maxSurface1D - int maxSurface1DLayered[2] - int maxSurface2D[2] - int maxSurface2DLayered[3] - int maxSurface3D[3] - int maxSurfaceCubemap - int maxSurfaceCubemapLayered[2] - int maxTexture1D - int maxTexture1DLayered[2] - int maxTexture1DLinear - int maxTexture1DMipmap - int maxTexture2D[2] - int maxTexture2DGather[2] - int maxTexture2DLayered[3] - int maxTexture2DLinear[3] - int maxTexture2DMipmap[2] - int maxTexture3D[3] - int maxTexture3DAlt[3] - int maxTextureCubemap - int maxTextureCubemapLayered[2] - int maxThreadsDim[3] - int maxThreadsPerBlock - int maxThreadsPerMultiProcessor - size_t memPitch - int memoryBusWidth - int memoryClockRate - int minor - int multiGpuBoardGroupID - int multiProcessorCount - char name[256] - int pageableMemoryAccess - int pageableMemoryAccessUsesHostPageTables - int pciBusID - int pciDeviceID - int pciDomainID - int regsPerBlock - int regsPerMultiprocessor - size_t sharedMemPerBlock - size_t sharedMemPerBlockOptin - size_t sharedMemPerMultiprocessor - int singleToDoublePrecisionPerfRatio - int streamPrioritiesSupported - size_t surfaceAlignment - int tccDriver - size_t textureAlignment - size_t texturePitchAlignment - size_t totalConstMem - size_t totalGlobalMem - int unifiedAddressing - cudaUUID_t uuid - int warpSize - int accessPolicyMaxWindowSize - int maxBlocksPerMultiProcessor - int persistingL2CacheMaxSize - size_t reservedSharedMemPerBlock - - CUresult cuDeviceGetName(char* name, int length, int device) - - CUresult cuGetErrorName(CUresult error, const char** pStr) - CUresult cuGetErrorString(CUresult error, const char** pStr) - -cdef extern from "cuda_runtime_api.h" nogil: - - cudaError_t cudaDriverGetVersion(int* driverVersion) - cudaError_t cudaRuntimeGetVersion(int* runtimeVersion) - cudaError_t cudaGetDeviceCount(int* count) - cudaError_t cudaGetDevice(int* device) - cudaError_t cudaDeviceGetAttribute(int* value, - cudaDeviceAttr attr, - int device) - cudaError_t cudaGetDeviceProperties(cudaDeviceProp* prop, int device) - cudaError_t cudaSetDevice(int device) - - const char* cudaGetErrorString(cudaError_t error) - const char* cudaGetErrorName(cudaError_t error) diff --git a/python/rmm/_cuda/gpu.pyx b/python/rmm/_cuda/gpu.py similarity index 55% rename from python/rmm/_cuda/gpu.pyx rename to python/rmm/_cuda/gpu.py index f570df811..a1ce61564 100644 --- a/python/rmm/_cuda/gpu.pyx +++ b/python/rmm/_cuda/gpu.py @@ -1,59 +1,33 @@ # Copyright (c) 2020, NVIDIA CORPORATION. -from rmm._cuda.gpu cimport ( - CUresult, - cudaDeviceAttr, - cudaDeviceGetAttribute, - cudaDeviceProp, - cudaDriverGetVersion, - cudaError, - cudaError_t, - cudaGetDeviceCount, - cudaGetDeviceProperties, - cudaGetErrorName, - cudaGetErrorString, - cudaRuntimeGetVersion, - cuDeviceGetName, - cuGetErrorName, - cuGetErrorString, -) - -from enum import IntEnum +from cuda import cuda, cudart class CUDARuntimeError(RuntimeError): - - def __init__(self, cudaError_t status): + def __init__(self, status: cudart.cudaError_t): self.status = status - cdef str name = cudaGetErrorName(status).decode() - cdef str msg = cudaGetErrorString(status).decode() + + _, name = cudart.cudaGetErrorName(status) + _, msg = cudart.cudaGetErrorString(status) + super(CUDARuntimeError, self).__init__( - '%s: %s' % (name, msg)) + f"{name.decode()}: {msg.decode()}" + ) def __reduce__(self): return (type(self), (self.status,)) class CUDADriverError(RuntimeError): - - def __init__(self, CUresult status): + def __init__(self, status: cuda.CUresult): self.status = status - cdef const char* name_cstr - cdef CUresult name_status = cuGetErrorName(status, &name_cstr) - if name_status != 0: - raise CUDADriverError(name_status) - - cdef const char* msg_cstr - cdef CUresult msg_status = cuGetErrorString(status, &msg_cstr) - if msg_status != 0: - raise CUDADriverError(msg_status) - - cdef str name = name_cstr.decode() - cdef str msg = msg_cstr.decode() + _, name = cuda.cuGetErrorName(status) + _, msg = cuda.cuGetErrorString(status) super(CUDADriverError, self).__init__( - '%s: %s' % (name, msg)) + f"{name.decode()}: {msg.decode()}" + ) def __reduce__(self): return (type(self), (self.status,)) @@ -69,9 +43,8 @@ def driverGetVersion(): This function automatically raises CUDARuntimeError with error message and status code. """ - cdef int version - cdef cudaError_t status = cudaDriverGetVersion(&version) - if status != cudaError.cudaSuccess: + status, version = cudart.cudaDriverGetVersion() + if status != cudart.cudaError_t.cudaSuccess: raise CUDARuntimeError(status) return version @@ -80,14 +53,13 @@ def getDevice(): """ Get the current CUDA device """ - cdef int current_device - cdef cudaError_t status = cudaGetDevice(¤t_device) - if status != cudaError.cudaSuccess: + status, device = cudart.cudaGetDevice() + if status != cudart.cudaError_t.cudaSuccess: raise CUDARuntimeError(status) - return current_device + return device -def setDevice(int device): +def setDevice(device: int): """ Set the current CUDA device Parameters @@ -95,9 +67,8 @@ def setDevice(int device): device : int The ID of the device to set as current """ - cdef cudaError_t status = cudaSetDevice(device) - - if status != cudaError.cudaSuccess: + (status,) = cudart.cudaSetDevice(device) + if status != cudart.cudaError_t.cudaSuccess: raise CUDARuntimeError(status) @@ -110,10 +81,8 @@ def runtimeGetVersion(): This function automatically raises CUDARuntimeError with error message and status code. """ - - cdef int version - cdef cudaError_t status = cudaRuntimeGetVersion(&version) - if status != cudaError.cudaSuccess: + status, version = cudart.cudaRuntimeGetVersion() + if status != cudart.cudaError_t.cudaSuccess: raise CUDARuntimeError(status) return version @@ -126,16 +95,13 @@ def getDeviceCount(): This function automatically raises CUDARuntimeError with error message and status code. """ - - cdef int count - cdef cudaError_t status = cudaGetDeviceCount(&count) - - if status != cudaError.cudaSuccess: + status, count = cudart.cudaGetDeviceCount() + if status != cudart.cudaError_t.cudaSuccess: raise CUDARuntimeError(status) return count -def getDeviceAttribute(cudaDeviceAttr attr, int device): +def getDeviceAttribute(attr: cudart.cudaDeviceAttr, device: int): """ Returns information about the device. @@ -149,15 +115,13 @@ def getDeviceAttribute(cudaDeviceAttr attr, int device): This function automatically raises CUDARuntimeError with error message and status code. """ - - cdef int value - cdef cudaError_t status = cudaDeviceGetAttribute(&value, attr, device) - if status != cudaError.cudaSuccess: + status, value = cudart.cudaDeviceGetAttribute(attr, device) + if status != cudart.cudaError_t.cudaSuccess: raise CUDARuntimeError(status) return value -def getDeviceProperties(int device): +def getDeviceProperties(device: int): """ Returns information about the compute-device. @@ -169,15 +133,13 @@ def getDeviceProperties(int device): This function automatically raises CUDARuntimeError with error message and status code. """ - - cdef cudaDeviceProp prop - cdef cudaError_t status = cudaGetDeviceProperties(&prop, device) - if status != cudaError.cudaSuccess: + status, prop = cudart.cudaGetDeviceProperties(device) + if status != cudart.cudaError_t.cudaSuccess: raise CUDARuntimeError(status) return prop -def deviceGetName(int device): +def deviceGetName(device: int): """ Returns an identifer string for the device. @@ -190,12 +152,7 @@ def deviceGetName(int device): and status code. """ - cdef char[256] device_name - cdef CUresult status = cuDeviceGetName( - device_name, - sizeof(device_name), - device - ) - if status != 0: + status, device_name = cuda.cuDeviceGetName(256, cuda.CUdevice(device)) + if status != cuda.CUresult.CUDA_SUCCESS: raise CUDADriverError(status) return device_name.decode() diff --git a/python/rmm/_cuda/stream.pxd b/python/rmm/_cuda/stream.pxd index 0806a7310..6aa4e0b24 100644 --- a/python/rmm/_cuda/stream.pxd +++ b/python/rmm/_cuda/stream.pxd @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from cuda.ccudart cimport cudaStream_t from libc.stdint cimport uintptr_t from libcpp cimport bool from rmm._lib.cuda_stream_view cimport cuda_stream_view -from rmm._lib.lib cimport cudaStream_t cdef class Stream: diff --git a/python/rmm/_cuda/stream.pyx b/python/rmm/_cuda/stream.pyx index 4c9890d51..4f2ce26d0 100644 --- a/python/rmm/_cuda/stream.pyx +++ b/python/rmm/_cuda/stream.pyx @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from cuda.ccudart cimport cudaStream_t from libc.stdint cimport uintptr_t from libcpp cimport bool @@ -21,7 +22,6 @@ from rmm._lib.cuda_stream_view cimport ( cuda_stream_per_thread, cuda_stream_view, ) -from rmm._lib.lib cimport cudaStream_t from numba import cuda diff --git a/python/rmm/_lib/cuda_stream.pxd b/python/rmm/_lib/cuda_stream.pxd index 4eae4bc76..1eed1cefb 100644 --- a/python/rmm/_lib/cuda_stream.pxd +++ b/python/rmm/_lib/cuda_stream.pxd @@ -13,11 +13,11 @@ # limitations under the License. cimport cython +from cuda.ccudart cimport cudaStream_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from rmm._lib.cuda_stream_view cimport cuda_stream_view -from rmm._lib.lib cimport cudaStream_t cdef extern from "rmm/cuda_stream.hpp" namespace "rmm" nogil: diff --git a/python/rmm/_lib/cuda_stream.pyx b/python/rmm/_lib/cuda_stream.pyx index 44ca6d75e..d93af2509 100644 --- a/python/rmm/_lib/cuda_stream.pyx +++ b/python/rmm/_lib/cuda_stream.pyx @@ -13,6 +13,7 @@ # limitations under the License. cimport cython +from cuda.ccudart cimport cudaStream_t from libc.stdint cimport uintptr_t from libcpp cimport bool diff --git a/python/rmm/_lib/cuda_stream_view.pxd b/python/rmm/_lib/cuda_stream_view.pxd index 7031a03ea..bf0d33c24 100644 --- a/python/rmm/_lib/cuda_stream_view.pxd +++ b/python/rmm/_lib/cuda_stream_view.pxd @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from cuda.ccudart cimport cudaStream_t from libcpp cimport bool -from rmm._lib.lib cimport cudaStream_t - cdef extern from "rmm/cuda_stream_view.hpp" namespace "rmm" nogil: cdef cppclass cuda_stream_view: diff --git a/python/rmm/_lib/device_buffer.pyx b/python/rmm/_lib/device_buffer.pyx index ece2f911f..bc761c29c 100644 --- a/python/rmm/_lib/device_buffer.pyx +++ b/python/rmm/_lib/device_buffer.pyx @@ -20,20 +20,20 @@ from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from rmm._cuda.gpu cimport cudaError, cudaError_t from rmm._cuda.stream cimport Stream from rmm._cuda.stream import DEFAULT_STREAM -from rmm._lib.lib cimport ( +cimport cuda.ccudart as ccudart +from cuda.ccudart cimport ( + cudaError, + cudaError_t, cudaMemcpyAsync, - cudaMemcpyDeviceToDevice, - cudaMemcpyDeviceToHost, - cudaMemcpyHostToDevice, cudaMemcpyKind, cudaStream_t, cudaStreamSynchronize, ) + from rmm._lib.memory_resource cimport get_current_device_resource @@ -78,7 +78,6 @@ cdef class DeviceBuffer: >>> db = rmm.DeviceBuffer(size=5) """ cdef const void* c_ptr - cdef cudaError_t err with nogil: c_ptr = ptr @@ -344,7 +343,7 @@ cpdef DeviceBuffer to_device(const unsigned char[::1] b, cdef void _copy_async(const void* src, void* dst, size_t count, - cudaMemcpyKind kind, + ccudart.cudaMemcpyKind kind, cuda_stream_view stream) nogil: """ Asynchronously copy data between host and/or device pointers @@ -403,7 +402,7 @@ cpdef void copy_ptr_to_host(uintptr_t db, with nogil: _copy_async(db, &hb[0], len(hb), - cudaMemcpyDeviceToHost, stream.view()) + cudaMemcpyKind.cudaMemcpyDeviceToHost, stream.view()) if stream.c_is_default(): stream.c_synchronize() @@ -447,7 +446,7 @@ cpdef void copy_host_to_ptr(const unsigned char[::1] hb, with nogil: _copy_async(&hb[0], db, len(hb), - cudaMemcpyHostToDevice, stream.view()) + cudaMemcpyKind.cudaMemcpyHostToDevice, stream.view()) if stream.c_is_default(): stream.c_synchronize() @@ -480,4 +479,4 @@ cpdef void copy_device_to_ptr(uintptr_t d_src, with nogil: _copy_async(d_src, d_dst, count, - cudaMemcpyDeviceToDevice, stream.view()) + cudaMemcpyKind.cudaMemcpyDeviceToDevice, stream.view()) diff --git a/python/rmm/_lib/lib.pxd b/python/rmm/_lib/lib.pxd index c06d69872..e35b672e4 100644 --- a/python/rmm/_lib/lib.pxd +++ b/python/rmm/_lib/lib.pxd @@ -17,24 +17,4 @@ from libcpp cimport bool from libcpp.utility cimport pair from libcpp.vector cimport vector -from rmm._cuda.gpu cimport cudaError_t - ctypedef pair[const char*, unsigned int] caller_pair - - -cdef extern from * nogil: - - ctypedef void* cudaStream_t "cudaStream_t" - - ctypedef enum cudaMemcpyKind "cudaMemcpyKind": - cudaMemcpyHostToHost = 0 - cudaMemcpyHostToDevice = 1 - cudaMemcpyDeviceToHost = 2 - cudaMemcpyDeviceToDevice = 3 - cudaMemcpyDefault = 4 - - cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, - cudaMemcpyKind kind) - cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, - cudaMemcpyKind kind, cudaStream_t stream) - cudaError_t cudaStreamSynchronize(cudaStream_t stream) diff --git a/python/rmm/_lib/memory_resource.pyx b/python/rmm/_lib/memory_resource.pyx index d7711ed8a..16723a0a3 100644 --- a/python/rmm/_lib/memory_resource.pyx +++ b/python/rmm/_lib/memory_resource.pyx @@ -23,7 +23,9 @@ from libcpp.cast cimport dynamic_cast from libcpp.memory cimport make_shared, make_unique, shared_ptr, unique_ptr from libcpp.string cimport string -from rmm._cuda.gpu import CUDARuntimeError, cudaError, getDevice, setDevice +from cuda.cudart import cudaError_t + +from rmm._cuda.gpu import CUDARuntimeError, getDevice, setDevice # NOTE: Keep extern declarations in .pyx file as much as possible to avoid @@ -705,7 +707,7 @@ cpdef void _initialize( try: original_device = getDevice() except CUDARuntimeError as e: - if e.status == cudaError.cudaErrorNoDevice: + if e.status == cudaError_t.cudaErrorNoDevice: warnings.warn(e.msg) else: raise e diff --git a/python/setup.py b/python/setup.py index 8edaf4c31..50888ab3c 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,6 +1,5 @@ # Copyright (c) 2019-2021, NVIDIA CORPORATION. -import filecmp import glob import os import re @@ -26,7 +25,7 @@ import versioneer -install_requires = ["numba", "cython"] +install_requires = ["numba", "cython", "cuda-python"] def get_cuda_version_from_header(cuda_include_dir): @@ -75,43 +74,6 @@ def get_cuda_version_from_header(cuda_include_dir): # use uninstalled headers in source tree rmm_include_dir = "../include" -# Preprocessor step to specify correct pxd file with -# valid symbols for specific version of CUDA. - -cwd = os.getcwd() -files_to_preprocess = ["gpu.pxd"] - -# The .pxi file is unchanged between some CUDA versions -# (e.g., 11.0 & 11.1), so we keep only a single copy -# of it -cuda_version_to_pxi_dir = { - "10.1": "10.1", - "10.2": "10.2", - "11": "11.x", -} - -for pxd_basename in files_to_preprocess: - pxi_basename = os.path.splitext(pxd_basename)[0] + ".pxi" - pxi_dir = cuda_version_to_pxi_dir.get(CUDA_VERSION) - if not pxi_dir: - # didn't get an exact match on major.minor version - see if - # we have a match on just the major version - pxi_dir = cuda_version_to_pxi_dir.get(CUDA_VERSION.split(".")[0]) - - if pxi_dir: - pxi_pathname = os.path.join(cwd, "rmm/_cuda", pxi_dir, pxi_basename,) - pxd_pathname = os.path.join(cwd, "rmm/_cuda", pxd_basename) - try: - if filecmp.cmp(pxi_pathname, pxd_pathname): - # files are the same, no need to copy - continue - except FileNotFoundError: - # pxd_pathname doesn't exist yet - pass - shutil.copyfile(pxi_pathname, pxd_pathname) - else: - raise TypeError(f"{CUDA_VERSION} is not supported.") - include_dirs = [ rmm_include_dir, os.path.dirname(sysconfig.get_path("include")),