[Fix] Clean up Runtime API.
lshqqytiger committed May 21, 2024
1 parent 11cc584 commit 2ad9ad6
Showing 3 changed files with 62 additions and 47 deletions.
21 changes: 20 additions & 1 deletion hip_runtime-sys/src/hip_runtime_api.rs
@@ -7154,9 +7154,17 @@ extern "C" {
 extern "C" {
     #[must_use]
     pub fn __hipRegisterFatBinary(
-        data: *const ::std::os::raw::c_void,
+        data: *mut ::std::os::raw::c_void,
     ) -> *mut *mut ::std::os::raw::c_void;
 }
+/*
+extern "C" {
+    #[must_use]
+    pub fn __hipRegisterFatBinaryEnd(
+        fatCubinHandle: *mut *mut ::std::os::raw::c_void,
+    ) -> ::std::os::raw::c_void;
+}
+*/
 extern "C" {
     #[must_use]
     pub fn __hipRegisterFunction(
@@ -7172,6 +7180,17 @@ extern "C" {
         wSize: *mut ::std::os::raw::c_int,
     ) -> ::std::os::raw::c_void;
 }
+/*
+extern "C" {
+    #[must_use]
+    pub fn __hipRegisterHostVar(
+        fatCubinHandle: *mut *mut ::std::os::raw::c_void,
+        deviceName: *const ::std::os::raw::c_char,
+        hostVar: *mut ::std::os::raw::c_char,
+        size: usize,
+    ) -> ::std::os::raw::c_void;
+}
+*/
 extern "C" {
     #[must_use]
     pub fn __hipRegisterManagedVar(
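The only behavioral change in this file is `__hipRegisterFatBinary` now taking `*mut` instead of `*const` data; the `__hipRegisterFatBinaryEnd` and `__hipRegisterHostVar` declarations are added but left commented out. A minimal sketch of adapting a caller to the updated binding (the `register_fat_binary` wrapper below is illustrative only, not part of hip_runtime-sys):

```rust
use std::os::raw::c_void;

extern "C" {
    #[must_use]
    fn __hipRegisterFatBinary(data: *mut c_void) -> *mut *mut c_void;
}

// Illustrative wrapper only: callers that previously passed `*const c_void`
// now need a mutable pointer to the fat-binary wrapper they register.
unsafe fn register_fat_binary(data: *mut c_void) -> *mut *mut c_void {
    __hipRegisterFatBinary(data)
}
```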
15 changes: 6 additions & 9 deletions zluda_runtime/src/cudart.rs
@@ -3565,10 +3565,7 @@ pub unsafe extern "system" fn cudaGetDeviceProperties(
     prop: *mut cudaDeviceProp,
     device: ::std::os::raw::c_int,
 ) -> cudaError_t {
-    crate::get_device_properties(
-        prop,
-        device,
-    )
+    crate::unsupported()
 }

#[doc = " \\brief Returns information about the device\n\n Returns in \\p *value the integer value of the attribute \\p attr on device\n \\p device. The supported attributes are:\n - ::cudaDevAttrMaxThreadsPerBlock: Maximum number of threads per block\n - ::cudaDevAttrMaxBlockDimX: Maximum x-dimension of a block\n - ::cudaDevAttrMaxBlockDimY: Maximum y-dimension of a block\n - ::cudaDevAttrMaxBlockDimZ: Maximum z-dimension of a block\n - ::cudaDevAttrMaxGridDimX: Maximum x-dimension of a grid\n - ::cudaDevAttrMaxGridDimY: Maximum y-dimension of a grid\n - ::cudaDevAttrMaxGridDimZ: Maximum z-dimension of a grid\n - ::cudaDevAttrMaxSharedMemoryPerBlock: Maximum amount of shared memory\n available to a thread block in bytes\n - ::cudaDevAttrTotalConstantMemory: Memory available on device for\n __constant__ variables in a CUDA C kernel in bytes\n - ::cudaDevAttrWarpSize: Warp size in threads\n - ::cudaDevAttrMaxPitch: Maximum pitch in bytes allowed by the memory copy\n functions that involve memory regions allocated through ::cudaMallocPitch()\n - ::cudaDevAttrMaxTexture1DWidth: Maximum 1D texture width\n - ::cudaDevAttrMaxTexture1DLinearWidth: Maximum width for a 1D texture bound\n to linear memory\n - ::cudaDevAttrMaxTexture1DMipmappedWidth: Maximum mipmapped 1D texture width\n - ::cudaDevAttrMaxTexture2DWidth: Maximum 2D texture width\n - ::cudaDevAttrMaxTexture2DHeight: Maximum 2D texture height\n - ::cudaDevAttrMaxTexture2DLinearWidth: Maximum width for a 2D texture\n bound to linear memory\n - ::cudaDevAttrMaxTexture2DLinearHeight: Maximum height for a 2D texture\n bound to linear memory\n - ::cudaDevAttrMaxTexture2DLinearPitch: Maximum pitch in bytes for a 2D\n texture bound to linear memory\n - ::cudaDevAttrMaxTexture2DMipmappedWidth: Maximum mipmapped 2D texture\n width\n - ::cudaDevAttrMaxTexture2DMipmappedHeight: Maximum mipmapped 2D texture\n height\n - ::cudaDevAttrMaxTexture3DWidth: Maximum 3D texture width\n - ::cudaDevAttrMaxTexture3DHeight: Maximum 3D texture height\n - ::cudaDevAttrMaxTexture3DDepth: Maximum 3D texture depth\n - ::cudaDevAttrMaxTexture3DWidthAlt: Alternate maximum 3D texture width,\n 0 if no alternate maximum 3D texture size is supported\n - ::cudaDevAttrMaxTexture3DHeightAlt: Alternate maximum 3D texture height,\n 0 if no alternate maximum 3D texture size is supported\n - ::cudaDevAttrMaxTexture3DDepthAlt: Alternate maximum 3D texture depth,\n 0 if no alternate maximum 3D texture size is supported\n - ::cudaDevAttrMaxTextureCubemapWidth: Maximum cubemap texture width or\n height\n - ::cudaDevAttrMaxTexture1DLayeredWidth: Maximum 1D layered texture width\n - ::cudaDevAttrMaxTexture1DLayeredLayers: Maximum layers in a 1D layered\n texture\n - ::cudaDevAttrMaxTexture2DLayeredWidth: Maximum 2D layered texture width\n - ::cudaDevAttrMaxTexture2DLayeredHeight: Maximum 2D layered texture height\n - ::cudaDevAttrMaxTexture2DLayeredLayers: Maximum layers in a 2D layered\n texture\n - ::cudaDevAttrMaxTextureCubemapLayeredWidth: Maximum cubemap layered\n texture width or height\n - ::cudaDevAttrMaxTextureCubemapLayeredLayers: Maximum layers in a cubemap\n layered texture\n - ::cudaDevAttrMaxSurface1DWidth: Maximum 1D surface width\n - ::cudaDevAttrMaxSurface2DWidth: Maximum 2D surface width\n - ::cudaDevAttrMaxSurface2DHeight: Maximum 2D surface height\n - ::cudaDevAttrMaxSurface3DWidth: Maximum 3D surface width\n - ::cudaDevAttrMaxSurface3DHeight: Maximum 3D surface height\n - ::cudaDevAttrMaxSurface3DDepth: Maximum 3D surface depth\n - 
::cudaDevAttrMaxSurface1DLayeredWidth: Maximum 1D layered surface width\n - ::cudaDevAttrMaxSurface1DLayeredLayers: Maximum layers in a 1D layered\n surface\n - ::cudaDevAttrMaxSurface2DLayeredWidth: Maximum 2D layered surface width\n - ::cudaDevAttrMaxSurface2DLayeredHeight: Maximum 2D layered surface height\n - ::cudaDevAttrMaxSurface2DLayeredLayers: Maximum layers in a 2D layered\n surface\n - ::cudaDevAttrMaxSurfaceCubemapWidth: Maximum cubemap surface width\n - ::cudaDevAttrMaxSurfaceCubemapLayeredWidth: Maximum cubemap layered\n surface width\n - ::cudaDevAttrMaxSurfaceCubemapLayeredLayers: Maximum layers in a cubemap\n layered surface\n - ::cudaDevAttrMaxRegistersPerBlock: Maximum number of 32-bit registers\n available to a thread block\n - ::cudaDevAttrClockRate: Peak clock frequency in kilohertz\n - ::cudaDevAttrTextureAlignment: Alignment requirement; texture base\n addresses aligned to ::textureAlign bytes do not need an offset applied\n to texture fetches\n - ::cudaDevAttrTexturePitchAlignment: Pitch alignment requirement for 2D\n texture references bound to pitched memory\n - ::cudaDevAttrGpuOverlap: 1 if the device can concurrently copy memory\n between host and device while executing a kernel, or 0 if not\n - ::cudaDevAttrMultiProcessorCount: Number of multiprocessors on the device\n - ::cudaDevAttrKernelExecTimeout: 1 if there is a run time limit for kernels\n executed on the device, or 0 if not\n - ::cudaDevAttrIntegrated: 1 if the device is integrated with the memory\n subsystem, or 0 if not\n - ::cudaDevAttrCanMapHostMemory: 1 if the device can map host memory into\n the CUDA address space, or 0 if not\n - ::cudaDevAttrComputeMode: Compute mode is the compute mode that the device\n is currently in. Available modes are as follows:\n - ::cudaComputeModeDefault: Default mode - Device is not restricted and\n multiple threads can use ::cudaSetDevice() with this device.\n - ::cudaComputeModeExclusive: Compute-exclusive mode - Only one thread will\n be able to use ::cudaSetDevice() with this device.\n - ::cudaComputeModeProhibited: Compute-prohibited mode - No threads can use\n ::cudaSetDevice() with this device.\n - ::cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many\n threads in one process will be able to use ::cudaSetDevice() with this\n device.\n - ::cudaDevAttrConcurrentKernels: 1 if the device supports executing\n multiple kernels within the same context simultaneously, or 0 if\n not. It is not guaranteed that multiple kernels will be resident on the\n device concurrently so this feature should not be relied upon for\n correctness.\n - ::cudaDevAttrEccEnabled: 1 if error correction is enabled on the device,\n 0 if error correction is disabled or not supported by the device\n - ::cudaDevAttrPciBusId: PCI bus identifier of the device\n - ::cudaDevAttrPciDeviceId: PCI device (also known as slot) identifier of\n the device\n - ::cudaDevAttrTccDriver: 1 if the device is using a TCC driver. TCC is only\n available on Tesla hardware running Windows Vista or later.\n - ::cudaDevAttrMemoryClockRate: Peak memory clock frequency in kilohertz\n - ::cudaDevAttrGlobalMemoryBusWidth: Global memory bus width in bits\n - ::cudaDevAttrL2CacheSize: Size of L2 cache in bytes. 
0 if the device\n doesn't have L2 cache.\n - ::cudaDevAttrMaxThreadsPerMultiProcessor: Maximum resident threads per\n multiprocessor\n - ::cudaDevAttrUnifiedAddressing: 1 if the device shares a unified address\n space with the host, or 0 if not\n - ::cudaDevAttrComputeCapabilityMajor: Major compute capability version\n number\n - ::cudaDevAttrComputeCapabilityMinor: Minor compute capability version\n number\n - ::cudaDevAttrStreamPrioritiesSupported: 1 if the device supports stream\n priorities, or 0 if not\n - ::cudaDevAttrGlobalL1CacheSupported: 1 if device supports caching globals\n in L1 cache, 0 if not\n - ::cudaDevAttrLocalL1CacheSupported: 1 if device supports caching locals\n in L1 cache, 0 if not\n - ::cudaDevAttrMaxSharedMemoryPerMultiprocessor: Maximum amount of shared memory\n available to a multiprocessor in bytes; this amount is shared by all\n thread blocks simultaneously resident on a multiprocessor\n - ::cudaDevAttrMaxRegistersPerMultiprocessor: Maximum number of 32-bit registers\n available to a multiprocessor; this number is shared by all thread blocks\n simultaneously resident on a multiprocessor\n - ::cudaDevAttrManagedMemory: 1 if device supports allocating\n managed memory, 0 if not\n - ::cudaDevAttrIsMultiGpuBoard: 1 if device is on a multi-GPU board, 0 if not\n - ::cudaDevAttrMultiGpuBoardGroupID: Unique identifier for a group of devices on the\n same multi-GPU board\n - ::cudaDevAttrHostNativeAtomicSupported: 1 if the link between the device and the\n host supports native atomic operations\n - ::cudaDevAttrSingleToDoublePrecisionPerfRatio: Ratio of single precision performance\n (in floating-point operations per second) to double precision performance\n - ::cudaDevAttrPageableMemoryAccess: 1 if the device supports coherently accessing\n pageable memory without calling cudaHostRegister on it, and 0 otherwise\n - ::cudaDevAttrConcurrentManagedAccess: 1 if the device can coherently access managed\n memory concurrently with the CPU, and 0 otherwise\n - ::cudaDevAttrComputePreemptionSupported: 1 if the device supports\n Compute Preemption, 0 if not\n - ::cudaDevAttrCanUseHostPointerForRegisteredMem: 1 if the device can access host\n registered memory at the same virtual address as the CPU, and 0 otherwise\n - ::cudaDevAttrCooperativeLaunch: 1 if the device supports launching cooperative kernels\n via ::cudaLaunchCooperativeKernel, and 0 otherwise\n - ::cudaDevAttrCooperativeMultiDeviceLaunch: 1 if the device supports launching cooperative\n kernels via ::cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise\n - ::cudaDevAttrCanFlushRemoteWrites: 1 if the device supports flushing of outstanding\n remote writes, and 0 otherwise\n - ::cudaDevAttrHostRegisterSupported: 1 if the device supports host memory registration\n via ::cudaHostRegister, and 0 otherwise\n - ::cudaDevAttrPageableMemoryAccessUsesHostPageTables: 1 if the device accesses pageable memory via the\n host's page tables, and 0 otherwise\n - ::cudaDevAttrDirectManagedMemAccessFromHost: 1 if the host can directly access managed memory on the device\n without migration, and 0 otherwise\n - ::cudaDevAttrMaxSharedMemoryPerBlockOptin: Maximum per block shared memory size on the device. 
This value can\n be opted into when using ::cudaFuncSetAttribute\n - ::cudaDevAttrMaxBlocksPerMultiprocessor: Maximum number of thread blocks that can reside on a multiprocessor\n - ::cudaDevAttrMaxPersistingL2CacheSize: Maximum L2 persisting lines capacity setting in bytes\n - ::cudaDevAttrMaxAccessPolicyWindowSize: Maximum value of cudaAccessPolicyWindow::num_bytes\n - ::cudaDevAttrReservedSharedMemoryPerBlock: Shared memory reserved by CUDA driver per block in bytes\n - ::cudaDevAttrSparseCudaArraySupported: 1 if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays.\n - ::cudaDevAttrHostRegisterReadOnlySupported: Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly\n to register memory that must be mapped as read-only to the GPU\n - ::cudaDevAttrMemoryPoolsSupported: 1 if the device supports using the cudaMallocAsync and cudaMemPool family of APIs, and 0 otherwise\n - ::cudaDevAttrGPUDirectRDMASupported: 1 if the device supports GPUDirect RDMA APIs, and 0 otherwise\n - ::cudaDevAttrGPUDirectRDMAFlushWritesOptions: bitmask to be interpreted according to the ::cudaFlushGPUDirectRDMAWritesOptions enum\n - ::cudaDevAttrGPUDirectRDMAWritesOrdering: see the ::cudaGPUDirectRDMAWritesOrdering enum for numerical values\n - ::cudaDevAttrMemoryPoolSupportedHandleTypes: Bitmask of handle types supported with mempool based IPC\n\n \\param value - Returned device attribute value\n \\param attr - Device attribute to query\n \\param device - Device number to query\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidDevice,\n ::cudaErrorInvalidValue\n \\notefnerr\n \\note_init_rt\n \\note_callback\n\n \\sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice,\n ::cudaGetDeviceProperties,\n ::cuDeviceGetAttribute"]
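In the first hunk above, `cudaGetDeviceProperties` no longer has a dedicated implementation and is routed to the crate's generic `unsupported()` stub instead. The stub's body is not part of this diff; a self-contained mock of the usual pattern (assuming it simply reports an error code, with a stand-in for the bindgen-generated `cudaError_t`) could look like this:

```rust
// Self-contained mock, not ZLUDA's actual helper: a stub such as
// `unsupported()` typically just returns an error code for entry points the
// shim does not implement. The enum below is a stand-in for the bindgen type.
#[allow(non_camel_case_types, dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(i32)]
enum cudaError_t {
    cudaSuccess = 0,
    cudaErrorNotSupported = 801,
}

fn unsupported() -> cudaError_t {
    cudaError_t::cudaErrorNotSupported
}

fn main() {
    // Every stubbed entry point reports the same failure to the caller.
    assert_eq!(unsupported(), cudaError_t::cudaErrorNotSupported);
    println!("stubbed API returned {:?}", unsupported());
}
```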
@@ -7444,7 +7441,7 @@ pub unsafe extern "system" fn __cudaPopCallConfiguration(
     gridDim: *mut dim3,
     blockDim: *mut dim3,
     sharedMem: *mut usize,
-    stream: *mut ::std::os::raw::c_void,
+    stream: *mut cudaStream_t,
 ) -> cudaError_t {
     crate::pop_call_configuration(
         gridDim,
@@ -7459,8 +7456,8 @@ pub unsafe extern "system" fn __cudaPushCallConfiguration(
     gridDim: dim3,
     blockDim: dim3,
     sharedMem: usize,
-    stream: *mut ::std::os::raw::c_void,
-) -> ::std::os::raw::c_uint {
+    stream: cudaStream_t,
+) -> cudaError_t {
     crate::push_call_configuration(
         gridDim,
         blockDim,
@@ -7477,10 +7474,10 @@ pub unsafe extern "system" fn __cudaRegisterFatBinary(
 }
 
 #[no_mangle]
-pub extern "system" fn __cudaRegisterFatBinaryEnd(
+pub unsafe extern "system" fn __cudaRegisterFatBinaryEnd(
     fatCubinHandle: *mut *mut ::std::os::raw::c_void,
 ) -> () {
-
+    crate::register_fat_binary_end(fatCubinHandle)
 }
 
 #[no_mangle]
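Both call-configuration shims now take a properly typed stream (`cudaStream_t` instead of `*mut c_void`) and return `cudaError_t`, and `__cudaRegisterFatBinaryEnd` forwards to `crate::register_fat_binary_end` rather than doing nothing. The push/pop pair backs CUDA's `<<<grid, block, sharedMem, stream>>>` launch syntax: compiler-generated host code pushes the configuration and the kernel stub pops it before the actual launch. A self-contained mock of that handshake with simplified stand-in types (not ZLUDA's implementation):

```rust
use std::cell::RefCell;
use std::ffi::c_void;

// Simplified stand-ins; the real types come from the bindgen-generated bindings.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct dim3 { x: u32, y: u32, z: u32 }
type cudaStream_t = *mut c_void;

#[derive(Clone, Copy)]
struct CallConfig {
    grid: dim3,
    block: dim3,
    shared_mem: usize,
    stream: cudaStream_t,
}

thread_local! {
    // One configuration stack per thread, mirroring how the runtime keeps
    // launch configurations between push and pop.
    static CONFIG_STACK: RefCell<Vec<CallConfig>> = RefCell::new(Vec::new());
}

fn push_call_configuration(grid: dim3, block: dim3, shared_mem: usize, stream: cudaStream_t) {
    CONFIG_STACK.with(|s| s.borrow_mut().push(CallConfig { grid, block, shared_mem, stream }));
}

fn pop_call_configuration() -> Option<CallConfig> {
    CONFIG_STACK.with(|s| s.borrow_mut().pop())
}

fn main() {
    // Host code generated for `kernel<<<16, 256>>>()` would push first ...
    push_call_configuration(
        dim3 { x: 16, y: 1, z: 1 },
        dim3 { x: 256, y: 1, z: 1 },
        0,
        std::ptr::null_mut(),
    );
    // ... and the kernel stub pops the configuration before the actual launch.
    let cfg = pop_call_configuration().expect("a configuration was pushed");
    assert_eq!(cfg.block, dim3 { x: 256, y: 1, z: 1 });
}
```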
(The diff for the third changed file is not shown here.)
