Merge pull request #257 from NVIDIA/ksimpson/tcc_memory_resource
Add fallback memory resource for TCC devices
leofang authored Dec 6, 2024
2 parents 1c86afa + 57f7003 commit c6a1a94
Showing 3 changed files with 43 additions and 3 deletions.
14 changes: 12 additions & 2 deletions cuda_core/cuda/core/experimental/_device.py
@@ -7,7 +7,7 @@
 
 from cuda import cuda, cudart
 from cuda.core.experimental._context import Context, ContextOptions
-from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool
+from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource
 from cuda.core.experimental._stream import Stream, StreamOptions, default_stream
 from cuda.core.experimental._utils import ComputeCapability, CUDAError, handle_return, precondition
@@ -62,7 +62,17 @@ def __new__(cls, device_id=None):
         for dev_id in range(total):
             dev = super().__new__(cls)
             dev._id = dev_id
-            dev._mr = _DefaultAsyncMempool(dev_id)
+            # If the device is in TCC mode, or does not support memory pools for some other reason,
+            # use _SynchronousMemoryResource, which does not use memory pools.
+            if (
+                handle_return(
+                    cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, dev_id)
+                )
+            ) == 1:
+                dev._mr = _DefaultAsyncMempool(dev_id)
+            else:
+                dev._mr = _SynchronousMemoryResource(dev_id)
+
             dev._has_inited = False
             _tls.devices.append(dev)
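For context, the gate above boils down to a single runtime attribute query per device. Below is a minimal standalone sketch of that check (an illustration, not part of the commit), using only the `cuda.cudart` binding already imported in the diff; the explicit error check stands in for the repo's `handle_return` helper:

```python
from cuda import cudart


def supports_memory_pools(dev_id: int) -> bool:
    """Report whether device `dev_id` supports stream-ordered memory pools.

    TCC-mode devices (and some other configurations) report 0 here, which
    is what triggers the _SynchronousMemoryResource fallback in this commit.
    """
    # cuda-python's low-level bindings return (error_code, result) tuples.
    err, value = cudart.cudaDeviceGetAttribute(
        cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, dev_id
    )
    if err != cudart.cudaError_t.cudaSuccess:
        raise RuntimeError(f"cudaDeviceGetAttribute failed: {err}")
    return value == 1
```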
30 changes: 30 additions & 0 deletions cuda_core/cuda/core/experimental/_memory.py
@@ -293,3 +293,33 @@ def is_host_accessible(self) -> bool:
     @property
     def device_id(self) -> int:
         raise RuntimeError("the pinned memory resource is not bound to any GPU")
+
+
+class _SynchronousMemoryResource(MemoryResource):
+    __slots__ = ("_dev_id",)
+
+    def __init__(self, dev_id):
+        self._handle = None
+        self._dev_id = dev_id
+
+    def allocate(self, size, stream=None) -> Buffer:
+        ptr = handle_return(cuda.cuMemAlloc(size))
+        return Buffer(ptr, size, self)
+
+    def deallocate(self, ptr, size, stream=None):
+        if stream is None:
+            stream = default_stream()
+        stream.sync()
+        handle_return(cuda.cuMemFree(ptr))
+
+    @property
+    def is_device_accessible(self) -> bool:
+        return True
+
+    @property
+    def is_host_accessible(self) -> bool:
+        return False
+
+    @property
+    def device_id(self) -> int:
+        return self._dev_id
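A short usage sketch of the fallback path through the public `Device` API (an example, not part of the commit; it assumes a CUDA-capable machine and the `cuda.core` 0.1.1 API shown in this diff — on devices that do support memory pools, allocation goes through the default mempool resource instead, with the same `Buffer` interface):

```python
from cuda.core.experimental import Device

dev = Device(0)
dev.set_current()

# On TCC devices (cudaDevAttrMemoryPoolsSupported == 0), allocations go
# through _SynchronousMemoryResource, i.e. plain cuMemAlloc/cuMemFree.
buf = dev.allocate(1 << 20)  # 1 MiB device buffer
assert buf.is_device_accessible
assert not buf.is_host_accessible

# deallocate() synchronizes the stream before calling cuMemFree, because
# cuMemFree, unlike cuMemFreeAsync, is not stream-ordered.
buf.close()
```

Synchronizing before `cuMemFree` trades the performance of stream-ordered allocation for correctness on devices without mempool support, which is exactly the trade-off named in the release note below.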
2 changes: 1 addition & 1 deletion cuda_core/docs/source/release/0.1.1-notes.md
@@ -5,7 +5,7 @@ Released on Dec XX, 2024
 ## Highlights
 - Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete
   implementation of DLPack & CUDA Array Interface support.
-
+- Support TCC devices with a default synchronous memory resource to avoid the use of memory pools
 
 ## Limitations
 
