
Add fallback memory resource for TCC devices #257

Merged (14 commits) on Dec 6, 2024
10 changes: 8 additions & 2 deletions cuda_core/cuda/core/experimental/_device.py
@@ -7,7 +7,7 @@

 from cuda import cuda, cudart
 from cuda.core.experimental._context import Context, ContextOptions
-from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool
+from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource
 from cuda.core.experimental._stream import Stream, StreamOptions, default_stream
 from cuda.core.experimental._utils import ComputeCapability, CUDAError, handle_return, precondition

@@ -62,7 +62,13 @@ def __new__(cls, device_id=None):
         for dev_id in range(total):
             dev = super().__new__(cls)
             dev._id = dev_id
-            dev._mr = _DefaultAsyncMempool(dev_id)
+            # If the device is in TCC mode, or does not support memory pools
+            # for some other reason, fall back to _SynchronousMemoryResource,
+            # which does not use memory pools.
+            if handle_return(cudart.cudaGetDeviceProperties(dev_id)).memoryPoolsSupported == 1:
+                dev._mr = _DefaultAsyncMempool(dev_id)
+            else:
+                dev._mr = _SynchronousMemoryResource(dev_id)

             dev._has_inited = False
             _tls.devices.append(dev)
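With this change, `Device` picks its memory resource automatically at construction time. For context, a minimal usage sketch (assuming the experimental `cuda.core` API shown in this diff; not part of the change itself):

```python
# Sketch: observing the fallback from user code. Assumes the experimental
# cuda.core API in this PR; illustration only, not part of the diff.
from cuda import cudart
from cuda.core.experimental import Device
from cuda.core.experimental._utils import handle_return

dev = Device(0)
dev.set_current()

# Same property the constructor checks; 0 on TCC-mode (pool-less) devices.
props = handle_return(cudart.cudaGetDeviceProperties(0))
print("memoryPoolsSupported:", props.memoryPoolsSupported)

# Allocation works either way; on a pool-less device the buffer is served
# by _SynchronousMemoryResource instead of the default async mempool.
buf = dev.allocate(64)
buf.close()
```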
31 changes: 31 additions & 0 deletions cuda_core/cuda/core/experimental/_memory.py
@@ -293,3 +293,34 @@ def is_host_accessible(self) -> bool:
     @property
     def device_id(self) -> int:
         raise RuntimeError("the pinned memory resource is not bound to any GPU")
+
+
+class _SynchronousMemoryResource(MemoryResource):
+    __slots__ = ("_dev_id",)
+
+    def __init__(self, dev_id):
+        self._handle = None
+        self._dev_id = dev_id
+
+    def allocate(self, size, stream=None) -> Buffer:
+        # cuMemAlloc takes only a size and is synchronous, so the stream
+        # argument is not needed here.
+        ptr = handle_return(cuda.cuMemAlloc(size))
+        return Buffer(ptr, size, self)
+
+    def deallocate(self, ptr, size, stream=None):
+        if stream is None:
+            stream = default_stream()
+        # Synchronize so no in-flight work on the stream still uses the
+        # pointer; cuMemFree itself takes no stream.
+        stream.sync()
+        handle_return(cuda.cuMemFree(ptr))
+
+    @property
+    def is_device_accessible(self) -> bool:
+        return True
+
+    @property
+    def is_host_accessible(self) -> bool:
+        return False
+
+    @property
+    def device_id(self) -> int:
+        return self._dev_id
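For illustration, a short sketch driving the new resource through the `MemoryResource` interface directly (it is a private class, so this shows the contract rather than recommended usage; assumes a current CUDA context):

```python
# Sketch: using _SynchronousMemoryResource directly. Internal API; shown
# only to illustrate the MemoryResource contract it implements.
from cuda.core.experimental import Device
from cuda.core.experimental._memory import _SynchronousMemoryResource

Device(0).set_current()  # cuMemAlloc requires a current context

mr = _SynchronousMemoryResource(0)
assert mr.is_device_accessible and not mr.is_host_accessible

buf = mr.allocate(4096)  # backed by cuMemAlloc, no memory pool involved
assert buf.device_id == 0
buf.close()              # routes back to mr.deallocate, i.e. cuMemFree
```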
28 changes: 15 additions & 13 deletions cuda_core/docs/source/release/0.1.1-notes.md
@@ -1,13 +1,15 @@
-# `cuda.core` Release notes
-
-Released on Dec XX, 2024
-
-## Hightlights
-- Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete
-  implementation of DLPack & CUDA Array Interface supports.
-
-
-## Limitations
-
-- All APIs are currently *experimental* and subject to change without deprecation notice.
-  Please kindly share your feedbacks with us so that we can make `cuda.core` better!
+# `cuda.core` Release notes
+
+Released on Dec XX, 2024
+
+## Highlights
+
+- Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete
+  implementation of DLPack & CUDA Array Interface support.
+- Support GPUs running in Windows TCC mode.
+
+
+## Limitations
+
+- All APIs are currently *experimental* and subject to change without deprecation notice.
+  Please kindly share your feedback with us so that we can make `cuda.core` better!