From 319a372b75b0530e7f4600bbdc34197db2bf420c Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 29 Nov 2024 11:17:00 -0800 Subject: [PATCH 1/9] merge with main for ruff --- cuda_core/cuda/core/experimental/_device.py | 12 +++++-- cuda_core/cuda/core/experimental/_memory.py | 37 ++++++++++++++++++++- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 0c03c789..a5cd4bc7 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -7,7 +7,7 @@ from cuda import cuda, cudart from cuda.core.experimental._context import Context, ContextOptions -from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool +from cuda.core.experimental._memory import Buffer, MemoryResource, _AsyncMemoryResource, _DefaultAsyncMempool from cuda.core.experimental._stream import Stream, StreamOptions, default_stream from cuda.core.experimental._utils import ComputeCapability, CUDAError, handle_return, precondition @@ -62,7 +62,13 @@ def __new__(cls, device_id=None): for dev_id in range(total): dev = super().__new__(cls) dev._id = dev_id - dev._mr = _DefaultAsyncMempool(dev_id) + # If the device is in TCC mode, or does not support memory pools for some other reason, + # use the AsyncMemoryResource which does not use memory pools. 
+ if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 0: + dev._mr = _AsyncMemoryResource(dev_id) + else: + dev._mr = _DefaultAsyncMempool(dev_id) + dev._has_inited = False _tls.devices.append(dev) @@ -70,7 +76,7 @@ def __new__(cls, device_id=None): def _check_context_initialized(self, *args, **kwargs): if not self._has_inited: - raise CUDAError("the device is not yet initialized, perhaps you forgot to call .set_current() first?") + raise CUDAError("the device is not yet initialized, " "perhaps you forgot to call .set_current() first?") @property def device_id(self) -> int: diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 415b5151..50f8a260 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -42,7 +42,11 @@ class Buffer: """ # TODO: handle ownership? (_mr could be None) - __slots__ = ("_ptr", "_size", "_mr") + __slots__ = ( + "_ptr", + "_size", + "_mr", + ) def __init__(self, ptr, size, mr: MemoryResource = None): self._ptr = ptr @@ -286,3 +290,34 @@ def is_host_accessible(self) -> bool: @property def device_id(self) -> int: raise RuntimeError("the pinned memory resource is not bound to any GPU") + + +class _AsyncMemoryResource(MemoryResource): + __slots__ = ("_dev_id",) + + def __init__(self, dev_id): + self._handle = None + self._dev_id = dev_id + + def allocate(self, size, stream=None) -> Buffer: + if stream is None: + stream = default_stream() + ptr = handle_return(cuda.cuMemAllocAsync(size, stream._handle)) + return Buffer(ptr, size, self) + + def deallocate(self, ptr, size, stream=None): + if stream is None: + stream = default_stream() + handle_return(cuda.cuMemFreeAsync(ptr, stream._handle)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return False + + @property + def device_id(self) -> int: + return self._dev_id From 
19e3a4f4b54a4b9562742d8575ad1f8ca7e6e0a7 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 29 Nov 2024 11:18:04 -0800 Subject: [PATCH 2/9] fix tuple reformat --- cuda_core/cuda/core/experimental/_memory.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 50f8a260..26a9dd82 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -42,11 +42,7 @@ class Buffer: """ # TODO: handle ownership? (_mr could be None) - __slots__ = ( - "_ptr", - "_size", - "_mr", - ) + __slots__ = ("_ptr", "_size", "_mr") def __init__(self, ptr, size, mr: MemoryResource = None): self._ptr = ptr From 5e84da7cf888214ba940176a28089467f2afb055 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 29 Nov 2024 11:18:45 -0800 Subject: [PATCH 3/9] fix tuple reformat --- cuda_core/cuda/core/experimental/_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index a5cd4bc7..a15eef36 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -76,7 +76,7 @@ def __new__(cls, device_id=None): def _check_context_initialized(self, *args, **kwargs): if not self._has_inited: - raise CUDAError("the device is not yet initialized, " "perhaps you forgot to call .set_current() first?") + raise CUDAError("the device is not yet initialized, perhaps you forgot to call .set_current() first?") @property def device_id(self) -> int: From 122d25c01f4b8bbc02239bb1c2e58005c4bdb506 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 09:25:39 -0800 Subject: [PATCH 4/9] switch to sync alloc and free --- cuda_core/cuda/core/experimental/_device.py | 6 +++--- cuda_core/cuda/core/experimental/_memory.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index a15eef36..889c20a0 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -7,7 +7,7 @@ from cuda import cuda, cudart from cuda.core.experimental._context import Context, ContextOptions -from cuda.core.experimental._memory import Buffer, MemoryResource, _AsyncMemoryResource, _DefaultAsyncMempool +from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource from cuda.core.experimental._stream import Stream, StreamOptions, default_stream from cuda.core.experimental._utils import ComputeCapability, CUDAError, handle_return, precondition @@ -63,9 +63,9 @@ def __new__(cls, device_id=None): dev = super().__new__(cls) dev._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, - # use the AsyncMemoryResource which does not use memory pools. + # use the SynchronousMemoryResource which does not use memory pools. 
if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 0: - dev._mr = _AsyncMemoryResource(dev_id) + dev._mr = _SynchronousMemoryResource(dev_id) else: dev._mr = _DefaultAsyncMempool(dev_id) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 26a9dd82..16dd97d7 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -288,7 +288,7 @@ def device_id(self) -> int: raise RuntimeError("the pinned memory resource is not bound to any GPU") -class _AsyncMemoryResource(MemoryResource): +class _SynchronousMemoryResource(MemoryResource): __slots__ = ("_dev_id",) def __init__(self, dev_id): @@ -298,13 +298,13 @@ def __init__(self, dev_id): def allocate(self, size, stream=None) -> Buffer: if stream is None: stream = default_stream() - ptr = handle_return(cuda.cuMemAllocAsync(size, stream._handle)) + ptr = handle_return(cuda.cuMemAlloc(size, stream._handle)) return Buffer(ptr, size, self) def deallocate(self, ptr, size, stream=None): if stream is None: stream = default_stream() - handle_return(cuda.cuMemFreeAsync(ptr, stream._handle)) + handle_return(cuda.cuMemFree(ptr, stream._handle)) @property def is_device_accessible(self) -> bool: From 27ec6d3dabd76238da0974c945c65b7c81ae7c22 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 13:26:01 -0800 Subject: [PATCH 5/9] add release notes --- cuda_core/docs/source/release.md | 1 + cuda_core/docs/source/release/0.1.1-notes.md | 0 2 files changed, 1 insertion(+) create mode 100644 cuda_core/docs/source/release/0.1.1-notes.md diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 48e24786..55090b0b 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -5,5 +5,6 @@ maxdepth: 3 --- + 0.1.1 0.1.0 ``` diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md new file mode 100644 
index 00000000..e69de29b From 42c4b45241f2c5f08ca96c7560788fec769ef1c0 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 13:28:58 -0800 Subject: [PATCH 6/9] make true the default path --- cuda_core/cuda/core/experimental/_device.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 889c20a0..88676cf6 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -64,10 +64,10 @@ def __new__(cls, device_id=None): dev._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. - if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 0: - dev._mr = _SynchronousMemoryResource(dev_id) - else: + if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 1: dev._mr = _DefaultAsyncMempool(dev_id) + else: + dev._mr = _SynchronousMemoryResource(dev_id) dev._has_inited = False _tls.devices.append(dev) From 64b1f22e9fae282739c6cf9aaf4005a0f289914b Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 15:38:41 -0800 Subject: [PATCH 7/9] minor rewording --- cuda_core/docs/source/release/0.1.1-notes.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md index e69de29b..d80e6ef4 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.md +++ b/cuda_core/docs/source/release/0.1.1-notes.md @@ -0,0 +1,7 @@ +# `cuda.core` Release notes + +Released on Dec X, 2024 + +## Highlights +- Support TCC devices with a default synchronous memory resource to avoid the use of memory pools + From 1d80ca70f1ef4d0435d4aa49ee97d9ec8b254588 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 08:48:38 -0800 Subject: [PATCH 8/9] fix some known issues before
colossus test --- cuda_core/cuda/core/experimental/_device.py | 6 +++++- cuda_core/cuda/core/experimental/_memory.py | 8 ++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 88676cf6..db5f57cf 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -64,7 +64,11 @@ def __new__(cls, device_id=None): dev._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. - if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 1: + if ( + handle_return( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, dev_id) + ) + ) == 1: dev._mr = _DefaultAsyncMempool(dev_id) else: dev._mr = _SynchronousMemoryResource(dev_id) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index ac6a78fe..5ff00ba2 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -303,15 +303,11 @@ def __init__(self, dev_id): self._dev_id = dev_id def allocate(self, size, stream=None) -> Buffer: - if stream is None: - stream = default_stream() - ptr = handle_return(cuda.cuMemAlloc(size, stream._handle)) + ptr = handle_return(cuda.cuMemAlloc(size)) return Buffer(ptr, size, self) def deallocate(self, ptr, size, stream=None): - if stream is None: - stream = default_stream() - handle_return(cuda.cuMemFree(ptr, stream._handle)) + handle_return(cuda.cuMemFree(ptr)) @property def is_device_accessible(self) -> bool: From b6d73c8ef4efb9ceb97c52f73cc4fb0a60d910c6 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Fri, 6 Dec 2024 11:13:14 -0800 Subject: [PATCH 9/9] Update cuda_core/cuda/core/experimental/_memory.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/experimental/_memory.py |
3 +++ 1 file changed, 3 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 5ff00ba2..12fafb39 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -307,6 +307,9 @@ def allocate(self, size, stream=None) -> Buffer: return Buffer(ptr, size, self) def deallocate(self, ptr, size, stream=None): + if stream is None: + stream = default_stream() + stream.sync() handle_return(cuda.cuMemFree(ptr)) @property