From 6397752fe90493abd324981d9551383184ebb827 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Wed, 23 Oct 2024 14:04:00 -0700 Subject: [PATCH 01/13] Add docstrings to cuda.core for Device class Use NumPy Style Python Docstrings to maintain consistency with cuda-bindings. --- cuda_core/cuda/core/experimental/_device.py | 172 ++++++++++++++++++-- 1 file changed, 155 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 65d5fe9b..7e321b20 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -19,10 +19,37 @@ class Device: + """Represents a GPU and acts as an entry point for cuda.core features. + This is a singleton object that helps ensure interoperability + across multiple libraries imported in the process to both see + and use the same GPU device. + + While acting as the entry point, many other CUDA resources can be + allocated such as streams and buffers. Any :obj:`Context` dependent + resource created through this device, will continue to refer to + this device's context. + + """ __slots__ = ("_id", "_mr", "_has_inited") def __new__(cls, device_id=None): + """Create and return a singleton :obj:`Device` object. + + Creates and returns a thread-local singleton :obj:`Device` object + corresponding to a specific device. + + Note + ---- + Will not initialize the GPU. + + Parameters + ---------- + device_id : int, optional + Device ordinal to return a :obj:`Device` object for. + Default value of `None` return the currently used device. + + """ # important: creating a Device instance does not initialize the GPU! 
if device_id is None: device_id = handle_return(cudart.cudaGetDevice()) @@ -54,15 +81,24 @@ def _check_context_initialized(self, *args, **kwargs): @property def device_id(self) -> int: + """Returns device ordinal.""" return self._id @property def pci_bus_id(self) -> str: + """Returns a PCI Bus Id string for this device.""" bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id)) return bus_id[:12].decode() @property def uuid(self) -> str: + """Returns a UUID for the device. + + Returns 16-octets identifying the device. If the device is in + MIG mode, returns its MIG UUID which uniquely identifies the + subscribed MIG compute instance. + + """ driver_ver = handle_return(cuda.cuDriverGetVersion()) if driver_ver >= 11040: uuid = handle_return(cuda.cuDeviceGetUuid_v2(self._id)) @@ -74,19 +110,21 @@ def uuid(self) -> str: @property def name(self) -> str: - # assuming a GPU name is less than 128 characters... - name = handle_return(cuda.cuDeviceGetName(128, self._id)) + """Returns the device name.""" + # CUDA Runtime uses up to 256 characters, use the same for consistency + name = handle_return(cuda.cuDeviceGetName(256, self._id)) name = name.split(b'\0')[0] return name.decode() @property def properties(self) -> dict: + """Returns information about the compute-device.""" # TODO: pythonize the key names return handle_return(cudart.cudaGetDeviceProperties(self._id)) @property def compute_capability(self) -> ComputeCapability: - """Returns a named tuple with 2 fields: major and minor. """ + """Returns a named tuple with 2 fields: major and minor.""" major = handle_return(cudart.cudaDeviceGetAttribute( cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)) minor = handle_return(cudart.cudaDeviceGetAttribute( @@ -96,12 +134,20 @@ def compute_capability(self) -> ComputeCapability: @property @precondition(_check_context_initialized) def context(self) -> Context: + """Returns the current :obj:`Context` associated with this device. 
+ + Note + ---- + Device must be initialized. + + """ ctx = handle_return(cuda.cuCtxGetCurrent()) assert int(ctx) != 0 return Context._from_ctx(ctx, self._id) @property def memory_resource(self) -> MemoryResource: + """Returns :obj:`MemoryResource` associated with this device.""" return self._mr @memory_resource.setter @@ -112,27 +158,50 @@ def memory_resource(self, mr): @property def default_stream(self) -> Stream: + """Returns default CUDA :obj:`Stream` associated with this device. + + Returns per-thread default stream if environment is set with + CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM, otherwise return a legacy stream. + + """ return default_stream() def __int__(self): + """Returns device_id.""" return self._id def __repr__(self): return f"" def set_current(self, ctx: Context=None) -> Union[Context, None]: - """ - Entry point of this object. Users always start a code by + """Set device to be used for GPU executions. + + Initializes CUDA and sets the calling thread to a valid CUDA + context. By default the primary context is used, but optional `ctx` + parameter can be used to explicitly supply a :obj:`Context` object. + + Providing a `ctx` causes the previous set context to be popped and returned. + + Parameters + ---------- + ctx : Context, optional + Optional context to push onto this device's current thread stack + + Returns + ------- + Union[Context, None], optional + Popped context + + Examples + -------- + Acts as an entry point of this object. Users always start a code by calling this method, e.g. - + >>> from cuda.core.experimental import Device >>> dev0 = Device(0) >>> dev0.set_current() >>> # ... do work on device 0 ... - - The optional ctx argument is for advanced users to bind a - CUDA context with the device. In this case, the previously - set context is popped and returned to the user. 
+ """ if ctx is not None: if not isinstance(ctx, Context): @@ -163,25 +232,94 @@ def set_current(self, ctx: Context=None) -> Union[Context, None]: self._has_inited = True def create_context(self, options: ContextOptions = None) -> Context: - # Create a Context object (but do NOT set it current yet!). - # ContextOptions is a dataclass for setting e.g. affinity or CIG - # options. + """Create a new :obj:`Context` object. + + Note + ---- + The newly context will not be set as current. + + Parameters + ---------- + options : ContextOptions, optional + Customizable dataclass for context creation options + + Returns + ------- + :obj:`Context` + Newly created Context object + + """ raise NotImplementedError("TODO") @precondition(_check_context_initialized) def create_stream(self, obj=None, options: StreamOptions=None) -> Stream: - # Create a Stream object by either holding a newly created - # CUDA stream or wrapping an existing foreign object supporting - # the __cuda_stream__ protocol. In the latter case, a reference - # to obj is held internally so that its lifetime is managed. + """Create a Stream object. + + New stream objects can be created in two different ways: + + 1) Create a new CUDA stream with customizable `options`. + 2) Wrap an existing foreign `obj` supporting the __cuda_stream__ protocol. + + Option (2) internally holds a reference to the foreign object + such that the lifetime is managed. + + Note + ---- + Device must be initialized. + + Parameters + ---------- + obj : Any, optional + Any object supporting the __cuda_stream__ protocol. + options : :obj:`StreamOptions`, optional + Customizable dataclass for stream creation options + + Returns + ------- + :obj:`Stream` + Newly created Stream object + + """ return Stream._init(obj=obj, options=options) @precondition(_check_context_initialized) def allocate(self, size, stream=None) -> Buffer: + """Allocate device memory from a specified stream. 
+ + Allocates device memory of `size` bytes on the specified `stream` + using the memory resource currently associated with this Device. + + Parameter `stream` is optional, using a default stream by default. + + Note + ---- + Device must be initialized. + + Parameters + ---------- + size : int + Number of bytes to allocate. + stream : :obj:`Stream`, optional + The stream establishing the stream ordering semantic. + Default value of `None` uses default stream. + + Returns + ------- + :obj:`Buffer` + Newly created Buffer object + + """ if stream is None: stream = default_stream() return self._mr.allocate(size, stream) @precondition(_check_context_initialized) def sync(self): + """Synchronize the device. + + Note + ---- + Device must be initialized. + + """ handle_return(cudart.cudaDeviceSynchronize()) From 9e9816084b2db9659d06fdae8035c57343caee8c Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Wed, 23 Oct 2024 14:06:32 -0700 Subject: [PATCH 02/13] Use "Return" wording to be consistent with default --- cuda_core/cuda/core/experimental/_device.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 7e321b20..9380c80b 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -81,18 +81,18 @@ def _check_context_initialized(self, *args, **kwargs): @property def device_id(self) -> int: - """Returns device ordinal.""" + """Return device ordinal.""" return self._id @property def pci_bus_id(self) -> str: - """Returns a PCI Bus Id string for this device.""" + """Return a PCI Bus Id string for this device.""" bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id)) return bus_id[:12].decode() @property def uuid(self) -> str: - """Returns a UUID for the device. + """Return a UUID for the device. Returns 16-octets identifying the device. 
If the device is in MIG mode, returns its MIG UUID which uniquely identifies the @@ -110,7 +110,7 @@ def uuid(self) -> str: @property def name(self) -> str: - """Returns the device name.""" + """Return the device name.""" # CUDA Runtime uses up to 256 characters, use the same for consistency name = handle_return(cuda.cuDeviceGetName(256, self._id)) name = name.split(b'\0')[0] @@ -118,13 +118,13 @@ def name(self) -> str: @property def properties(self) -> dict: - """Returns information about the compute-device.""" + """Return information about the compute-device.""" # TODO: pythonize the key names return handle_return(cudart.cudaGetDeviceProperties(self._id)) @property def compute_capability(self) -> ComputeCapability: - """Returns a named tuple with 2 fields: major and minor.""" + """Return a named tuple with 2 fields: major and minor.""" major = handle_return(cudart.cudaDeviceGetAttribute( cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)) minor = handle_return(cudart.cudaDeviceGetAttribute( @@ -134,7 +134,7 @@ def compute_capability(self) -> ComputeCapability: @property @precondition(_check_context_initialized) def context(self) -> Context: - """Returns the current :obj:`Context` associated with this device. + """Return the current :obj:`Context` associated with this device. Note ---- @@ -147,7 +147,7 @@ def context(self) -> Context: @property def memory_resource(self) -> MemoryResource: - """Returns :obj:`MemoryResource` associated with this device.""" + """Return :obj:`MemoryResource` associated with this device.""" return self._mr @memory_resource.setter @@ -158,7 +158,7 @@ def memory_resource(self, mr): @property def default_stream(self) -> Stream: - """Returns default CUDA :obj:`Stream` associated with this device. + """Return default CUDA :obj:`Stream` associated with this device. Returns per-thread default stream if environment is set with CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM, otherwise return a legacy stream. 
@@ -167,7 +167,7 @@ def default_stream(self) -> Stream: return default_stream() def __int__(self): - """Returns device_id.""" + """Return device_id.""" return self._id def __repr__(self): From f73876d9ebc23ca930bc687089a76ad5d3174ab6 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Wed, 23 Oct 2024 17:09:41 -0700 Subject: [PATCH 03/13] Add docstrings to cuda.core for Event and EventOptions class --- cuda_core/cuda/core/experimental/_event.py | 56 ++++++++++++++++++---- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index 3c85d9fe..36c76558 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -13,17 +13,48 @@ @dataclass class EventOptions: + """Customizable :obj:`Event` options. + + Attributes + ---------- + enable_timing : bool, optional + Event will record timing data. (Default to False) + busy_waited_sync : bool, optional + If True, event will use blocking synchronization. When a CPU + thread calls synchronize, the call will block until the event + has actually been completed. + Otherwise, the CPU thread will busy-wait until the event has + been completed. (Default to False) + support_ipc : bool, optional + Event will be suitable for interprocess use. + Note that enable_timing must be False. (Default to False) + + """ enable_timing: Optional[bool] = False busy_waited_sync: Optional[bool] = False support_ipc: Optional[bool] = False class Event: + """Represents a record of a specific point of execution within a CUDA stream. + Applications can asynchronously record events at any point in + the program. An event keeps a record of all previous work withinq + the last recorded stream. + + Events can be used to monitor device's progress, query completion + of work up to event's record, and help establish dependencies + between GPU work submissions. 
+ + """ __slots__ = ("_handle", "_timing_disabled", "_busy_waited") def __init__(self): - # minimal requirements for the destructor + """Unsupported function due to ambiguity. + + New events should instead be created through a :obj:`Stream` object. + + """ self._handle = None raise NotImplementedError( "directly creating an Event object can be ambiguous. Please call " @@ -51,37 +82,45 @@ def _init(options: Optional[EventOptions]=None): return self def __del__(self): + """Destroys the event.""" self.close() def close(self): - # Destroy the event. + """Destroys the event.""" if self._handle: handle_return(cuda.cuEventDestroy(self._handle)) self._handle = None @property def is_timing_disabled(self) -> bool: - # Check if this instance can be used for the timing purpose. + """Return True if the event does not record timing data, otherwise False.""" return self._timing_disabled @property def is_sync_busy_waited(self) -> bool: - # Check if the event synchronization would keep the CPU busy-waiting. + """Return True if the event synchronization would keep the CPU busy-waiting, otherwise False.""" return self._busy_waited @property def is_ipc_supported(self) -> bool: - # Check if this instance can be used for IPC. + """Return True if this event can be used as an interprocess event, otherwise False.""" raise NotImplementedError("TODO") def sync(self): - # Sync over the event. + """Synchronize until the event completes. + + If the event was created with busy_waited_sync, then the + calling CPU thread will block until the event has been + completed by the device. + Otherwise the CPU thread will busy-wait until the event + has been completed. + + """ handle_return(cuda.cuEventSynchronize(self._handle)) @property def is_done(self) -> bool: - # Return True if all captured works have been completed, - # otherwise False. 
+ """Return True if all captured works have been completed, otherwise False.""" result, = cuda.cuEventQuery(self._handle) if result == cuda.CUresult.CUDA_SUCCESS: return True @@ -92,4 +131,5 @@ def is_done(self) -> bool: @property def handle(self) -> int: + """Return event memory address.""" return int(self._handle) From e739e92cd022b83498b662632c41e6799134afc0 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Wed, 23 Oct 2024 17:12:04 -0700 Subject: [PATCH 04/13] Fix typo --- cuda_core/cuda/core/experimental/_event.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index 36c76558..83b0dfdb 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -39,7 +39,7 @@ class Event: """Represents a record of a specific point of execution within a CUDA stream. Applications can asynchronously record events at any point in - the program. An event keeps a record of all previous work withinq + the program. An event keeps a record of all previous work within the last recorded stream. 
Events can be used to monitor device's progress, query completion From 56efb3988506a96fb6ff105de8096c8d840e8e56 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Mon, 28 Oct 2024 15:20:13 -0700 Subject: [PATCH 05/13] Push saved work for device event and stream --- cuda_core/cuda/core/experimental/_device.py | 27 +++-- cuda_core/cuda/core/experimental/_event.py | 4 +- cuda_core/cuda/core/experimental/_stream.py | 127 +++++++++++++++++--- 3 files changed, 127 insertions(+), 31 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 9380c80b..f91d3457 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -160,8 +160,11 @@ def memory_resource(self, mr): def default_stream(self) -> Stream: """Return default CUDA :obj:`Stream` associated with this device. - Returns per-thread default stream if environment is set with - CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM, otherwise return a legacy stream. + The type of default stream returned depends on if the environment + variable CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM is set. + + If set, returns a per-thread default stream. Otherwise returns + the legacy stream. """ return default_stream() @@ -184,13 +187,13 @@ def set_current(self, ctx: Context=None) -> Union[Context, None]: Parameters ---------- - ctx : Context, optional - Optional context to push onto this device's current thread stack + ctx : :obj:`Context`, optional + Optional context to push onto this device's current thread stack. Returns ------- - Union[Context, None], optional - Popped context + Union[:obj:`Context`, None], optional + Popped context. 
Examples -------- @@ -240,13 +243,13 @@ def create_context(self, options: ContextOptions = None) -> Context: Parameters ---------- - options : ContextOptions, optional - Customizable dataclass for context creation options + options : :obj:`ContextOptions`, optional + Customizable dataclass for context creation options. Returns ------- :obj:`Context` - Newly created Context object + Newly created Context object. """ raise NotImplementedError("TODO") @@ -272,12 +275,12 @@ def create_stream(self, obj=None, options: StreamOptions=None) -> Stream: obj : Any, optional Any object supporting the __cuda_stream__ protocol. options : :obj:`StreamOptions`, optional - Customizable dataclass for stream creation options + Customizable dataclass for stream creation options. Returns ------- :obj:`Stream` - Newly created Stream object + Newly created stream object. """ return Stream._init(obj=obj, options=options) @@ -306,7 +309,7 @@ def allocate(self, size, stream=None) -> Buffer: Returns ------- :obj:`Buffer` - Newly created Buffer object + Newly created Buffer object. 
""" if stream is None: diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index 83b0dfdb..4805e093 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -82,7 +82,7 @@ def _init(options: Optional[EventOptions]=None): return self def __del__(self): - """Destroys the event.""" + """Return close(self)""" self.close() def close(self): @@ -131,5 +131,5 @@ def is_done(self) -> bool: @property def handle(self) -> int: - """Return event memory address.""" + """Return the underlying cudaEvent_t pointer address as Python int.""" return int(self._handle) diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 95f8ec50..e915f01f 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -20,17 +20,45 @@ @dataclass class StreamOptions: + """Customizable :obj:`Stream` options. + Attributes + ---------- + nonblocking : bool, optional + Stream does not synchronize with the NULL stream. (Default to True) + priority : int, optional + Stream priority where lower number represents a + higher priority. (Default to highest priority) + + """ nonblocking: bool = True priority: Optional[int] = None class Stream: + """Represents a queue of GPU operations that are executed in a specific order. + + Applications use streams to control the order of execution for + GPU work. Work within a single stream are executed sequentially. + Whereas work across multiple streams can be further controlled + using stream priorities and :obj:`Event` managements. + + Advanced users can utilize default streams for enforce complex + implicit synchronization behaviors. + + """ __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin", "_device_id", "_ctx_handle") def __init__(self): + """Unsupported function due to ambiguity. 
+ + New streams should instead be created through a :obj:`Device` + object, or created directly through using an existing handle + using Stream.from_handle() + + """ # minimal requirements for the destructor self._handle = None self._owner = None @@ -74,13 +102,12 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None): else: flags = cuda.CUstream_flags.CU_STREAM_DEFAULT + high, low = handle_return(cudart.cudaDeviceGetStreamPriorityRange()) if priority is not None: - high, low = handle_return( - cudart.cudaDeviceGetStreamPriorityRange()) if not (low <= priority <= high): raise ValueError(f"{priority=} is out of range {[low, high]}") else: - priority = 0 + priority = high self._handle = handle_return( cuda.cuStreamCreateWithPriority(flags, priority)) @@ -94,9 +121,16 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None): return self def __del__(self): + """Return close(self).""" self.close() def close(self): + """Destroy the stream. + + Destroys the stream if we own it. Borrowed foreign stream + object will instead have their references released. + + """ if self._owner is None: if self._handle and not self._builtin: handle_return(cuda.cuStreamDestroy(self._handle)) @@ -106,15 +140,17 @@ def close(self): @property def __cuda_stream__(self) -> Tuple[int, int]: + """Return an instance of a __cuda_stream__ protocol.""" return (0, int(self._handle)) @property def handle(self) -> int: - # Return the underlying cudaStream_t pointer address as Python int. 
+ """Return the underlying cudaStream_t pointer address as Python int.""" return int(self._handle) @property def is_nonblocking(self) -> bool: + """Return True if this is a nonblocking stream, otherwise False.""" if self._nonblocking is None: flag = handle_return(cuda.cuStreamGetFlags(self._handle)) if flag == cuda.CUstream_flags.CU_STREAM_NON_BLOCKING: @@ -125,15 +161,35 @@ def is_nonblocking(self) -> bool: @property def priority(self) -> int: + """Return the stream priority.""" if self._priority is None: prio = handle_return(cuda.cuStreamGetPriority(self._handle)) self._priority = prio return self._priority def sync(self): + """Synchronize the stream.""" handle_return(cuda.cuStreamSynchronize(self._handle)) def record(self, event: Event=None, options: EventOptions=None) -> Event: + """Record an event onto the stream. + + Creates an Event object (or reuses the given one) by + recording on the stream. + + Parameters + ---------- + event : :obj:`Event`, optional + Optional event object to be reused for recording. + options : :obj:`EventOptions`, optional + Customizable dataclass for event creation options. + + Returns + ------- + :obj:`Event` + Newly created event object. + + """ # Create an Event object (or reusing the given one) by recording # on the stream. Event flags such as disabling timing, nonblocking, # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. @@ -145,11 +201,15 @@ def record(self, event: Event=None, options: EventOptions=None) -> Event: return event def wait(self, event_or_stream: Union[Event, Stream]): - # Wait for a CUDA event or a CUDA stream to establish a stream order. - # - # If a Stream instance is provided, the effect is as if an event is - # recorded on the given stream, and then self waits on the recorded - # event. + """Wait for a CUDA event or a CUDA stream. + + Waiting for an event or a stream establishes a stream order. + + If a :obj:`Stream` is provided, then wait until the stream's + work is completed. 
This is done by recording a new :obj:`Event` + on the stream and then waiting on it. + + """ if isinstance(event_or_stream, Event): event = event_or_stream.handle discard_event = False @@ -175,12 +235,15 @@ def wait(self, event_or_stream: Union[Event, Stream]): @property def device(self) -> Device: - # Inverse look-up to find on which device this stream instance was - # created. - # - # Note that Stream.device.context might not necessarily agree with - # Stream.context, in cases where a different CUDA context is set - # current after a stream was created. + """Return :obj:`Device` singleton associated with this stream. + + Note + ---- + The current context on the device may differ from this + stream's context. This case occurs when a different CUDA + context is set current after a stream is created. + + """ from cuda.core.experimental._device import Device # avoid circular import if self._device_id is None: # Get the stream context first @@ -192,8 +255,7 @@ def device(self) -> Device: @property def context(self) -> Context: - # Inverse look-up to find in which CUDA context this stream instance - # was created + """Return the :obj:`Context` associated with this stream.""" if self._ctx_handle is None: self._ctx_handle = handle_return( cuda.cuStreamGetCtx(self._handle)) @@ -203,6 +265,28 @@ def context(self) -> Context: @staticmethod def from_handle(handle: int) -> Stream: + """Create a new :obj:`Stream` object from a foreign stream handle. + + Uses a cudaStream_t pointer address represented as a Python int + to create a new :obj:`Stream` object. + + Note + ---- + Stream lifetime is not managed, foreign object must remain + alive while this stream is active. + + Parameters + ---------- + handle : int + Stream handle representing the address of a foreign + stream object. + + Returns + ------- + :obj:`Stream` + Newly created stream object.
+ + """ class _stream_holder: @property def __cuda_stream__(self): @@ -235,6 +319,15 @@ def __init__(self): def default_stream(): + """Return the default CUDA :obj:`Stream`. + + The type of default stream returned depends on if the environment + variable CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM is set. + + If set, returns a per-thread default stream. Otherwise returns + the legacy stream. + + """ # TODO: flip the default use_ptds = int(os.environ.get('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', 0)) if use_ptds: From b5ec7ba225feb6846acb2e5731ffb11b52d5a175 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Tue, 29 Oct 2024 08:33:38 -0700 Subject: [PATCH 06/13] Correct wording of what default priority is --- cuda_core/cuda/core/experimental/_stream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index e915f01f..c607adea 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -28,7 +28,7 @@ class StreamOptions: Stream does not synchronize with the NULL stream. (Default to True) priority : int, optional Stream priority where lower number represents a - higher priority. (Default to highest priority) + higher priority. (Default to lowest priority) """ nonblocking: bool = True From 6b4b6e0e49154e5f91143419d3d3760f11bbb1c4 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Tue, 29 Oct 2024 10:32:03 -0700 Subject: [PATCH 07/13] Add docstrings to Buffer --- cuda_core/cuda/core/experimental/_memory.py | 82 ++++++++++++++++++--- 1 file changed, 71 insertions(+), 11 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 4ef2cbc3..99d1e129 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -22,19 +22,56 @@ class Buffer: + """Represents a handle to allocated memory. 
+ + This generic object provides a unified representation for how + different memory resources are to give access to their memory + allocations. + + Support for data interchange mechanisms are provided by + establishing both the DLPack and the Python-level buffer + protocols. + + """ # TODO: handle ownership? (_mr could be None) __slots__ = ("_ptr", "_size", "_mr",) def __init__(self, ptr, size, mr: MemoryResource=None): + """Initialize a new buffer object. + + Parameters + ---------- + ptr : Any + Allocated buffer handle object + size : Any + Memory size of the buffer + mr : :obj:`MemoryResource`, optional + Memory resource associated with the buffer + + """ self._ptr = ptr self._size = size self._mr = mr def __del__(self): - self.close(default_stream()) + """Return close(self).""" + self.close() def close(self, stream=None): + """Deallocate this buffer asynchronously on the given stream. + + This buffer is released back to their memory resource + asynchronously on the given stream. + + Parameters + ---------- + stream : Any, optional + The stream object with a __cuda_stream__ protocol to + use for asynchronous deallocation. Defaults to using + the default stream. + + """ if self._ptr and self._mr is not None: if stream is None: stream = default_stream() @@ -44,42 +81,56 @@ def close(self, stream=None): @property def handle(self): + """Return buffer handle object.""" return self._ptr @property def size(self): + """Return the memory size of this buffer.""" return self._size @property def memory_resource(self) -> MemoryResource: - # Return the memory resource from which this buffer was allocated. + """Return the memory resource associated with this buffer.""" return self._mr @property def is_device_accessible(self) -> bool: - # Check if this buffer can be accessed from GPUs. 
+ """Return True if this buffer can be accessed by the GPU, otherwise False.""" if self._mr is not None: return self._mr.is_device_accessible raise NotImplementedError @property def is_host_accessible(self) -> bool: - # Check if this buffer can be accessed from CPUs. + """Return True if this buffer can be accessed by the CPU, otherwise False.""" if self._mr is not None: return self._mr.is_host_accessible raise NotImplementedError @property def device_id(self) -> int: + """Return the device ordinal of this buffer.""" if self._mr is not None: return self._mr.device_id raise NotImplementedError def copy_to(self, dst: Buffer=None, *, stream) -> Buffer: - # Copy from this buffer to the dst buffer asynchronously on the - # given stream. The dst buffer is returned. If the dst is not provided, - # allocate one from self.memory_resource. Raise an exception if the - # stream is not provided. + """Copy from this buffer to the dst buffer asynchronously on the given stream. + + Copies the data from this buffer to the provided dst buffer. + If the dst buffer is not provided, then a new buffer is first + allocated using the associated memory resource before the copy. + + Parameters + ---------- + dst : :obj:`Buffer` + Source buffer to copy data from + stream : Any + Keyword argument specifying the stream for the + asynchronous copy + + """ if stream is None: raise ValueError("stream must be provided") if dst is None: @@ -93,8 +144,17 @@ def copy_to(self, dst: Buffer=None, *, stream) -> Buffer: return dst def copy_from(self, src: Buffer, *, stream): - # Copy from the src buffer to this buffer asynchronously on the - # given stream. Raise an exception if the stream is not provided. + """Copy from the src buffer to this buffer asynchronously on the given stream. 
+ + Parameters + ---------- + src : :obj:`Buffer` + Source buffer to copy data from + stream : Any + Keyword argument specifying the stream for the + asynchronous copy + + """ if stream is None: raise ValueError("stream must be provided") if src._size != self._size: @@ -141,7 +201,7 @@ def __buffer__(self, flags: int, /) -> memoryview: raise NotImplementedError("TODO") def __release_buffer__(self, buffer: memoryview, /): - # Supporting methond paired with __buffer__. + # Supporting method paired with __buffer__. raise NotImplementedError("TODO") From 23c194168fad141e5649941b685e2d93decbb9ee Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Tue, 29 Oct 2024 10:45:47 -0700 Subject: [PATCH 08/13] Add comment about MIG UUID on old CUDA versions --- cuda_core/cuda/core/experimental/_device.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index f91d3457..8e8f7e31 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -98,6 +98,11 @@ def uuid(self) -> str: MIG mode, returns its MIG UUID which uniquely identifies the subscribed MIG compute instance. + Note + ---- + MIG UUID is only returned when device is in MIG mode and the + driver is CUDA 11.4 or newer.
+ """ driver_ver = handle_return(cuda.cuDriverGetVersion()) if driver_ver >= 11040: From b411fa6dbeb167bc91fe54c2119957062d0a4522 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Wed, 30 Oct 2024 14:55:27 -0700 Subject: [PATCH 09/13] Add docstring to more classes and functions Classes: - Program - ObjectCode - Kernel - LaunchConfig Functions: - launch --- cuda_core/cuda/core/experimental/_device.py | 4 +- cuda_core/cuda/core/experimental/_launcher.py | 33 ++++++++++- cuda_core/cuda/core/experimental/_memory.py | 2 +- cuda_core/cuda/core/experimental/_module.py | 57 +++++++++++++++++++ cuda_core/cuda/core/experimental/_program.py | 50 ++++++++++++++++ cuda_core/cuda/core/experimental/_stream.py | 2 +- 6 files changed, 143 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 8e8f7e31..785f2ed1 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -254,7 +254,7 @@ def create_context(self, options: ContextOptions = None) -> Context: Returns ------- :obj:`Context` - Newly created Context object. + Newly created context object. """ raise NotImplementedError("TODO") @@ -314,7 +314,7 @@ def allocate(self, size, stream=None) -> Buffer: Returns ------- :obj:`Buffer` - Newly created Buffer object. + Newly created buffer object. """ if stream is None: diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py index 4b9533cb..a3250227 100644 --- a/cuda_core/cuda/core/experimental/_launcher.py +++ b/cuda_core/cuda/core/experimental/_launcher.py @@ -37,7 +37,23 @@ def _lazy_init(): @dataclass class LaunchConfig: - """ + """Customizable launch options. + + Attributes + ---------- + grid : Union[tuple, int] + Collection of threads that will execute a kernel function. + block : Union[tuple, int] + Group of threads (Thread Block) that will execute on the same + multiprocessor. 
Threads within a thread blocks have access to + shared memory and can be explicitly synchronized. + stream : :obj:`Stream` + The stream establishing the stream ordering semantic of a + launch. + shmem_size : int, optional + Dynamic shared-memory size per thread block in bytes. + (Default to size 0) + """ # TODO: expand LaunchConfig to include other attributes grid: Union[tuple, int] = None @@ -87,6 +103,21 @@ def _cast_to_3_tuple(self, cfg): def launch(kernel, config, *kernel_args): + """Launches a :obj:`Kernel` object with launch-time configuration. + + Invokes a :obj:`Kernel` object with specified launch-time + configurations. + + Parameters + ---------- + config : Any + Launch configurations inline with options provided by + :obj:`LaunchConfig` dataclass. + *kernel_args : Any + Variable length argument list that is provided to the + launching kernel. + + """ if not isinstance(kernel, Kernel): raise ValueError config = check_or_create_options(LaunchConfig, config, "launch config") diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 99d1e129..3eeea660 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -81,7 +81,7 @@ def close(self, stream=None): @property def handle(self): - """Return buffer handle object.""" + """Return the buffer handle object.""" return self._ptr @property diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 60d4db97..0b9e450c 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -46,10 +46,17 @@ def _lazy_init(): class Kernel: + """Represents a compiled kernel that had been loaded onto the device. + + Kernel instances can execution when passed directly into a + launch function. 
+ + """ __slots__ = ("_handle", "_module",) def __init__(self): + """Unsupported function whose creation is intended through an :obj:`ObjectCode` object.""" raise NotImplementedError("directly constructing a Kernel instance is not supported") @staticmethod @@ -65,12 +72,49 @@ def _from_obj(obj, mod): class ObjectCode: + """Represents the compiled program loaded onto the device. + + This object provides a unified interface for different types of + compiled programs that are loaded onto the device. + + """ __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map") _supported_code_type = ("cubin", "ptx", "fatbin") def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None): + """Create and return a compiled program as an instance of an :obj:`ObjectCode`. + + Loads the module library with specified module code and JIT options. + + Note + ---- + Usage under CUDA 11.x will only load to the current device + context. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the module to load, or + a file path string containing that module for loading. + code_type : Any + String of the compiled type. + Supported options are "ptx", "cubin" and "ltoir". + jit_options : Optional + Mapping of JIT options to use during module loading. + (Default to no options) + symbol_mapping : Optional + Keyword argument dictionary specifying how symbol names + should be mapped before trying to retrieve them. + (Default to no mappings) + + Returns + ------- + :obj:`ObjectCode` + Newly created :obj:`ObjectCode`. + + """ if code_type not in self._supported_code_type: raise ValueError _lazy_init() @@ -107,6 +151,19 @@ def __del__(self): pass def get_kernel(self, name): + """Return the :obj:`Kernel` of a specified name from this object code. + + Parameters + ---------- + name : Any + Name of the kernel to retrieve. + + Returns + ------- + :obj:`Kernel` + Newly created kernel object. 
+ + """ try: name = self._sym_map[name] except KeyError: diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index ec0778a3..2f4f974f 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -8,12 +8,34 @@ class Program: + """Represents the compilation machinery for processing programs into :obj:`ObjectCode`. + + This object provides a unified interface to multiple underlying + compiler libraries. Compilation support is enabled for a wide + range of code types and compilation types. + + """ __slots__ = ("_handle", "_backend", ) _supported_code_type = ("c++", ) _supported_target_type = ("ptx", "cubin", "ltoir", ) def __init__(self, code, code_type): + """Create an instance of a :obj:`Program` object. + + Parameters + ---------- + code : Any + String of the CUDA Runtime Compilation program. + code_type : Any + String of the code type. Only "c++" is currently supported. + + Returns + ------- + :obj:`Program` + Newly created program object. + + """ self._handle = None if code_type not in self._supported_code_type: raise NotImplementedError @@ -30,14 +52,40 @@ def __init__(self, code, code_type): raise NotImplementedError def __del__(self): + """Return close(self).""" self.close() def close(self): + """Destroys this program.""" if self._handle is not None: handle_return(nvrtc.nvrtcDestroyProgram(self._handle)) self._handle = None def compile(self, target_type, options=(), name_expressions=(), logs=None): + """Compile the program with a specific compilation type. + + Parameters + ---------- + target_type : Any + String of the targeted compilation type. + Supported options are "ptx", "cubin" and "ltoir". + options : Union[List, Tuple], optional + List of compilation options associated with the backend + of this :obj:`Program`. (Default to no options) + name_expressions : Union[List, Tuple], optional + List of explicit name expressions to become accessible. 
+ (Default to no expressions) + logs : Any, optional + Object with a write method to receive the logs generated + from compilation. + (Default to no logs) + + Returns + ------- + :obj:`ObjectCode` + Newly created code object. + + """ if target_type not in self._supported_target_type: raise NotImplementedError @@ -80,8 +128,10 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): @property def backend(self): + """Return the backend type string associated with this program.""" return self._backend @property def handle(self): + """Return the program handle object.""" return self._handle diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index c607adea..eb4f03a0 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -235,7 +235,7 @@ def wait(self, event_or_stream: Union[Event, Stream]): @property def device(self) -> Device: - """Return :obj:`Device` singleton associated with this stream. + """Return the :obj:`Device` singleton associated with this stream. 
Note ---- From 7497a938131979a724deda99b190f6362d71db6d Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Thu, 31 Oct 2024 12:47:54 -0700 Subject: [PATCH 10/13] Naming consistency and rewording --- cuda_core/cuda/core/experimental/_device.py | 2 +- cuda_core/cuda/core/experimental/_event.py | 2 +- cuda_core/cuda/core/experimental/_program.py | 2 +- cuda_core/cuda/core/experimental/_stream.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 785f2ed1..7890ae53 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -116,7 +116,7 @@ def uuid(self) -> str: @property def name(self) -> str: """Return the device name.""" - # CUDA Runtime uses up to 256 characters, use the same for consistency + # Use 256 characters to be consistent with CUDA Runtime name = handle_return(cuda.cuDeviceGetName(256, self._id)) name = name.split(b'\0')[0] return name.decode() diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index 4805e093..a1b03974 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -86,7 +86,7 @@ def __del__(self): self.close() def close(self): - """Destroys the event.""" + """Destroy the event.""" if self._handle: handle_return(cuda.cuEventDestroy(self._handle)) self._handle = None diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 2f4f974f..9123fdee 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -56,7 +56,7 @@ def __del__(self): self.close() def close(self): - """Destroys this program.""" + """Destroy this program.""" if self._handle is not None: handle_return(nvrtc.nvrtcDestroyProgram(self._handle)) self._handle = None diff --git 
a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index eb4f03a0..01f4fd1f 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -127,7 +127,7 @@ def __del__(self): def close(self): """Destroy the stream. - Destroys the stream if we own it. Borrowed foreign stream + Destroy the stream if we own it. Borrowed foreign stream object will instead have their references released. """ From cc3b2fd15d76c2224ad8fc6a696cec977860d2ac Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Fri, 1 Nov 2024 12:42:00 -0700 Subject: [PATCH 11/13] Refer to the correct ObjectCode supported types --- cuda_core/cuda/core/experimental/_module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 0b9e450c..b7802633 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -100,7 +100,7 @@ def __init__(self, module, code_type, jit_options=None, *, a file path string containing that module for loading. code_type : Any String of the compiled type. - Supported options are "ptx", "cubin" and "ltoir". + Supported options are "ptx", "cubin" and "fatbin". jit_options : Optional Mapping of JIT options to use during module loading. 
(Default to no options) From f3e746976978a30781b28259132e439ced00dff2 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Mon, 4 Nov 2024 12:54:49 -0800 Subject: [PATCH 12/13] Move init doc to class with a cleanup --- cuda_core/cuda/core/experimental/_device.py | 31 +++++----- cuda_core/cuda/core/experimental/_event.py | 10 ++- cuda_core/cuda/core/experimental/_launcher.py | 2 + cuda_core/cuda/core/experimental/_memory.py | 23 +++---- cuda_core/cuda/core/experimental/_module.py | 62 +++++++++---------- cuda_core/cuda/core/experimental/_program.py | 24 +++---- cuda_core/cuda/core/experimental/_stream.py | 14 ++--- 7 files changed, 72 insertions(+), 94 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 7890ae53..2899282c 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -19,7 +19,7 @@ class Device: - """Represents a GPU and acts as an entry point for cuda.core features. + """Represent a GPU and act as an entry point for cuda.core features. This is a singleton object that helps ensure interoperability across multiple libraries imported in the process to both see @@ -30,26 +30,23 @@ class Device: resource created through this device, will continue to refer to this device's context. - """ - __slots__ = ("_id", "_mr", "_has_inited") - - def __new__(cls, device_id=None): - """Create and return a singleton :obj:`Device` object. + Newly returend :obj:`Device` object are is a thread-local singleton + for a specified device. - Creates and returns a thread-local singleton :obj:`Device` object - corresponding to a specific device. + Note + ---- + Will not initialize the GPU. - Note - ---- - Will not initialize the GPU. + Parameters + ---------- + device_id : int, optional + Device ordinal to return a :obj:`Device` object for. + Default value of `None` return the currently used device. 
- Parameters - ---------- - device_id : int, optional - Device ordinal to return a :obj:`Device` object for. - Default value of `None` return the currently used device. + """ + __slots__ = ("_id", "_mr", "_has_inited") - """ + def __new__(cls, device_id=None): # important: creating a Device instance does not initialize the GPU! if device_id is None: device_id = handle_return(cudart.cudaGetDevice()) diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index a1b03974..a6d5da28 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -36,7 +36,7 @@ class EventOptions: class Event: - """Represents a record of a specific point of execution within a CUDA stream. + """Represent a record at a specific point of execution within a CUDA stream. Applications can asynchronously record events at any point in the program. An event keeps a record of all previous work within @@ -46,15 +46,13 @@ class Event: of work up to event's record, and help establish dependencies between GPU work submissions. + Directly creating an :obj:`Event` is not supported due to ambiguity, + and they should instead be created through a :obj:`Stream` object. + """ __slots__ = ("_handle", "_timing_disabled", "_busy_waited") def __init__(self): - """Unsupported function due to ambiguity. - - New events should instead be created through a :obj:`Stream` object. - - """ self._handle = None raise NotImplementedError( "directly creating an Event object can be ambiguous. Please call " diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py index a3250227..2787a718 100644 --- a/cuda_core/cuda/core/experimental/_launcher.py +++ b/cuda_core/cuda/core/experimental/_launcher.py @@ -110,6 +110,8 @@ def launch(kernel, config, *kernel_args): Parameters ---------- + kernel : :obj:`Kernel` + Kernel to launch. 
config : Any Launch configurations inline with options provided by :obj:`LaunchConfig` dataclass. diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 3eeea660..678f26ee 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -22,7 +22,7 @@ class Buffer: - """Represents a handle to allocated memory. + """Represent a handle to allocated memory. This generic object provides a unified representation for how different memory resources are to give access to their memory @@ -32,24 +32,21 @@ class Buffer: establishing both the DLPack and the Python-level buffer protocols. + Parameters + ---------- + ptr : Any + Allocated buffer handle object + size : Any + Memory size of the buffer + mr : :obj:`MemoryResource`, optional + Memory resource associated with the buffer + """ # TODO: handle ownership? (_mr could be None) __slots__ = ("_ptr", "_size", "_mr",) def __init__(self, ptr, size, mr: MemoryResource=None): - """Initialize a new buffer object. - - Parameters - ---------- - ptr : Any - Allocated buffer handle object - size : Any - Memory size of the buffer - mr : :obj:`MemoryResource`, optional - Memory resource associated with the buffer - - """ self._ptr = ptr self._size = size self._mr = mr diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index b7802633..b587d813 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -46,17 +46,19 @@ def _lazy_init(): class Kernel: - """Represents a compiled kernel that had been loaded onto the device. + """Represent a compiled kernel that had been loaded onto the device. Kernel instances can execution when passed directly into a launch function. + Directly creating a :obj:`Kernel` is not supported, and they + should instead be created through a :obj:`ObjectCode` object. 
+ """ __slots__ = ("_handle", "_module",) def __init__(self): - """Unsupported function whose creation is intended through an :obj:`ObjectCode` object.""" raise NotImplementedError("directly constructing a Kernel instance is not supported") @staticmethod @@ -72,11 +74,34 @@ def _from_obj(obj, mod): class ObjectCode: - """Represents the compiled program loaded onto the device. + """Represent a compiled program that was loaded onto the device. This object provides a unified interface for different types of compiled programs that are loaded onto the device. + Loads the module library with specified module code and JIT options. + + Note + ---- + Usage under CUDA 11.x will only load to the current device + context. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the module to load, or + a file path string containing that module for loading. + code_type : Any + String of the compiled type. + Supported options are "ptx", "cubin" and "fatbin". + jit_options : Optional + Mapping of JIT options to use during module loading. + (Default to no options) + symbol_mapping : Optional + Keyword argument dictionary specifying how symbol names + should be mapped before trying to retrieve them. + (Default to no mappings) + """ __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map") @@ -84,37 +109,6 @@ class ObjectCode: def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None): - """Create and return a compiled program as an instance of an :obj:`ObjectCode`. - - Loads the module library with specified module code and JIT options. - - Note - ---- - Usage under CUDA 11.x will only load to the current device - context. - - Parameters - ---------- - module : Union[bytes, str] - Either a bytes object containing the module to load, or - a file path string containing that module for loading. - code_type : Any - String of the compiled type. - Supported options are "ptx", "cubin" and "fatbin". 
- jit_options : Optional - Mapping of JIT options to use during module loading. - (Default to no options) - symbol_mapping : Optional - Keyword argument dictionary specifying how symbol names - should be mapped before trying to retrieve them. - (Default to no mappings) - - Returns - ------- - :obj:`ObjectCode` - Newly created :obj:`ObjectCode`. - - """ if code_type not in self._supported_code_type: raise ValueError _lazy_init() diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 9123fdee..a59324f8 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -8,12 +8,19 @@ class Program: - """Represents the compilation machinery for processing programs into :obj:`ObjectCode`. + """Represent a compilation machinery to process programs into :obj:`ObjectCode`. This object provides a unified interface to multiple underlying compiler libraries. Compilation support is enabled for a wide range of code types and compilation types. + Parameters + ---------- + code : Any + String of the CUDA Runtime Compilation program. + code_type : Any + String of the code type. Only "c++" is currently supported. + """ __slots__ = ("_handle", "_backend", ) @@ -21,21 +28,6 @@ class Program: _supported_target_type = ("ptx", "cubin", "ltoir", ) def __init__(self, code, code_type): - """Create an instance of a :obj:`Program` object. - - Parameters - ---------- - code : Any - String of the CUDA Runtime Compilation program. - code_type : Any - String of the code type. Only "c++" is currently supported. - - Returns - ------- - :obj:`Program` - Newly created program object. 
- - """ self._handle = None if code_type not in self._supported_code_type: raise NotImplementedError diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 01f4fd1f..7f50dafd 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -36,7 +36,7 @@ class StreamOptions: class Stream: - """Represents a queue of GPU operations that are executed in a specific order. + """Represent a queue of GPU operations that are executed in a specific order. Applications use streams to control the order of execution for GPU work. Work within a single stream are executed sequentially. @@ -46,19 +46,17 @@ class Stream: Advanced users can utilize default streams for enforce complex implicit synchronization behaviors. + Directly creating a :obj:`Stream` is not supported due to ambiguity. + New streams should instead be created through a :obj:`Device` + object, or created directly through using an existing handle + using Stream.from_handle(). + """ __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin", "_device_id", "_ctx_handle") def __init__(self): - """Unsupported function due to ambiguity. 
- - New streams should instead be created through a :obj:`Device` - object, or created directly through using an existing handle - using Stream.from_handle() - - """ # minimal requirements for the destructor self._handle = None self._owner = None From 7e256885bfe2b622c8dce79c96ddcda8b891ea1b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 6 Nov 2024 04:26:55 +0000 Subject: [PATCH 13/13] populate API references; clean up & polish docs --- cuda_core/cuda/core/experimental/_launcher.py | 10 +++---- cuda_core/cuda/core/experimental/_module.py | 4 +-- cuda_core/cuda/core/experimental/_program.py | 7 +++-- .../source/_templates/autosummary/class.rst | 26 +++++++++++++++++ .../_templates/autosummary/dataclass.rst | 10 +++++++ .../_templates/autosummary/namedtuple.rst | 8 ++++++ cuda_core/docs/source/api.rst | 13 +++++++++ cuda_core/docs/source/api_private.rst | 28 +++++++++++++++++++ cuda_core/docs/source/conf.py | 6 +++- cuda_core/docs/source/install.md | 20 +++++-------- cuda_core/docs/source/release/0.1.0-notes.md | 16 ++++++----- cuda_python/docs/environment-docs.yml | 1 + 12 files changed, 117 insertions(+), 32 deletions(-) create mode 100644 cuda_core/docs/source/_templates/autosummary/class.rst create mode 100644 cuda_core/docs/source/_templates/autosummary/dataclass.rst create mode 100644 cuda_core/docs/source/_templates/autosummary/namedtuple.rst create mode 100644 cuda_core/docs/source/api_private.rst diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py index 2787a718..9991638f 100644 --- a/cuda_core/cuda/core/experimental/_launcher.py +++ b/cuda_core/cuda/core/experimental/_launcher.py @@ -103,16 +103,14 @@ def _cast_to_3_tuple(self, cfg): def launch(kernel, config, *kernel_args): - """Launches a :obj:`Kernel` object with launch-time configuration. - - Invokes a :obj:`Kernel` object with specified launch-time - configurations. 
+ """Launches a :obj:`~cuda.core.experimental._module.Kernel` + object with launch-time configuration. Parameters ---------- - kernel : :obj:`Kernel` + kernel : :obj:`~cuda.core.experimental._module.Kernel` Kernel to launch. - config : Any + config : :obj:`LaunchConfig` Launch configurations inline with options provided by :obj:`LaunchConfig` dataclass. *kernel_args : Any diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index b587d813..8b0ff9a7 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -48,8 +48,8 @@ def _lazy_init(): class Kernel: """Represent a compiled kernel that had been loaded onto the device. - Kernel instances can execution when passed directly into a - launch function. + Kernel instances can execution when passed directly into the + :func:`~cuda.core.experimental.launch` function. Directly creating a :obj:`Kernel` is not supported, and they should instead be created through a :obj:`ObjectCode` object. diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index a59324f8..5439c74a 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -8,7 +8,8 @@ class Program: - """Represent a compilation machinery to process programs into :obj:`ObjectCode`. + """Represent a compilation machinery to process programs into + :obj:`~cuda.core.experimental._module.ObjectCode`. This object provides a unified interface to multiple underlying compiler libraries. Compilation support is enabled for a wide @@ -19,7 +20,7 @@ class Program: code : Any String of the CUDA Runtime Compilation program. code_type : Any - String of the code type. Only "c++" is currently supported. + String of the code type. Currently only ``"c++"`` is supported. 
""" @@ -74,7 +75,7 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): Returns ------- - :obj:`ObjectCode` + :obj:`~cuda.core.experimental._module.ObjectCode` Newly created code object. """ diff --git a/cuda_core/docs/source/_templates/autosummary/class.rst b/cuda_core/docs/source/_templates/autosummary/class.rst new file mode 100644 index 00000000..b45a3fd5 --- /dev/null +++ b/cuda_core/docs/source/_templates/autosummary/class.rst @@ -0,0 +1,26 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + + {% block methods %} + {% if methods %} + .. rubric:: {{ _('Methods') }} + + {% for item in methods %} + .. automethod:: {{ item }} + {%- endfor %} + + {% endif %} + {% endblock %} + + {% block attributes %} + {% if attributes %} + .. rubric:: {{ _('Attributes') }} + + {% for item in attributes %} + .. autoattribute:: {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} diff --git a/cuda_core/docs/source/_templates/autosummary/dataclass.rst b/cuda_core/docs/source/_templates/autosummary/dataclass.rst new file mode 100644 index 00000000..b8c35324 --- /dev/null +++ b/cuda_core/docs/source/_templates/autosummary/dataclass.rst @@ -0,0 +1,10 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + + {% block methods %} + .. automethod:: __init__ + {% endblock %} + diff --git a/cuda_core/docs/source/_templates/autosummary/namedtuple.rst b/cuda_core/docs/source/_templates/autosummary/namedtuple.rst new file mode 100644 index 00000000..d3ad7d24 --- /dev/null +++ b/cuda_core/docs/source/_templates/autosummary/namedtuple.rst @@ -0,0 +1,8 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. 
autoclass:: {{ objname }} + :members: __new__ + :special-members: __new__ + :exclude-members: count, index, __reduce__, __reduce_ex__, __repr__, __hash__, __str__, __getnewargs__ \ No newline at end of file diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 756ed776..1cb9811b 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -3,6 +3,11 @@ ``cuda.core.experimental`` API Reference ======================================== +All of the APIs listed (or cross-referenced from) below are considered *experimental* +and subject to future changes without deprecation notice. Once stablized they will be +moved out of the ``experimental`` namespace. + + CUDA runtime ------------ @@ -10,6 +15,14 @@ CUDA runtime :toctree: generated/ Device + launch + + :template: dataclass.rst + + EventOptions + StreamOptions + LaunchConfig + CUDA compilation toolchain -------------------------- diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst new file mode 100644 index 00000000..f100eb7c --- /dev/null +++ b/cuda_core/docs/source/api_private.rst @@ -0,0 +1,28 @@ +:orphan: + +.. This page is to generate documentation for private classes exposed to users, + i.e., users cannot instantiate it by themselves but may use it's properties + or methods via returned values from public APIs. These classes must be referred + in public APIs returning their instances. + +.. currentmodule:: cuda.core.experimental + +CUDA runtime +------------ + +.. autosummary:: + :toctree: generated/ + + _memory.Buffer + _stream.Stream + _event.Event + + +CUDA compilation toolchain +-------------------------- + +.. 
autosummary:: + :toctree: generated/ + + _module.Kernel + _module.ObjectCode diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index 4be77656..5b28d331 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -34,7 +34,8 @@ 'sphinx.ext.autosummary', 'sphinx.ext.napoleon', 'myst_nb', - 'enum_tools.autoenum' + 'enum_tools.autoenum', + 'sphinx_copybutton', ] # Add any paths that contain templates here, relative to this directory. @@ -77,3 +78,6 @@ # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] + +# skip cmdline prompts +copybutton_exclude = '.linenos, .gp' diff --git a/cuda_core/docs/source/install.md b/cuda_core/docs/source/install.md index 593f7225..e13f37df 100644 --- a/cuda_core/docs/source/install.md +++ b/cuda_core/docs/source/install.md @@ -5,25 +5,19 @@ `cuda.core` is supported on all platforms that CUDA is supported. Specific dependencies are as follows: -* Driver: Linux (450.80.02 or later) Windows (456.38 or later) -* CUDA Toolkit 12.0 to 12.6 +| | CUDA 11 | CUDA 12 | +|------------------ | ------------ | ----------- | +| CUDA Toolkit [^1] | 11.2 - 11.8 | 12.0 - 12.6 | +| Driver | 450.80.02+ (Linux), 452.39+ (Windows) | 525.60.13+ (Linux), 527.41+ (Windows) | - -## Installing from PyPI - -Coming soon! - - -## Installing from Conda - -Coming soon! +[^1]: Including `cuda-python`. ## Installing from Source -```shell +```console $ git clone https://github.com/NVIDIA/cuda-python $ cd cuda-python/cuda_core $ pip install . ``` -For now `cuda-python` (`cuda-bindings` later) is a required dependency. +For now `cuda-python` (`cuda-bindings` later) 11.x or 12.x is a required dependency. 
diff --git a/cuda_core/docs/source/release/0.1.0-notes.md b/cuda_core/docs/source/release/0.1.0-notes.md index 2247e0cb..1ebb41f9 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.md +++ b/cuda_core/docs/source/release/0.1.0-notes.md @@ -1,15 +1,17 @@ -# ``cuda.core`` Release notes +# `cuda.core` Release notes -Released on Oct XX, 2024 +Released on Nov XX, 2024 ## Hightlights -- Initial beta 1 release +- Initial EA1 (early access) release - Supports all platforms that CUDA is supported -- Supports all CUDA 12.x drivers -- Supports all CUDA 12.x Toolkits +- Supports all CUDA 11.x/12.x drivers +- Supports all CUDA 11.x/12.x Toolkits - Pythonic CUDA runtime and other core functionalities ## Limitations -- Source code release only; Python packages coming in a future release -- Support for CUDA 11.x coming in the next release +- All APIs are currently *experimental* and subject to change without deprecation notice. + Please kindly share your feedbacks with us so that we can make `cuda.core` better! +- Source code release only; `pip`/`conda` support is coming in a future release +- Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206) diff --git a/cuda_python/docs/environment-docs.yml b/cuda_python/docs/environment-docs.yml index 2a3a8ad3..bc9588fd 100644 --- a/cuda_python/docs/environment-docs.yml +++ b/cuda_python/docs/environment-docs.yml @@ -11,6 +11,7 @@ dependencies: - pytest - scipy - sphinx + - sphinx-copybutton - pip: - furo - myst-nb