From 6397752fe90493abd324981d9551383184ebb827 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Wed, 23 Oct 2024 14:04:00 -0700 Subject: [PATCH 01/13] Add docstrings to cuda.core for Device class Use NumPy Style Python Docstrings to maintain consistency with cuda-bindings. --- cuda_core/cuda/core/experimental/_device.py | 172 ++++++++++++++++++-- 1 file changed, 155 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 65d5fe9b..7e321b20 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -19,10 +19,37 @@ class Device: + """Represents a GPU and acts as an entry point for cuda.core features. + This is a singleton object that helps ensure interoperability + across multiple libraries imported in the process to both see + and use the same GPU device. + + While acting as the entry point, many other CUDA resources can be + allocated such as streams and buffers. Any :obj:`Context` dependent + resource created through this device, will continue to refer to + this device's context. + + """ __slots__ = ("_id", "_mr", "_has_inited") def __new__(cls, device_id=None): + """Create and return a singleton :obj:`Device` object. + + Creates and returns a thread-local singleton :obj:`Device` object + corresponding to a specific device. + + Note + ---- + Will not initialize the GPU. + + Parameters + ---------- + device_id : int, optional + Device ordinal to return a :obj:`Device` object for. + Default value of `None` return the currently used device. + + """ # important: creating a Device instance does not initialize the GPU! 
if device_id is None: device_id = handle_return(cudart.cudaGetDevice()) @@ -54,15 +81,24 @@ def _check_context_initialized(self, *args, **kwargs): @property def device_id(self) -> int: + """Returns device ordinal.""" return self._id @property def pci_bus_id(self) -> str: + """Returns a PCI Bus Id string for this device.""" bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id)) return bus_id[:12].decode() @property def uuid(self) -> str: + """Returns a UUID for the device. + + Returns 16-octets identifying the device. If the device is in + MIG mode, returns its MIG UUID which uniquely identifies the + subscribed MIG compute instance. + + """ driver_ver = handle_return(cuda.cuDriverGetVersion()) if driver_ver >= 11040: uuid = handle_return(cuda.cuDeviceGetUuid_v2(self._id)) @@ -74,19 +110,21 @@ def uuid(self) -> str: @property def name(self) -> str: - # assuming a GPU name is less than 128 characters... - name = handle_return(cuda.cuDeviceGetName(128, self._id)) + """Returns the device name.""" + # CUDA Runtime uses up to 256 characters, use the same for consistency + name = handle_return(cuda.cuDeviceGetName(256, self._id)) name = name.split(b'\0')[0] return name.decode() @property def properties(self) -> dict: + """Returns information about the compute-device.""" # TODO: pythonize the key names return handle_return(cudart.cudaGetDeviceProperties(self._id)) @property def compute_capability(self) -> ComputeCapability: - """Returns a named tuple with 2 fields: major and minor. """ + """Returns a named tuple with 2 fields: major and minor.""" major = handle_return(cudart.cudaDeviceGetAttribute( cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)) minor = handle_return(cudart.cudaDeviceGetAttribute( @@ -96,12 +134,20 @@ def compute_capability(self) -> ComputeCapability: @property @precondition(_check_context_initialized) def context(self) -> Context: + """Returns the current :obj:`Context` associated with this device. 
+ + Note + ---- + Device must be initialized. + + """ ctx = handle_return(cuda.cuCtxGetCurrent()) assert int(ctx) != 0 return Context._from_ctx(ctx, self._id) @property def memory_resource(self) -> MemoryResource: + """Returns :obj:`MemoryResource` associated with this device.""" return self._mr @memory_resource.setter @@ -112,27 +158,50 @@ def memory_resource(self, mr): @property def default_stream(self) -> Stream: + """Returns default CUDA :obj:`Stream` associated with this device. + + Returns per-thread default stream if environment is set with + CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM, otherwise return a legacy stream. + + """ return default_stream() def __int__(self): + """Returns device_id.""" return self._id def __repr__(self): return f"" def set_current(self, ctx: Context=None) -> Union[Context, None]: - """ - Entry point of this object. Users always start a code by + """Set device to be used for GPU executions. + + Initializes CUDA and sets the calling thread to a valid CUDA + context. By default the primary context is used, but optional `ctx` + parameter can be used to explicitly supply a :obj:`Context` object. + + Providing a `ctx` causes the previous set context to be popped and returned. + + Parameters + ---------- + ctx : Context, optional + Optional context to push onto this device's current thread stack + + Returns + ------- + Union[Context, None], optional + Popped context + + Examples + -------- + Acts as an entry point of this object. Users always start a code by calling this method, e.g. - + >>> from cuda.core.experimental import Device >>> dev0 = Device(0) >>> dev0.set_current() >>> # ... do work on device 0 ... - - The optional ctx argument is for advanced users to bind a - CUDA context with the device. In this case, the previously - set context is popped and returned to the user. 
+ """ if ctx is not None: if not isinstance(ctx, Context): @@ -163,25 +232,94 @@ def set_current(self, ctx: Context=None) -> Union[Context, None]: self._has_inited = True def create_context(self, options: ContextOptions = None) -> Context: - # Create a Context object (but do NOT set it current yet!). - # ContextOptions is a dataclass for setting e.g. affinity or CIG - # options. + """Create a new :obj:`Context` object. + + Note + ---- + The newly context will not be set as current. + + Parameters + ---------- + options : ContextOptions, optional + Customizable dataclass for context creation options + + Returns + ------- + :obj:`Context` + Newly created Context object + + """ raise NotImplementedError("TODO") @precondition(_check_context_initialized) def create_stream(self, obj=None, options: StreamOptions=None) -> Stream: - # Create a Stream object by either holding a newly created - # CUDA stream or wrapping an existing foreign object supporting - # the __cuda_stream__ protocol. In the latter case, a reference - # to obj is held internally so that its lifetime is managed. + """Create a Stream object. + + New stream objects can be created in two different ways: + + 1) Create a new CUDA stream with customizable `options`. + 2) Wrap an existing foreign `obj` supporting the __cuda_stream__ protocol. + + Option (2) internally holds a reference to the foreign object + such that the lifetime is managed. + + Note + ---- + Device must be initialized. + + Parameters + ---------- + obj : Any, optional + Any object supporting the __cuda_stream__ protocol. + options : :obj:`StreamOptions`, optional + Customizable dataclass for stream creation options + + Returns + ------- + :obj:`Stream` + Newly created Stream object + + """ return Stream._init(obj=obj, options=options) @precondition(_check_context_initialized) def allocate(self, size, stream=None) -> Buffer: + """Allocate device memory from a specified stream. 
+ + Allocates device memory of `size` bytes on the specified `stream` + using the memory resource currently associated with this Device. + + Parameter `stream` is optional, using a default stream by default. + + Note + ---- + Device must be initialized. + + Parameters + ---------- + size : int + Number of bytes to allocate. + stream : :obj:`Stream`, optional + The stream establishing the stream ordering semantic. + Default value of `None` uses default stream. + + Returns + ------- + :obj:`Buffer` + Newly created Buffer object + + """ if stream is None: stream = default_stream() return self._mr.allocate(size, stream) @precondition(_check_context_initialized) def sync(self): + """Synchronize the device. + + Note + ---- + Device must be initialized. + + """ handle_return(cudart.cudaDeviceSynchronize()) From 9e9816084b2db9659d06fdae8035c57343caee8c Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Wed, 23 Oct 2024 14:06:32 -0700 Subject: [PATCH 02/13] Use "Return" wording to be consistent with default --- cuda_core/cuda/core/experimental/_device.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 7e321b20..9380c80b 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -81,18 +81,18 @@ def _check_context_initialized(self, *args, **kwargs): @property def device_id(self) -> int: - """Returns device ordinal.""" + """Return device ordinal.""" return self._id @property def pci_bus_id(self) -> str: - """Returns a PCI Bus Id string for this device.""" + """Return a PCI Bus Id string for this device.""" bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id)) return bus_id[:12].decode() @property def uuid(self) -> str: - """Returns a UUID for the device. + """Return a UUID for the device. Returns 16-octets identifying the device. 
If the device is in MIG mode, returns its MIG UUID which uniquely identifies the @@ -110,7 +110,7 @@ def uuid(self) -> str: @property def name(self) -> str: - """Returns the device name.""" + """Return the device name.""" # CUDA Runtime uses up to 256 characters, use the same for consistency name = handle_return(cuda.cuDeviceGetName(256, self._id)) name = name.split(b'\0')[0] @@ -118,13 +118,13 @@ def name(self) -> str: @property def properties(self) -> dict: - """Returns information about the compute-device.""" + """Return information about the compute-device.""" # TODO: pythonize the key names return handle_return(cudart.cudaGetDeviceProperties(self._id)) @property def compute_capability(self) -> ComputeCapability: - """Returns a named tuple with 2 fields: major and minor.""" + """Return a named tuple with 2 fields: major and minor.""" major = handle_return(cudart.cudaDeviceGetAttribute( cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)) minor = handle_return(cudart.cudaDeviceGetAttribute( @@ -134,7 +134,7 @@ def compute_capability(self) -> ComputeCapability: @property @precondition(_check_context_initialized) def context(self) -> Context: - """Returns the current :obj:`Context` associated with this device. + """Return the current :obj:`Context` associated with this device. Note ---- @@ -147,7 +147,7 @@ def context(self) -> Context: @property def memory_resource(self) -> MemoryResource: - """Returns :obj:`MemoryResource` associated with this device.""" + """Return :obj:`MemoryResource` associated with this device.""" return self._mr @memory_resource.setter @@ -158,7 +158,7 @@ def memory_resource(self, mr): @property def default_stream(self) -> Stream: - """Returns default CUDA :obj:`Stream` associated with this device. + """Return default CUDA :obj:`Stream` associated with this device. Returns per-thread default stream if environment is set with CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM, otherwise return a legacy stream. 
@@ -167,7 +167,7 @@ def default_stream(self) -> Stream: return default_stream() def __int__(self): - """Returns device_id.""" + """Return device_id.""" return self._id def __repr__(self): From f73876d9ebc23ca930bc687089a76ad5d3174ab6 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Wed, 23 Oct 2024 17:09:41 -0700 Subject: [PATCH 03/13] Add docstrings to cuda.core for Event and EventOptions class --- cuda_core/cuda/core/experimental/_event.py | 56 ++++++++++++++++++---- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index 3c85d9fe..36c76558 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -13,17 +13,48 @@ @dataclass class EventOptions: + """Customizable :obj:`Event` options. + + Attributes + ---------- + enable_timing : bool, optional + Event will record timing data. (Default to False) + busy_waited_sync : bool, optional + If True, event will use blocking synchronization. When a CPU + thread calls synchronize, the call will block until the event + has actually been completed. + Otherwise, the CPU thread will busy-wait until the event has + been completed. (Default to False) + support_ipc : bool, optional + Event will be suitable for interprocess use. + Note that enable_timing must be False. (Default to False) + + """ enable_timing: Optional[bool] = False busy_waited_sync: Optional[bool] = False support_ipc: Optional[bool] = False class Event: + """Represents a record of a specific point of execution within a CUDA stream. + Applications can asynchronously record events at any point in + the program. An event keeps a record of all previous work withinq + the last recorded stream. + + Events can be used to monitor device's progress, query completion + of work up to event's record, and help establish dependencies + between GPU work submissions. 
+ + """ __slots__ = ("_handle", "_timing_disabled", "_busy_waited") def __init__(self): - # minimal requirements for the destructor + """Unsupported function due to ambiguity. + + New events should instead be created through a :obj:`Stream` object. + + """ self._handle = None raise NotImplementedError( "directly creating an Event object can be ambiguous. Please call " @@ -51,37 +82,45 @@ def _init(options: Optional[EventOptions]=None): return self def __del__(self): + """Destroys the event.""" self.close() def close(self): - # Destroy the event. + """Destroys the event.""" if self._handle: handle_return(cuda.cuEventDestroy(self._handle)) self._handle = None @property def is_timing_disabled(self) -> bool: - # Check if this instance can be used for the timing purpose. + """Return True if the event does not record timing data, otherwise False.""" return self._timing_disabled @property def is_sync_busy_waited(self) -> bool: - # Check if the event synchronization would keep the CPU busy-waiting. + """Return True if the event synchronization would keep the CPU busy-waiting, otherwise False.""" return self._busy_waited @property def is_ipc_supported(self) -> bool: - # Check if this instance can be used for IPC. + """Return True if this event can be used as an interprocess event, otherwise False.""" raise NotImplementedError("TODO") def sync(self): - # Sync over the event. + """Synchronize until the event completes. + + If the event was created with busy_waited_sync, then the + calling CPU thread will block until the event has been + completed by the device. + Otherwise the CPU thread will busy-wait until the event + has been completed. + + """ handle_return(cuda.cuEventSynchronize(self._handle)) @property def is_done(self) -> bool: - # Return True if all captured works have been completed, - # otherwise False. 
+ """Return True if all captured works have been completed, otherwise False.""" result, = cuda.cuEventQuery(self._handle) if result == cuda.CUresult.CUDA_SUCCESS: return True @@ -92,4 +131,5 @@ def is_done(self) -> bool: @property def handle(self) -> int: + """Return event memory address.""" return int(self._handle) From e739e92cd022b83498b662632c41e6799134afc0 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Wed, 23 Oct 2024 17:12:04 -0700 Subject: [PATCH 04/13] Fix typo --- cuda_core/cuda/core/experimental/_event.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index 36c76558..83b0dfdb 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -39,7 +39,7 @@ class Event: """Represents a record of a specific point of execution within a CUDA stream. Applications can asynchronously record events at any point in - the program. An event keeps a record of all previous work withinq + the program. An event keeps a record of all previous work within the last recorded stream. 
Events can be used to monitor device's progress, query completion From 56efb3988506a96fb6ff105de8096c8d840e8e56 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Mon, 28 Oct 2024 15:20:13 -0700 Subject: [PATCH 05/13] Push saved work for device event and stream --- cuda_core/cuda/core/experimental/_device.py | 27 +++-- cuda_core/cuda/core/experimental/_event.py | 4 +- cuda_core/cuda/core/experimental/_stream.py | 127 +++++++++++++++++--- 3 files changed, 127 insertions(+), 31 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 9380c80b..f91d3457 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -160,8 +160,11 @@ def memory_resource(self, mr): def default_stream(self) -> Stream: """Return default CUDA :obj:`Stream` associated with this device. - Returns per-thread default stream if environment is set with - CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM, otherwise return a legacy stream. + The type of default stream returned depends on if the environment + variable CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM is set. + + If set, returns a per-thread default stream. Otherwise returns + the legacy stream. """ return default_stream() @@ -184,13 +187,13 @@ def set_current(self, ctx: Context=None) -> Union[Context, None]: Parameters ---------- - ctx : Context, optional - Optional context to push onto this device's current thread stack + ctx : :obj:`Context`, optional + Optional context to push onto this device's current thread stack. Returns ------- - Union[Context, None], optional - Popped context + Union[:obj:`Context`, None], optional + Popped context. 
Examples -------- @@ -240,13 +243,13 @@ def create_context(self, options: ContextOptions = None) -> Context: Parameters ---------- - options : ContextOptions, optional - Customizable dataclass for context creation options + options : :obj:`ContextOptions`, optional + Customizable dataclass for context creation options. Returns ------- :obj:`Context` - Newly created Context object + Newly created Context object. """ raise NotImplementedError("TODO") @@ -272,12 +275,12 @@ def create_stream(self, obj=None, options: StreamOptions=None) -> Stream: obj : Any, optional Any object supporting the __cuda_stream__ protocol. options : :obj:`StreamOptions`, optional - Customizable dataclass for stream creation options + Customizable dataclass for stream creation options. Returns ------- :obj:`Stream` - Newly created Stream object + Newly created stream object. """ return Stream._init(obj=obj, options=options) @@ -306,7 +309,7 @@ def allocate(self, size, stream=None) -> Buffer: Returns ------- :obj:`Buffer` - Newly created Buffer object + Newly created Buffer object. 
""" if stream is None: diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index 83b0dfdb..4805e093 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -82,7 +82,7 @@ def _init(options: Optional[EventOptions]=None): return self def __del__(self): - """Destroys the event.""" + """Return close(self)""" self.close() def close(self): @@ -131,5 +131,5 @@ def is_done(self) -> bool: @property def handle(self) -> int: - """Return event memory address.""" + """Return the underlying cudaEvent_t pointer address as Python int.""" return int(self._handle) diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 95f8ec50..e915f01f 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -20,17 +20,45 @@ @dataclass class StreamOptions: + """Customizable :obj:`Stream` options. + Attributes + ---------- + nonblocking : bool, optional + Stream does not synchronize with the NULL stream. (Default to True) + priority : int, optional + Stream priority where lower number represents a + higher priority. (Default to highest priority) + + """ nonblocking: bool = True priority: Optional[int] = None class Stream: + """Represents a queue of GPU operations that are executed in a specific order. + + Applications use streams to control the order of execution for + GPU work. Work within a single stream are executed sequentially. + Whereas work across multiple streams can be further controlled + using stream priorities and :obj:`Event` managements. + + Advanced users can utilize default streams for enforce complex + implicit synchronization behaviors. + + """ __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin", "_device_id", "_ctx_handle") def __init__(self): + """Unsupported function due to ambiguity. 
+ + New streams should instead be created through a :obj:`Device` + object, or created directly through using an existing handle + using Stream.from_handle() + + """ # minimal requirements for the destructor self._handle = None self._owner = None @@ -74,13 +102,12 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None): else: flags = cuda.CUstream_flags.CU_STREAM_DEFAULT + high, low = handle_return(cudart.cudaDeviceGetStreamPriorityRange()) if priority is not None: - high, low = handle_return( - cudart.cudaDeviceGetStreamPriorityRange()) if not (low <= priority <= high): raise ValueError(f"{priority=} is out of range {[low, high]}") else: - priority = 0 + priority = high self._handle = handle_return( cuda.cuStreamCreateWithPriority(flags, priority)) @@ -94,9 +121,16 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None): return self def __del__(self): + """Return close(self).""" self.close() def close(self): + """Destroy the stream. + + Destroys the stream if we own it. Borrowed foreign stream + object will instead have their references released. + + """ if self._owner is None: if self._handle and not self._builtin: handle_return(cuda.cuStreamDestroy(self._handle)) @@ -106,15 +140,17 @@ def close(self): @property def __cuda_stream__(self) -> Tuple[int, int]: + """Return an instance of a __cuda_stream__ protocol.""" return (0, int(self._handle)) @property def handle(self) -> int: - # Return the underlying cudaStream_t pointer address as Python int. 
+ """Return the underlying cudaStream_t pointer address as Python int.""" return int(self._handle) @property def is_nonblocking(self) -> bool: + """Return True if this is a nonblocking stream, otherwise False.""" if self._nonblocking is None: flag = handle_return(cuda.cuStreamGetFlags(self._handle)) if flag == cuda.CUstream_flags.CU_STREAM_NON_BLOCKING: @@ -125,15 +161,35 @@ def is_nonblocking(self) -> bool: @property def priority(self) -> int: + """Return the stream priority.""" if self._priority is None: prio = handle_return(cuda.cuStreamGetPriority(self._handle)) self._priority = prio return self._priority def sync(self): + """Synchronize the stream.""" handle_return(cuda.cuStreamSynchronize(self._handle)) def record(self, event: Event=None, options: EventOptions=None) -> Event: + """Record an event onto the stream. + + Creates an Event object (or reuses the given one) by + recording on the stream. + + Parameters + ---------- + event : :obj:`Event`, optional + Optional event object to be reused for recording. + options : :obj:`EventOptions`, optional + Customizable dataclass for event creation options. + + Returns + ------- + :obj:`Event` + Newly created event object. + + """ # Create an Event object (or reusing the given one) by recording # on the stream. Event flags such as disabling timing, nonblocking, # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. @@ -145,11 +201,15 @@ def record(self, event: Event=None, options: EventOptions=None) -> Event: return event def wait(self, event_or_stream: Union[Event, Stream]): - # Wait for a CUDA event or a CUDA stream to establish a stream order. - # - # If a Stream instance is provided, the effect is as if an event is - # recorded on the given stream, and then self waits on the recorded - # event. + """Wait for a CUDA event or a CUDA stream. + + Waiting for an event or a stream establishes a stream order. + + If a :obj:`Stream` is provided, then wait until the stream's + work is completed. 
This is done by recording a new :obj:`Event` + on the stream and then waiting on it. + + """ if isinstance(event_or_stream, Event): event = event_or_stream.handle discard_event = False @@ -175,12 +235,15 @@ def wait(self, event_or_stream: Union[Event, Stream]): @property def device(self) -> Device: - # Inverse look-up to find on which device this stream instance was - # created. - # - # Note that Stream.device.context might not necessarily agree with - # Stream.context, in cases where a different CUDA context is set - # current after a stream was created. + """Return :obj:`Device` singleton associated with this stream. + + Note + ---- + The current context on the device may differ from this + stream's context. This case occurs when a different CUDA + context is set current after a stream is created. + + """ from cuda.core.experimental._device import Device # avoid circular import if self._device_id is None: # Get the stream context first @@ -192,8 +255,7 @@ def device(self) -> Device: @property def context(self) -> Context: - # Inverse look-up to find in which CUDA context this stream instance - # was created + """Return the :obj:`Context` associated with this stream.""" if self._ctx_handle is None: self._ctx_handle = handle_return( cuda.cuStreamGetCtx(self._handle)) @@ -203,6 +265,28 @@ def context(self) -> Context: @staticmethod def from_handle(handle: int) -> Stream: + """Create a new :obj:`Stream` object from a foreign stream handle. + + Uses a cudaStream_t pointer address represented as a Python int + to create a new :obj:`Stream` object. + + Note + ---- + Stream lifetime is not managed, foreign object must remain + alive while this stream is active. + + Parameters + ---------- + handle : int + Stream handle representing the address of a foreign + stream object. + + Returns + ------- + :obj:`Stream` + Newly created stream object.
+ + """ class _stream_holder: @property def __cuda_stream__(self): @@ -235,6 +319,15 @@ def __init__(self): def default_stream(): + """Return the default CUDA :obj:`Stream`. + + The type of default stream returned depends on if the environment + variable CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM is set. + + If set, returns a per-thread default stream. Otherwise returns + the legacy stream. + + """ # TODO: flip the default use_ptds = int(os.environ.get('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', 0)) if use_ptds: From b5ec7ba225feb6846acb2e5731ffb11b52d5a175 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Tue, 29 Oct 2024 08:33:38 -0700 Subject: [PATCH 06/13] Correct wording of what default priority is --- cuda_core/cuda/core/experimental/_stream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index e915f01f..c607adea 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -28,7 +28,7 @@ class StreamOptions: Stream does not synchronize with the NULL stream. (Default to True) priority : int, optional Stream priority where lower number represents a - higher priority. (Default to highest priority) + higher priority. (Default to lowest priority) """ nonblocking: bool = True From 6b4b6e0e49154e5f91143419d3d3760f11bbb1c4 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Tue, 29 Oct 2024 10:32:03 -0700 Subject: [PATCH 07/13] Add docstrings to Buffer --- cuda_core/cuda/core/experimental/_memory.py | 82 ++++++++++++++++++--- 1 file changed, 71 insertions(+), 11 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 4ef2cbc3..99d1e129 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -22,19 +22,56 @@ class Buffer: + """Represents a handle to allocated memory. 
+ + This generic object provides a unified representation for how + different memory resources are to give access to their memory + allocations. + + Support for data interchange mechanisms are provided by + establishing both the DLPack and the Python-level buffer + protocols. + + """ # TODO: handle ownership? (_mr could be None) __slots__ = ("_ptr", "_size", "_mr",) def __init__(self, ptr, size, mr: MemoryResource=None): + """Initialize a new buffer object. + + Parameters + ---------- + ptr : Any + Allocated buffer handle object + size : Any + Memory size of the buffer + mr : :obj:`MemoryResource`, optional + Memory resource associated with the buffer + + """ self._ptr = ptr self._size = size self._mr = mr def __del__(self): - self.close(default_stream()) + """Return close(self).""" + self.close() def close(self, stream=None): + """Deallocate this buffer asynchronously on the given stream. + + This buffer is released back to their memory resource + asynchronously on the given stream. + + Parameters + ---------- + stream : Any, optional + The stream object with a __cuda_stream__ protocol to + use for asynchronous deallocation. Defaults to using + the default stream. + + """ if self._ptr and self._mr is not None: if stream is None: stream = default_stream() @@ -44,42 +81,56 @@ def close(self, stream=None): @property def handle(self): + """Return buffer handle object.""" return self._ptr @property def size(self): + """Return the memory size of this buffer.""" return self._size @property def memory_resource(self) -> MemoryResource: - # Return the memory resource from which this buffer was allocated. + """Return the memory resource associated with this buffer.""" return self._mr @property def is_device_accessible(self) -> bool: - # Check if this buffer can be accessed from GPUs. 
+ """Return True if this buffer can be accessed by the GPU, otherwise False.""" if self._mr is not None: return self._mr.is_device_accessible raise NotImplementedError @property def is_host_accessible(self) -> bool: - # Check if this buffer can be accessed from CPUs. + """Return True if this buffer can be accessed by the CPU, otherwise False.""" if self._mr is not None: return self._mr.is_host_accessible raise NotImplementedError @property def device_id(self) -> int: + """Return the device ordinal of this buffer.""" if self._mr is not None: return self._mr.device_id raise NotImplementedError def copy_to(self, dst: Buffer=None, *, stream) -> Buffer: - # Copy from this buffer to the dst buffer asynchronously on the - # given stream. The dst buffer is returned. If the dst is not provided, - # allocate one from self.memory_resource. Raise an exception if the - # stream is not provided. + """Copy from this buffer to the dst buffer asynchronously on the given stream. + + Copies the data from this buffer to the provided dst buffer. + If the dst buffer is not provided, then a new buffer is first + allocated using the associated memory resource before the copy. + + Parameters + ---------- + dst : :obj:`Buffer` + Source buffer to copy data from + stream : Any + Keyword argument specifying the stream for the + asynchronous copy + + """ if stream is None: raise ValueError("stream must be provided") if dst is None: @@ -93,8 +144,17 @@ def copy_to(self, dst: Buffer=None, *, stream) -> Buffer: return dst def copy_from(self, src: Buffer, *, stream): - # Copy from the src buffer to this buffer asynchronously on the - # given stream. Raise an exception if the stream is not provided. + """Copy from the src buffer to this buffer asynchronously on the given stream. 
+ + Parameters + ---------- + src : :obj:`Buffer` + Source buffer to copy data from + stream : Any + Keyword argument specifying the stream for the + asynchronous copy + + """ if stream is None: raise ValueError("stream must be provided") if src._size != self._size: @@ -141,7 +201,7 @@ def __buffer__(self, flags: int, /) -> memoryview: raise NotImplementedError("TODO") def __release_buffer__(self, buffer: memoryview, /): - # Supporting methond paired with __buffer__. + # Supporting method paired with __buffer__. raise NotImplementedError("TODO") From 23c194168fad141e5649941b685e2d93decbb9ee Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Tue, 29 Oct 2024 10:45:47 -0700 Subject: [PATCH 08/13] Add comment about MIG UUID on old CUDA versions --- cuda_core/cuda/core/experimental/_device.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index f91d3457..8e8f7e31 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -98,6 +98,11 @@ def uuid(self) -> str: MIG mode, returns its MIG UUID which uniquely identifies the subscribed MIG compute instance. + Note + ---- + MIG UUID is only returned when device is in MIG mode and the + driver is CUDA 11.4 or newer.
+ """ driver_ver = handle_return(cuda.cuDriverGetVersion()) if driver_ver >= 11040: From b411fa6dbeb167bc91fe54c2119957062d0a4522 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Wed, 30 Oct 2024 14:55:27 -0700 Subject: [PATCH 09/13] Add docstring to more classes and functions Classes: - Program - ObjectCode - Kernel - LaunchConfig Functions: - launch --- cuda_core/cuda/core/experimental/_device.py | 4 +- cuda_core/cuda/core/experimental/_launcher.py | 33 ++++++++++- cuda_core/cuda/core/experimental/_memory.py | 2 +- cuda_core/cuda/core/experimental/_module.py | 57 +++++++++++++++++++ cuda_core/cuda/core/experimental/_program.py | 50 ++++++++++++++++ cuda_core/cuda/core/experimental/_stream.py | 2 +- 6 files changed, 143 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 8e8f7e31..785f2ed1 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -254,7 +254,7 @@ def create_context(self, options: ContextOptions = None) -> Context: Returns ------- :obj:`Context` - Newly created Context object. + Newly created context object. """ raise NotImplementedError("TODO") @@ -314,7 +314,7 @@ def allocate(self, size, stream=None) -> Buffer: Returns ------- :obj:`Buffer` - Newly created Buffer object. + Newly created buffer object. """ if stream is None: diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py index 4b9533cb..a3250227 100644 --- a/cuda_core/cuda/core/experimental/_launcher.py +++ b/cuda_core/cuda/core/experimental/_launcher.py @@ -37,7 +37,23 @@ def _lazy_init(): @dataclass class LaunchConfig: - """ + """Customizable launch options. + + Attributes + ---------- + grid : Union[tuple, int] + Collection of threads that will execute a kernel function. + block : Union[tuple, int] + Group of threads (Thread Block) that will execute on the same + multiprocessor. 
Threads within a thread blocks have access to + shared memory and can be explicitly synchronized. + stream : :obj:`Stream` + The stream establishing the stream ordering semantic of a + launch. + shmem_size : int, optional + Dynamic shared-memory size per thread block in bytes. + (Default to size 0) + """ # TODO: expand LaunchConfig to include other attributes grid: Union[tuple, int] = None @@ -87,6 +103,21 @@ def _cast_to_3_tuple(self, cfg): def launch(kernel, config, *kernel_args): + """Launches a :obj:`Kernel` object with launch-time configuration. + + Invokes a :obj:`Kernel` object with specified launch-time + configurations. + + Parameters + ---------- + config : Any + Launch configurations inline with options provided by + :obj:`LaunchConfig` dataclass. + *kernel_args : Any + Variable length argument list that is provided to the + launching kernel. + + """ if not isinstance(kernel, Kernel): raise ValueError config = check_or_create_options(LaunchConfig, config, "launch config") diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 99d1e129..3eeea660 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -81,7 +81,7 @@ def close(self, stream=None): @property def handle(self): - """Return buffer handle object.""" + """Return the buffer handle object.""" return self._ptr @property diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 60d4db97..0b9e450c 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -46,10 +46,17 @@ def _lazy_init(): class Kernel: + """Represents a compiled kernel that had been loaded onto the device. + + Kernel instances can execution when passed directly into a + launch function. 
+ + """ __slots__ = ("_handle", "_module",) def __init__(self): + """Unsupported function whose creation is intended through an :obj:`ObjectCode` object.""" raise NotImplementedError("directly constructing a Kernel instance is not supported") @staticmethod @@ -65,12 +72,49 @@ def _from_obj(obj, mod): class ObjectCode: + """Represents the compiled program loaded onto the device. + + This object provides a unified interface for different types of + compiled programs that are loaded onto the device. + + """ __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map") _supported_code_type = ("cubin", "ptx", "fatbin") def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None): + """Create and return a compiled program as an instance of an :obj:`ObjectCode`. + + Loads the module library with specified module code and JIT options. + + Note + ---- + Usage under CUDA 11.x will only load to the current device + context. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the module to load, or + a file path string containing that module for loading. + code_type : Any + String of the compiled type. + Supported options are "ptx", "cubin" and "ltoir". + jit_options : Optional + Mapping of JIT options to use during module loading. + (Default to no options) + symbol_mapping : Optional + Keyword argument dictionary specifying how symbol names + should be mapped before trying to retrieve them. + (Default to no mappings) + + Returns + ------- + :obj:`ObjectCode` + Newly created :obj:`ObjectCode`. + + """ if code_type not in self._supported_code_type: raise ValueError _lazy_init() @@ -107,6 +151,19 @@ def __del__(self): pass def get_kernel(self, name): + """Return the :obj:`Kernel` of a specified name from this object code. + + Parameters + ---------- + name : Any + Name of the kernel to retrieve. + + Returns + ------- + :obj:`Kernel` + Newly created kernel object. 
+ + """ try: name = self._sym_map[name] except KeyError: diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index ec0778a3..2f4f974f 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -8,12 +8,34 @@ class Program: + """Represents the compilation machinery for processing programs into :obj:`ObjectCode`. + + This object provides a unified interface to multiple underlying + compiler libraries. Compilation support is enabled for a wide + range of code types and compilation types. + + """ __slots__ = ("_handle", "_backend", ) _supported_code_type = ("c++", ) _supported_target_type = ("ptx", "cubin", "ltoir", ) def __init__(self, code, code_type): + """Create an instance of a :obj:`Program` object. + + Parameters + ---------- + code : Any + String of the CUDA Runtime Compilation program. + code_type : Any + String of the code type. Only "c++" is currently supported. + + Returns + ------- + :obj:`Program` + Newly created program object. + + """ self._handle = None if code_type not in self._supported_code_type: raise NotImplementedError @@ -30,14 +52,40 @@ def __init__(self, code, code_type): raise NotImplementedError def __del__(self): + """Return close(self).""" self.close() def close(self): + """Destroys this program.""" if self._handle is not None: handle_return(nvrtc.nvrtcDestroyProgram(self._handle)) self._handle = None def compile(self, target_type, options=(), name_expressions=(), logs=None): + """Compile the program with a specific compilation type. + + Parameters + ---------- + target_type : Any + String of the targeted compilation type. + Supported options are "ptx", "cubin" and "ltoir". + options : Union[List, Tuple], optional + List of compilation options associated with the backend + of this :obj:`Program`. (Default to no options) + name_expressions : Union[List, Tuple], optional + List of explicit name expressions to become accessible. 
+ (Default to no expressions) + logs : Any, optional + Object with a write method to receive the logs generated + from compilation. + (Default to no logs) + + Returns + ------- + :obj:`ObjectCode` + Newly created code object. + + """ if target_type not in self._supported_target_type: raise NotImplementedError @@ -80,8 +128,10 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): @property def backend(self): + """Return the backend type string associated with this program.""" return self._backend @property def handle(self): + """Return the program handle object.""" return self._handle diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index c607adea..eb4f03a0 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -235,7 +235,7 @@ def wait(self, event_or_stream: Union[Event, Stream]): @property def device(self) -> Device: - """Return :obj:`Device` singleton associated with this stream. + """Return the :obj:`Device` singleton associated with this stream. 
Note ---- From 7497a938131979a724deda99b190f6362d71db6d Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Thu, 31 Oct 2024 12:47:54 -0700 Subject: [PATCH 10/13] Naming consistency and rewording --- cuda_core/cuda/core/experimental/_device.py | 2 +- cuda_core/cuda/core/experimental/_event.py | 2 +- cuda_core/cuda/core/experimental/_program.py | 2 +- cuda_core/cuda/core/experimental/_stream.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 785f2ed1..7890ae53 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -116,7 +116,7 @@ def uuid(self) -> str: @property def name(self) -> str: """Return the device name.""" - # CUDA Runtime uses up to 256 characters, use the same for consistency + # Use 256 characters to be consistent with CUDA Runtime name = handle_return(cuda.cuDeviceGetName(256, self._id)) name = name.split(b'\0')[0] return name.decode() diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index 4805e093..a1b03974 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -86,7 +86,7 @@ def __del__(self): self.close() def close(self): - """Destroys the event.""" + """Destroy the event.""" if self._handle: handle_return(cuda.cuEventDestroy(self._handle)) self._handle = None diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 2f4f974f..9123fdee 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -56,7 +56,7 @@ def __del__(self): self.close() def close(self): - """Destroys this program.""" + """Destroy this program.""" if self._handle is not None: handle_return(nvrtc.nvrtcDestroyProgram(self._handle)) self._handle = None diff --git 
a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index eb4f03a0..01f4fd1f 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -127,7 +127,7 @@ def __del__(self): def close(self): """Destroy the stream. - Destroys the stream if we own it. Borrowed foreign stream + Destroy the stream if we own it. Borrowed foreign stream object will instead have their references released. """ From cc3b2fd15d76c2224ad8fc6a696cec977860d2ac Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Fri, 1 Nov 2024 12:42:00 -0700 Subject: [PATCH 11/13] Refer to the correct ObjectCode supported types --- cuda_core/cuda/core/experimental/_module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 0b9e450c..b7802633 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -100,7 +100,7 @@ def __init__(self, module, code_type, jit_options=None, *, a file path string containing that module for loading. code_type : Any String of the compiled type. - Supported options are "ptx", "cubin" and "ltoir". + Supported options are "ptx", "cubin" and "fatbin". jit_options : Optional Mapping of JIT options to use during module loading. 
(Default to no options) From f3e746976978a30781b28259132e439ced00dff2 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Mon, 4 Nov 2024 12:54:49 -0800 Subject: [PATCH 12/13] Move init doc to class with a cleanup --- cuda_core/cuda/core/experimental/_device.py | 31 +++++----- cuda_core/cuda/core/experimental/_event.py | 10 ++- cuda_core/cuda/core/experimental/_launcher.py | 2 + cuda_core/cuda/core/experimental/_memory.py | 23 +++---- cuda_core/cuda/core/experimental/_module.py | 62 +++++++++---------- cuda_core/cuda/core/experimental/_program.py | 24 +++---- cuda_core/cuda/core/experimental/_stream.py | 14 ++--- 7 files changed, 72 insertions(+), 94 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 7890ae53..2899282c 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -19,7 +19,7 @@ class Device: - """Represents a GPU and acts as an entry point for cuda.core features. + """Represent a GPU and act as an entry point for cuda.core features. This is a singleton object that helps ensure interoperability across multiple libraries imported in the process to both see @@ -30,26 +30,23 @@ class Device: resource created through this device, will continue to refer to this device's context. - """ - __slots__ = ("_id", "_mr", "_has_inited") - - def __new__(cls, device_id=None): - """Create and return a singleton :obj:`Device` object. + Newly returend :obj:`Device` object are is a thread-local singleton + for a specified device. - Creates and returns a thread-local singleton :obj:`Device` object - corresponding to a specific device. + Note + ---- + Will not initialize the GPU. - Note - ---- - Will not initialize the GPU. + Parameters + ---------- + device_id : int, optional + Device ordinal to return a :obj:`Device` object for. + Default value of `None` return the currently used device. 
- Parameters - ---------- - device_id : int, optional - Device ordinal to return a :obj:`Device` object for. - Default value of `None` return the currently used device. + """ + __slots__ = ("_id", "_mr", "_has_inited") - """ + def __new__(cls, device_id=None): # important: creating a Device instance does not initialize the GPU! if device_id is None: device_id = handle_return(cudart.cudaGetDevice()) diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index a1b03974..a6d5da28 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -36,7 +36,7 @@ class EventOptions: class Event: - """Represents a record of a specific point of execution within a CUDA stream. + """Represent a record at a specific point of execution within a CUDA stream. Applications can asynchronously record events at any point in the program. An event keeps a record of all previous work within @@ -46,15 +46,13 @@ class Event: of work up to event's record, and help establish dependencies between GPU work submissions. + Directly creating an :obj:`Event` is not supported due to ambiguity, + and they should instead be created through a :obj:`Stream` object. + """ __slots__ = ("_handle", "_timing_disabled", "_busy_waited") def __init__(self): - """Unsupported function due to ambiguity. - - New events should instead be created through a :obj:`Stream` object. - - """ self._handle = None raise NotImplementedError( "directly creating an Event object can be ambiguous. Please call " diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py index a3250227..2787a718 100644 --- a/cuda_core/cuda/core/experimental/_launcher.py +++ b/cuda_core/cuda/core/experimental/_launcher.py @@ -110,6 +110,8 @@ def launch(kernel, config, *kernel_args): Parameters ---------- + kernel : :obj:`Kernel` + Kernel to launch. 
config : Any Launch configurations inline with options provided by :obj:`LaunchConfig` dataclass. diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 3eeea660..678f26ee 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -22,7 +22,7 @@ class Buffer: - """Represents a handle to allocated memory. + """Represent a handle to allocated memory. This generic object provides a unified representation for how different memory resources are to give access to their memory @@ -32,24 +32,21 @@ class Buffer: establishing both the DLPack and the Python-level buffer protocols. + Parameters + ---------- + ptr : Any + Allocated buffer handle object + size : Any + Memory size of the buffer + mr : :obj:`MemoryResource`, optional + Memory resource associated with the buffer + """ # TODO: handle ownership? (_mr could be None) __slots__ = ("_ptr", "_size", "_mr",) def __init__(self, ptr, size, mr: MemoryResource=None): - """Initialize a new buffer object. - - Parameters - ---------- - ptr : Any - Allocated buffer handle object - size : Any - Memory size of the buffer - mr : :obj:`MemoryResource`, optional - Memory resource associated with the buffer - - """ self._ptr = ptr self._size = size self._mr = mr diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index b7802633..b587d813 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -46,17 +46,19 @@ def _lazy_init(): class Kernel: - """Represents a compiled kernel that had been loaded onto the device. + """Represent a compiled kernel that had been loaded onto the device. Kernel instances can execution when passed directly into a launch function. + Directly creating a :obj:`Kernel` is not supported, and they + should instead be created through a :obj:`ObjectCode` object. 
+ """ __slots__ = ("_handle", "_module",) def __init__(self): - """Unsupported function whose creation is intended through an :obj:`ObjectCode` object.""" raise NotImplementedError("directly constructing a Kernel instance is not supported") @staticmethod @@ -72,11 +74,34 @@ def _from_obj(obj, mod): class ObjectCode: - """Represents the compiled program loaded onto the device. + """Represent a compiled program that was loaded onto the device. This object provides a unified interface for different types of compiled programs that are loaded onto the device. + Loads the module library with specified module code and JIT options. + + Note + ---- + Usage under CUDA 11.x will only load to the current device + context. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the module to load, or + a file path string containing that module for loading. + code_type : Any + String of the compiled type. + Supported options are "ptx", "cubin" and "fatbin". + jit_options : Optional + Mapping of JIT options to use during module loading. + (Default to no options) + symbol_mapping : Optional + Keyword argument dictionary specifying how symbol names + should be mapped before trying to retrieve them. + (Default to no mappings) + """ __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map") @@ -84,37 +109,6 @@ class ObjectCode: def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None): - """Create and return a compiled program as an instance of an :obj:`ObjectCode`. - - Loads the module library with specified module code and JIT options. - - Note - ---- - Usage under CUDA 11.x will only load to the current device - context. - - Parameters - ---------- - module : Union[bytes, str] - Either a bytes object containing the module to load, or - a file path string containing that module for loading. - code_type : Any - String of the compiled type. - Supported options are "ptx", "cubin" and "fatbin". 
- jit_options : Optional - Mapping of JIT options to use during module loading. - (Default to no options) - symbol_mapping : Optional - Keyword argument dictionary specifying how symbol names - should be mapped before trying to retrieve them. - (Default to no mappings) - - Returns - ------- - :obj:`ObjectCode` - Newly created :obj:`ObjectCode`. - - """ if code_type not in self._supported_code_type: raise ValueError _lazy_init() diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 9123fdee..a59324f8 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -8,12 +8,19 @@ class Program: - """Represents the compilation machinery for processing programs into :obj:`ObjectCode`. + """Represent a compilation machinery to process programs into :obj:`ObjectCode`. This object provides a unified interface to multiple underlying compiler libraries. Compilation support is enabled for a wide range of code types and compilation types. + Parameters + ---------- + code : Any + String of the CUDA Runtime Compilation program. + code_type : Any + String of the code type. Only "c++" is currently supported. + """ __slots__ = ("_handle", "_backend", ) @@ -21,21 +28,6 @@ class Program: _supported_target_type = ("ptx", "cubin", "ltoir", ) def __init__(self, code, code_type): - """Create an instance of a :obj:`Program` object. - - Parameters - ---------- - code : Any - String of the CUDA Runtime Compilation program. - code_type : Any - String of the code type. Only "c++" is currently supported. - - Returns - ------- - :obj:`Program` - Newly created program object. 
- - """ self._handle = None if code_type not in self._supported_code_type: raise NotImplementedError diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 01f4fd1f..7f50dafd 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -36,7 +36,7 @@ class StreamOptions: class Stream: - """Represents a queue of GPU operations that are executed in a specific order. + """Represent a queue of GPU operations that are executed in a specific order. Applications use streams to control the order of execution for GPU work. Work within a single stream are executed sequentially. @@ -46,19 +46,17 @@ class Stream: Advanced users can utilize default streams for enforce complex implicit synchronization behaviors. + Directly creating a :obj:`Stream` is not supported due to ambiguity. + New streams should instead be created through a :obj:`Device` + object, or created directly through using an existing handle + using Stream.from_handle(). + """ __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin", "_device_id", "_ctx_handle") def __init__(self): - """Unsupported function due to ambiguity. 
- - New streams should instead be created through a :obj:`Device` - object, or created directly through using an existing handle - using Stream.from_handle() - - """ # minimal requirements for the destructor self._handle = None self._owner = None From 7e256885bfe2b622c8dce79c96ddcda8b891ea1b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 6 Nov 2024 04:26:55 +0000 Subject: [PATCH 13/13] populate API references; clean up & polish docs --- cuda_core/cuda/core/experimental/_launcher.py | 10 +++---- cuda_core/cuda/core/experimental/_module.py | 4 +-- cuda_core/cuda/core/experimental/_program.py | 7 +++-- .../source/_templates/autosummary/class.rst | 26 +++++++++++++++++ .../_templates/autosummary/dataclass.rst | 10 +++++++ .../_templates/autosummary/namedtuple.rst | 8 ++++++ cuda_core/docs/source/api.rst | 13 +++++++++ cuda_core/docs/source/api_private.rst | 28 +++++++++++++++++++ cuda_core/docs/source/conf.py | 6 +++- cuda_core/docs/source/install.md | 20 +++++-------- cuda_core/docs/source/release/0.1.0-notes.md | 16 ++++++----- cuda_python/docs/environment-docs.yml | 1 + 12 files changed, 117 insertions(+), 32 deletions(-) create mode 100644 cuda_core/docs/source/_templates/autosummary/class.rst create mode 100644 cuda_core/docs/source/_templates/autosummary/dataclass.rst create mode 100644 cuda_core/docs/source/_templates/autosummary/namedtuple.rst create mode 100644 cuda_core/docs/source/api_private.rst diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py index 2787a718..9991638f 100644 --- a/cuda_core/cuda/core/experimental/_launcher.py +++ b/cuda_core/cuda/core/experimental/_launcher.py @@ -103,16 +103,14 @@ def _cast_to_3_tuple(self, cfg): def launch(kernel, config, *kernel_args): - """Launches a :obj:`Kernel` object with launch-time configuration. - - Invokes a :obj:`Kernel` object with specified launch-time - configurations. 
+ """Launches a :obj:`~cuda.core.experimental._module.Kernel` + object with launch-time configuration. Parameters ---------- - kernel : :obj:`Kernel` + kernel : :obj:`~cuda.core.experimental._module.Kernel` Kernel to launch. - config : Any + config : :obj:`LaunchConfig` Launch configurations inline with options provided by :obj:`LaunchConfig` dataclass. *kernel_args : Any diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index b587d813..8b0ff9a7 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -48,8 +48,8 @@ def _lazy_init(): class Kernel: """Represent a compiled kernel that had been loaded onto the device. - Kernel instances can execution when passed directly into a - launch function. + Kernel instances can execution when passed directly into the + :func:`~cuda.core.experimental.launch` function. Directly creating a :obj:`Kernel` is not supported, and they should instead be created through a :obj:`ObjectCode` object. diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index a59324f8..5439c74a 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -8,7 +8,8 @@ class Program: - """Represent a compilation machinery to process programs into :obj:`ObjectCode`. + """Represent a compilation machinery to process programs into + :obj:`~cuda.core.experimental._module.ObjectCode`. This object provides a unified interface to multiple underlying compiler libraries. Compilation support is enabled for a wide @@ -19,7 +20,7 @@ class Program: code : Any String of the CUDA Runtime Compilation program. code_type : Any - String of the code type. Only "c++" is currently supported. + String of the code type. Currently only ``"c++"`` is supported. 
""" @@ -74,7 +75,7 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): Returns ------- - :obj:`ObjectCode` + :obj:`~cuda.core.experimental._module.ObjectCode` Newly created code object. """ diff --git a/cuda_core/docs/source/_templates/autosummary/class.rst b/cuda_core/docs/source/_templates/autosummary/class.rst new file mode 100644 index 00000000..b45a3fd5 --- /dev/null +++ b/cuda_core/docs/source/_templates/autosummary/class.rst @@ -0,0 +1,26 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + + {% block methods %} + {% if methods %} + .. rubric:: {{ _('Methods') }} + + {% for item in methods %} + .. automethod:: {{ item }} + {%- endfor %} + + {% endif %} + {% endblock %} + + {% block attributes %} + {% if attributes %} + .. rubric:: {{ _('Attributes') }} + + {% for item in attributes %} + .. autoattribute:: {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} diff --git a/cuda_core/docs/source/_templates/autosummary/dataclass.rst b/cuda_core/docs/source/_templates/autosummary/dataclass.rst new file mode 100644 index 00000000..b8c35324 --- /dev/null +++ b/cuda_core/docs/source/_templates/autosummary/dataclass.rst @@ -0,0 +1,10 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + + {% block methods %} + .. automethod:: __init__ + {% endblock %} + diff --git a/cuda_core/docs/source/_templates/autosummary/namedtuple.rst b/cuda_core/docs/source/_templates/autosummary/namedtuple.rst new file mode 100644 index 00000000..d3ad7d24 --- /dev/null +++ b/cuda_core/docs/source/_templates/autosummary/namedtuple.rst @@ -0,0 +1,8 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. 
autoclass:: {{ objname }} + :members: __new__ + :special-members: __new__ + :exclude-members: count, index, __reduce__, __reduce_ex__, __repr__, __hash__, __str__, __getnewargs__ \ No newline at end of file diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 756ed776..1cb9811b 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -3,6 +3,11 @@ ``cuda.core.experimental`` API Reference ======================================== +All of the APIs listed (or cross-referenced from) below are considered *experimental* +and subject to future changes without deprecation notice. Once stablized they will be +moved out of the ``experimental`` namespace. + + CUDA runtime ------------ @@ -10,6 +15,14 @@ CUDA runtime :toctree: generated/ Device + launch + + :template: dataclass.rst + + EventOptions + StreamOptions + LaunchConfig + CUDA compilation toolchain -------------------------- diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst new file mode 100644 index 00000000..f100eb7c --- /dev/null +++ b/cuda_core/docs/source/api_private.rst @@ -0,0 +1,28 @@ +:orphan: + +.. This page is to generate documentation for private classes exposed to users, + i.e., users cannot instantiate it by themselves but may use it's properties + or methods via returned values from public APIs. These classes must be referred + in public APIs returning their instances. + +.. currentmodule:: cuda.core.experimental + +CUDA runtime +------------ + +.. autosummary:: + :toctree: generated/ + + _memory.Buffer + _stream.Stream + _event.Event + + +CUDA compilation toolchain +-------------------------- + +.. 
autosummary:: + :toctree: generated/ + + _module.Kernel + _module.ObjectCode diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index 4be77656..5b28d331 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -34,7 +34,8 @@ 'sphinx.ext.autosummary', 'sphinx.ext.napoleon', 'myst_nb', - 'enum_tools.autoenum' + 'enum_tools.autoenum', + 'sphinx_copybutton', ] # Add any paths that contain templates here, relative to this directory. @@ -77,3 +78,6 @@ # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] + +# skip cmdline prompts +copybutton_exclude = '.linenos, .gp' diff --git a/cuda_core/docs/source/install.md b/cuda_core/docs/source/install.md index 593f7225..e13f37df 100644 --- a/cuda_core/docs/source/install.md +++ b/cuda_core/docs/source/install.md @@ -5,25 +5,19 @@ `cuda.core` is supported on all platforms that CUDA is supported. Specific dependencies are as follows: -* Driver: Linux (450.80.02 or later) Windows (456.38 or later) -* CUDA Toolkit 12.0 to 12.6 +| | CUDA 11 | CUDA 12 | +|------------------ | ------------ | ----------- | +| CUDA Toolkit [^1] | 11.2 - 11.8 | 12.0 - 12.6 | +| Driver | 450.80.02+ (Linux), 452.39+ (Windows) | 525.60.13+ (Linux), 527.41+ (Windows) | - -## Installing from PyPI - -Coming soon! - - -## Installing from Conda - -Coming soon! +[^1]: Including `cuda-python`. ## Installing from Source -```shell +```console $ git clone https://github.com/NVIDIA/cuda-python $ cd cuda-python/cuda_core $ pip install . ``` -For now `cuda-python` (`cuda-bindings` later) is a required dependency. +For now `cuda-python` (`cuda-bindings` later) 11.x or 12.x is a required dependency. 
diff --git a/cuda_core/docs/source/release/0.1.0-notes.md b/cuda_core/docs/source/release/0.1.0-notes.md index 2247e0cb..1ebb41f9 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.md +++ b/cuda_core/docs/source/release/0.1.0-notes.md @@ -1,15 +1,17 @@ -# ``cuda.core`` Release notes +# `cuda.core` Release notes -Released on Oct XX, 2024 +Released on Nov XX, 2024 ## Hightlights -- Initial beta 1 release +- Initial EA1 (early access) release - Supports all platforms that CUDA is supported -- Supports all CUDA 12.x drivers -- Supports all CUDA 12.x Toolkits +- Supports all CUDA 11.x/12.x drivers +- Supports all CUDA 11.x/12.x Toolkits - Pythonic CUDA runtime and other core functionalities ## Limitations -- Source code release only; Python packages coming in a future release -- Support for CUDA 11.x coming in the next release +- All APIs are currently *experimental* and subject to change without deprecation notice. + Please kindly share your feedbacks with us so that we can make `cuda.core` better! +- Source code release only; `pip`/`conda` support is coming in a future release +- Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206) diff --git a/cuda_python/docs/environment-docs.yml b/cuda_python/docs/environment-docs.yml index 2a3a8ad3..bc9588fd 100644 --- a/cuda_python/docs/environment-docs.yml +++ b/cuda_python/docs/environment-docs.yml @@ -11,6 +11,7 @@ dependencies: - pytest - scipy - sphinx + - sphinx-copybutton - pip: - furo - myst-nb