diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index 65d5fe9b..2899282c 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -19,7 +19,31 @@
 class Device:
+    """Represent a GPU and act as an entry point for cuda.core features.
+
+    This is a singleton object that helps ensure interoperability
+    across multiple libraries imported in the process, so that they
+    all see and use the same GPU device.
+
+    While acting as the entry point, this object allows many other CUDA
+    resources to be allocated, such as streams and buffers. Any
+    :obj:`Context`-dependent resource created through this device will
+    continue to refer to this device's context.
+
+    The returned :obj:`Device` object is a thread-local singleton
+    for the specified device.
+
+    Note
+    ----
+    Will not initialize the GPU.
+
+    Parameters
+    ----------
+    device_id : int, optional
+        Device ordinal to return a :obj:`Device` object for.
+        Default value of `None` returns the currently used device.
+
+    """
     __slots__ = ("_id", "_mr", "_has_inited")

     def __new__(cls, device_id=None):
@@ -54,15 +78,29 @@ def _check_context_initialized(self, *args, **kwargs):

     @property
     def device_id(self) -> int:
+        """Return device ordinal."""
         return self._id

     @property
     def pci_bus_id(self) -> str:
+        """Return a PCI Bus Id string for this device."""
         bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id))
         return bus_id[:12].decode()

     @property
     def uuid(self) -> str:
+        """Return a UUID for the device.
+
+        Returns a 16-octet UUID identifying the device. If the device is in
+        MIG mode, returns its MIG UUID, which uniquely identifies the
+        subscribed MIG compute instance.
+
+        Note
+        ----
+        MIG UUID is only returned when the device is in MIG mode and the
+        driver is CUDA 11.4 or newer.
+
+        """
         driver_ver = handle_return(cuda.cuDriverGetVersion())
         if driver_ver >= 11040:
             uuid = handle_return(cuda.cuDeviceGetUuid_v2(self._id))
@@ -74,19 +112,21 @@ def uuid(self) -> str:

     @property
     def name(self) -> str:
-        # assuming a GPU name is less than 128 characters...
-        name = handle_return(cuda.cuDeviceGetName(128, self._id))
+        """Return the device name."""
+        # Use 256 characters to be consistent with CUDA Runtime
+        name = handle_return(cuda.cuDeviceGetName(256, self._id))
         name = name.split(b'\0')[0]
         return name.decode()

     @property
     def properties(self) -> dict:
+        """Return information about the compute-device."""
         # TODO: pythonize the key names
         return handle_return(cudart.cudaGetDeviceProperties(self._id))

     @property
     def compute_capability(self) -> ComputeCapability:
-        """Returns a named tuple with 2 fields: major and minor. """
+        """Return a named tuple with 2 fields: major and minor."""
         major = handle_return(cudart.cudaDeviceGetAttribute(
             cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id))
         minor = handle_return(cudart.cudaDeviceGetAttribute(
@@ -96,12 +136,20 @@ def compute_capability(self) -> ComputeCapability:
     @property
     @precondition(_check_context_initialized)
     def context(self) -> Context:
+        """Return the current :obj:`Context` associated with this device.
+
+        Note
+        ----
+        Device must be initialized.
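+
+        Examples
+        --------
+        A minimal usage sketch (assuming a visible device 0 that has been
+        set current; the variable names are illustrative only):
+
+        >>> from cuda.core.experimental import Device
+        >>> dev = Device(0)
+        >>> dev.set_current()
+        >>> ctx = dev.context  # the context made current by set_current()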
+ + """ ctx = handle_return(cuda.cuCtxGetCurrent()) assert int(ctx) != 0 return Context._from_ctx(ctx, self._id) @property def memory_resource(self) -> MemoryResource: + """Return :obj:`MemoryResource` associated with this device.""" return self._mr @memory_resource.setter @@ -112,27 +160,53 @@ def memory_resource(self, mr): @property def default_stream(self) -> Stream: + """Return default CUDA :obj:`Stream` associated with this device. + + The type of default stream returned depends on if the environment + variable CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM is set. + + If set, returns a per-thread default stream. Otherwise returns + the legacy stream. + + """ return default_stream() def __int__(self): + """Return device_id.""" return self._id def __repr__(self): return f"" def set_current(self, ctx: Context=None) -> Union[Context, None]: - """ - Entry point of this object. Users always start a code by + """Set device to be used for GPU executions. + + Initializes CUDA and sets the calling thread to a valid CUDA + context. By default the primary context is used, but optional `ctx` + parameter can be used to explicitly supply a :obj:`Context` object. + + Providing a `ctx` causes the previous set context to be popped and returned. + + Parameters + ---------- + ctx : :obj:`Context`, optional + Optional context to push onto this device's current thread stack. + + Returns + ------- + Union[:obj:`Context`, None], optional + Popped context. + + Examples + -------- + Acts as an entry point of this object. Users always start a code by calling this method, e.g. - + >>> from cuda.core.experimental import Device >>> dev0 = Device(0) >>> dev0.set_current() >>> # ... do work on device 0 ... - - The optional ctx argument is for advanced users to bind a - CUDA context with the device. In this case, the previously - set context is popped and returned to the user. + """ if ctx is not None: if not isinstance(ctx, Context): @@ -163,25 +237,94 @@ def set_current(self, ctx: Context=None) -> Union[Context, None]: self._has_inited = True def create_context(self, options: ContextOptions = None) -> Context: - # Create a Context object (but do NOT set it current yet!). - # ContextOptions is a dataclass for setting e.g. affinity or CIG - # options. + """Create a new :obj:`Context` object. + + Note + ---- + The newly context will not be set as current. + + Parameters + ---------- + options : :obj:`ContextOptions`, optional + Customizable dataclass for context creation options. + + Returns + ------- + :obj:`Context` + Newly created context object. + + """ raise NotImplementedError("TODO") @precondition(_check_context_initialized) def create_stream(self, obj=None, options: StreamOptions=None) -> Stream: - # Create a Stream object by either holding a newly created - # CUDA stream or wrapping an existing foreign object supporting - # the __cuda_stream__ protocol. In the latter case, a reference - # to obj is held internally so that its lifetime is managed. + """Create a Stream object. + + New stream objects can be created in two different ways: + + 1) Create a new CUDA stream with customizable `options`. + 2) Wrap an existing foreign `obj` supporting the __cuda_stream__ protocol. + + Option (2) internally holds a reference to the foreign object + such that the lifetime is managed. + + Note + ---- + Device must be initialized. + + Parameters + ---------- + obj : Any, optional + Any object supporting the __cuda_stream__ protocol. 
+        options : :obj:`StreamOptions`, optional
+            Customizable dataclass for stream creation options.
+
+        Returns
+        -------
+        :obj:`Stream`
+            Newly created stream object.
+
+        """
         return Stream._init(obj=obj, options=options)

     @precondition(_check_context_initialized)
     def allocate(self, size, stream=None) -> Buffer:
+        """Allocate device memory from a specified stream.
+
+        Allocates device memory of `size` bytes on the specified `stream`
+        using the memory resource currently associated with this Device.
+
+        The `stream` parameter is optional; if it is not provided, the
+        default stream is used.
+
+        Note
+        ----
+        Device must be initialized.
+
+        Parameters
+        ----------
+        size : int
+            Number of bytes to allocate.
+        stream : :obj:`Stream`, optional
+            The stream establishing the stream ordering semantic.
+            Default value of `None` uses the default stream.
+
+        Returns
+        -------
+        :obj:`Buffer`
+            Newly created buffer object.
+
+        """
         if stream is None:
             stream = default_stream()
         return self._mr.allocate(size, stream)

     @precondition(_check_context_initialized)
     def sync(self):
+        """Synchronize the device.
+
+        Note
+        ----
+        Device must be initialized.
+
+        """
         handle_return(cudart.cudaDeviceSynchronize())
diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py
index 3c85d9fe..a6d5da28 100644
--- a/cuda_core/cuda/core/experimental/_event.py
+++ b/cuda_core/cuda/core/experimental/_event.py
@@ -13,17 +13,46 @@

 @dataclass
 class EventOptions:
+    """Customizable :obj:`Event` options.
+
+    Attributes
+    ----------
+    enable_timing : bool, optional
+        Event will record timing data. (Default to False)
+    busy_waited_sync : bool, optional
+        If True, event will use blocking synchronization. When a CPU
+        thread calls synchronize, the call will block until the event
+        has actually been completed.
+        Otherwise, the CPU thread will busy-wait until the event has
+        been completed. (Default to False)
+    support_ipc : bool, optional
+        Event will be suitable for interprocess use.
+        Note that enable_timing must be False. (Default to False)
+
+    """
     enable_timing: Optional[bool] = False
     busy_waited_sync: Optional[bool] = False
    support_ipc: Optional[bool] = False


 class Event:
+    """Represent a record at a specific point of execution within a CUDA stream.
+
+    Applications can asynchronously record events at any point in
+    the program. An event keeps a record of all previous work within
+    the last recorded stream.
+
+    Events can be used to monitor a device's progress, query completion
+    of work up to the event's record, and help establish dependencies
+    between GPU work submissions.
+
+    Directly creating an :obj:`Event` is not supported due to ambiguity,
+    and they should instead be created through a :obj:`Stream` object.
+
+    """
     __slots__ = ("_handle", "_timing_disabled", "_busy_waited")

     def __init__(self):
-        # minimal requirements for the destructor
         self._handle = None
         raise NotImplementedError(
             "directly creating an Event object can be ambiguous. Please call "
@@ -51,37 +80,45 @@ def _init(options: Optional[EventOptions]=None):
         return self

     def __del__(self):
+        """Return close(self)."""
         self.close()

     def close(self):
-        # Destroy the event.
+        """Destroy the event."""
         if self._handle:
             handle_return(cuda.cuEventDestroy(self._handle))
             self._handle = None

     @property
     def is_timing_disabled(self) -> bool:
-        # Check if this instance can be used for the timing purpose.
+ """Return True if the event does not record timing data, otherwise False.""" return self._timing_disabled @property def is_sync_busy_waited(self) -> bool: - # Check if the event synchronization would keep the CPU busy-waiting. + """Return True if the event synchronization would keep the CPU busy-waiting, otherwise False.""" return self._busy_waited @property def is_ipc_supported(self) -> bool: - # Check if this instance can be used for IPC. + """Return True if this event can be used as an interprocess event, otherwise False.""" raise NotImplementedError("TODO") def sync(self): - # Sync over the event. + """Synchronize until the event completes. + + If the event was created with busy_waited_sync, then the + calling CPU thread will block until the event has been + completed by the device. + Otherwise the CPU thread will busy-wait until the event + has been completed. + + """ handle_return(cuda.cuEventSynchronize(self._handle)) @property def is_done(self) -> bool: - # Return True if all captured works have been completed, - # otherwise False. + """Return True if all captured works have been completed, otherwise False.""" result, = cuda.cuEventQuery(self._handle) if result == cuda.CUresult.CUDA_SUCCESS: return True @@ -92,4 +129,5 @@ def is_done(self) -> bool: @property def handle(self) -> int: + """Return the underlying cudaEvent_t pointer address as Python int.""" return int(self._handle) diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py index 4b9533cb..9991638f 100644 --- a/cuda_core/cuda/core/experimental/_launcher.py +++ b/cuda_core/cuda/core/experimental/_launcher.py @@ -37,7 +37,23 @@ def _lazy_init(): @dataclass class LaunchConfig: - """ + """Customizable launch options. + + Attributes + ---------- + grid : Union[tuple, int] + Collection of threads that will execute a kernel function. + block : Union[tuple, int] + Group of threads (Thread Block) that will execute on the same + multiprocessor. Threads within a thread blocks have access to + shared memory and can be explicitly synchronized. + stream : :obj:`Stream` + The stream establishing the stream ordering semantic of a + launch. + shmem_size : int, optional + Dynamic shared-memory size per thread block in bytes. + (Default to size 0) + """ # TODO: expand LaunchConfig to include other attributes grid: Union[tuple, int] = None @@ -87,6 +103,21 @@ def _cast_to_3_tuple(self, cfg): def launch(kernel, config, *kernel_args): + """Launches a :obj:`~cuda.core.experimental._module.Kernel` + object with launch-time configuration. + + Parameters + ---------- + kernel : :obj:`~cuda.core.experimental._module.Kernel` + Kernel to launch. + config : :obj:`LaunchConfig` + Launch configurations inline with options provided by + :obj:`LaunchConfig` dataclass. + *kernel_args : Any + Variable length argument list that is provided to the + launching kernel. + + """ if not isinstance(kernel, Kernel): raise ValueError config = check_or_create_options(LaunchConfig, config, "launch config") diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 4ef2cbc3..678f26ee 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -22,6 +22,26 @@ class Buffer: + """Represent a handle to allocated memory. + + This generic object provides a unified representation for how + different memory resources are to give access to their memory + allocations. 
+
+    Support for data interchange mechanisms is provided by
+    establishing both the DLPack and the Python-level buffer
+    protocols.
+
+    Parameters
+    ----------
+    ptr : Any
+        Allocated buffer handle object
+    size : Any
+        Memory size of the buffer
+    mr : :obj:`MemoryResource`, optional
+        Memory resource associated with the buffer
+
+    """
     # TODO: handle ownership? (_mr could be None)
     __slots__ = ("_ptr", "_size", "_mr",)

@@ -32,9 +52,23 @@ def __init__(self, ptr, size, mr: MemoryResource=None):
         self._mr = mr

     def __del__(self):
-        self.close(default_stream())
+        """Return close(self)."""
+        self.close()

     def close(self, stream=None):
+        """Deallocate this buffer asynchronously on the given stream.
+
+        This buffer is released back to its memory resource
+        asynchronously on the given stream.
+
+        Parameters
+        ----------
+        stream : Any, optional
+            The stream object with a __cuda_stream__ protocol to
+            use for asynchronous deallocation. Defaults to using
+            the default stream.
+
+        """
         if self._ptr and self._mr is not None:
             if stream is None:
                 stream = default_stream()
@@ -44,42 +78,56 @@

     @property
     def handle(self):
+        """Return the buffer handle object."""
         return self._ptr

     @property
     def size(self):
+        """Return the memory size of this buffer."""
         return self._size

     @property
     def memory_resource(self) -> MemoryResource:
-        # Return the memory resource from which this buffer was allocated.
+        """Return the memory resource associated with this buffer."""
         return self._mr

     @property
     def is_device_accessible(self) -> bool:
-        # Check if this buffer can be accessed from GPUs.
+        """Return True if this buffer can be accessed by the GPU, otherwise False."""
         if self._mr is not None:
             return self._mr.is_device_accessible
         raise NotImplementedError

     @property
     def is_host_accessible(self) -> bool:
-        # Check if this buffer can be accessed from CPUs.
+        """Return True if this buffer can be accessed by the CPU, otherwise False."""
         if self._mr is not None:
             return self._mr.is_host_accessible
         raise NotImplementedError

     @property
     def device_id(self) -> int:
+        """Return the device ordinal of this buffer."""
         if self._mr is not None:
             return self._mr.device_id
         raise NotImplementedError

     def copy_to(self, dst: Buffer=None, *, stream) -> Buffer:
-        # Copy from this buffer to the dst buffer asynchronously on the
-        # given stream. The dst buffer is returned. If the dst is not provided,
-        # allocate one from self.memory_resource. Raise an exception if the
-        # stream is not provided.
+        """Copy from this buffer to the dst buffer asynchronously on the given stream.
+
+        Copies the data from this buffer to the provided dst buffer.
+        If the dst buffer is not provided, then a new buffer is first
+        allocated using the associated memory resource before the copy.
+
+        Parameters
+        ----------
+        dst : :obj:`Buffer`, optional
+            Destination buffer to copy the data into.
+        stream : Any
+            Keyword argument specifying the stream for the
+            asynchronous copy
+
+        Returns
+        -------
+        :obj:`Buffer`
+            The destination buffer (newly allocated if `dst` was not provided).
+
+        """
         if stream is None:
             raise ValueError("stream must be provided")
         if dst is None:
@@ -93,8 +141,17 @@
         return dst

     def copy_from(self, src: Buffer, *, stream):
-        # Copy from the src buffer to this buffer asynchronously on the
-        # given stream. Raise an exception if the stream is not provided.
+        """Copy from the src buffer to this buffer asynchronously on the given stream.
+
+        Parameters
+        ----------
+        src : :obj:`Buffer`
+            Source buffer to copy data from
+        stream : Any
+            Keyword argument specifying the stream for the
+            asynchronous copy
+
+        """
         if stream is None:
             raise ValueError("stream must be provided")
         if src._size != self._size:
@@ -141,7 +198,7 @@ def __buffer__(self, flags: int, /) -> memoryview:
         raise NotImplementedError("TODO")

     def __release_buffer__(self, buffer: memoryview, /):
-        # Supporting methond paired with __buffer__.
+        # Supporting method paired with __buffer__.
         raise NotImplementedError("TODO")
diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 60d4db97..8b0ff9a7 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -46,6 +46,15 @@ def _lazy_init():

 class Kernel:
+    """Represent a compiled kernel that has been loaded onto the device.
+
+    Kernel instances can be executed when passed directly into the
+    :func:`~cuda.core.experimental.launch` function.
+
+    Directly creating a :obj:`Kernel` is not supported, and they
+    should instead be created through an :obj:`ObjectCode` object.
+
+    """

     __slots__ = ("_handle", "_module",)

@@ -65,6 +74,35 @@ def _from_obj(obj, mod):

 class ObjectCode:
+    """Represent a compiled program that was loaded onto the device.
+
+    This object provides a unified interface for different types of
+    compiled programs that are loaded onto the device.
+
+    Loads the module library with specified module code and JIT options.
+
+    Note
+    ----
+    Usage under CUDA 11.x will only load to the current device
+    context.
+
+    Parameters
+    ----------
+    module : Union[bytes, str]
+        Either a bytes object containing the module to load, or
+        a file path string pointing to the module file to load.
+    code_type : Any
+        String of the code type.
+        Supported options are "ptx", "cubin" and "fatbin".
+    jit_options : Optional
+        Mapping of JIT options to use during module loading.
+        (Default to no options)
+    symbol_mapping : Optional
+        Keyword argument dictionary specifying how symbol names
+        should be mapped before trying to retrieve them.
+        (Default to no mappings)
+
+    """
     __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map")
     _supported_code_type = ("cubin", "ptx", "fatbin")

@@ -107,6 +145,19 @@ def __del__(self):
         pass

     def get_kernel(self, name):
+        """Return the :obj:`Kernel` of a specified name from this object code.
+
+        Parameters
+        ----------
+        name : Any
+            Name of the kernel to retrieve.
+
+        Returns
+        -------
+        :obj:`Kernel`
+            Newly created kernel object.
+
+        """
         try:
             name = self._sym_map[name]
         except KeyError:
diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index ec0778a3..5439c74a 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -8,6 +8,21 @@

 class Program:
+    """Represent the compilation machinery that processes programs into
+    :obj:`~cuda.core.experimental._module.ObjectCode`.
+
+    This object provides a unified interface to multiple underlying
+    compiler libraries. Compilation support is enabled for a wide
+    range of code types and compilation types.
+
+    Parameters
+    ----------
+    code : Any
+        String of the CUDA Runtime Compilation program.
+    code_type : Any
+        String of the code type. Currently only ``"c++"`` is supported.
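+
+    Examples
+    --------
+    A minimal sketch of the intended compile workflow (the kernel source
+    below is illustrative only, and NVRTC must be available at runtime):
+
+    >>> from cuda.core.experimental import Program
+    >>> prog = Program('extern "C" __global__ void noop() {}', "c++")
+    >>> mod = prog.compile("ptx")  # an ObjectCode instance
+    >>> prog.close()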
+ + """ __slots__ = ("_handle", "_backend", ) _supported_code_type = ("c++", ) @@ -30,14 +45,40 @@ def __init__(self, code, code_type): raise NotImplementedError def __del__(self): + """Return close(self).""" self.close() def close(self): + """Destroy this program.""" if self._handle is not None: handle_return(nvrtc.nvrtcDestroyProgram(self._handle)) self._handle = None def compile(self, target_type, options=(), name_expressions=(), logs=None): + """Compile the program with a specific compilation type. + + Parameters + ---------- + target_type : Any + String of the targeted compilation type. + Supported options are "ptx", "cubin" and "ltoir". + options : Union[List, Tuple], optional + List of compilation options associated with the backend + of this :obj:`Program`. (Default to no options) + name_expressions : Union[List, Tuple], optional + List of explicit name expressions to become accessible. + (Default to no expressions) + logs : Any, optional + Object with a write method to receive the logs generated + from compilation. + (Default to no logs) + + Returns + ------- + :obj:`~cuda.core.experimental._module.ObjectCode` + Newly created code object. + + """ if target_type not in self._supported_target_type: raise NotImplementedError @@ -80,8 +121,10 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): @property def backend(self): + """Return the backend type string associated with this program.""" return self._backend @property def handle(self): + """Return the program handle object.""" return self._handle diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 95f8ec50..7f50dafd 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -20,12 +20,38 @@ @dataclass class StreamOptions: + """Customizable :obj:`Stream` options. + Attributes + ---------- + nonblocking : bool, optional + Stream does not synchronize with the NULL stream. (Default to True) + priority : int, optional + Stream priority where lower number represents a + higher priority. (Default to lowest priority) + + """ nonblocking: bool = True priority: Optional[int] = None class Stream: + """Represent a queue of GPU operations that are executed in a specific order. + + Applications use streams to control the order of execution for + GPU work. Work within a single stream are executed sequentially. + Whereas work across multiple streams can be further controlled + using stream priorities and :obj:`Event` managements. + + Advanced users can utilize default streams for enforce complex + implicit synchronization behaviors. + + Directly creating a :obj:`Stream` is not supported due to ambiguity. + New streams should instead be created through a :obj:`Device` + object, or created directly through using an existing handle + using Stream.from_handle(). 
+ + """ __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin", "_device_id", "_ctx_handle") @@ -74,13 +100,12 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None): else: flags = cuda.CUstream_flags.CU_STREAM_DEFAULT + high, low = handle_return(cudart.cudaDeviceGetStreamPriorityRange()) if priority is not None: - high, low = handle_return( - cudart.cudaDeviceGetStreamPriorityRange()) if not (low <= priority <= high): raise ValueError(f"{priority=} is out of range {[low, high]}") else: - priority = 0 + priority = high self._handle = handle_return( cuda.cuStreamCreateWithPriority(flags, priority)) @@ -94,9 +119,16 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None): return self def __del__(self): + """Return close(self).""" self.close() def close(self): + """Destroy the stream. + + Destroy the stream if we own it. Borrowed foreign stream + object will instead have their references released. + + """ if self._owner is None: if self._handle and not self._builtin: handle_return(cuda.cuStreamDestroy(self._handle)) @@ -106,15 +138,17 @@ def close(self): @property def __cuda_stream__(self) -> Tuple[int, int]: + """Return an instance of a __cuda_stream__ protocol.""" return (0, int(self._handle)) @property def handle(self) -> int: - # Return the underlying cudaStream_t pointer address as Python int. + """Return the underlying cudaStream_t pointer address as Python int.""" return int(self._handle) @property def is_nonblocking(self) -> bool: + """Return True if this is a nonblocking stream, otherwise False.""" if self._nonblocking is None: flag = handle_return(cuda.cuStreamGetFlags(self._handle)) if flag == cuda.CUstream_flags.CU_STREAM_NON_BLOCKING: @@ -125,15 +159,35 @@ def is_nonblocking(self) -> bool: @property def priority(self) -> int: + """Return the stream priority.""" if self._priority is None: prio = handle_return(cuda.cuStreamGetPriority(self._handle)) self._priority = prio return self._priority def sync(self): + """Synchronize the stream.""" handle_return(cuda.cuStreamSynchronize(self._handle)) def record(self, event: Event=None, options: EventOptions=None) -> Event: + """Record an event onto the stream. + + Creates an Event object (or reuses the given one) by + recording on the stream. + + Parameters + ---------- + event : :obj:`Event`, optional + Optional event object to be reused for recording. + options : :obj:`EventOptions`, optional + Customizable dataclass for event creation options. + + Returns + ------- + :obj:`Event` + Newly created event object. + + """ # Create an Event object (or reusing the given one) by recording # on the stream. Event flags such as disabling timing, nonblocking, # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. @@ -145,11 +199,15 @@ def record(self, event: Event=None, options: EventOptions=None) -> Event: return event def wait(self, event_or_stream: Union[Event, Stream]): - # Wait for a CUDA event or a CUDA stream to establish a stream order. - # - # If a Stream instance is provided, the effect is as if an event is - # recorded on the given stream, and then self waits on the recorded - # event. + """Wait for a CUDA event or a CUDA stream. + + Waiting for an event or a stream establishes a stream order. + + If a :obj:`Stream` is provided, then wait until the stream's + work is completed. This is done by recording a new :obj:`Event` + on the stream and then waiting on it. 
+ + """ if isinstance(event_or_stream, Event): event = event_or_stream.handle discard_event = False @@ -175,12 +233,15 @@ def wait(self, event_or_stream: Union[Event, Stream]): @property def device(self) -> Device: - # Inverse look-up to find on which device this stream instance was - # created. - # - # Note that Stream.device.context might not necessarily agree with - # Stream.context, in cases where a different CUDA context is set - # current after a stream was created. + """Return the :obj:`Device` singleton associated with this stream. + + Note + ---- + The current context on the device may differ from this + stream's context. This case occurs when a different CUDA + context is set current after a stream is created. + + """ from cuda.core.experimental._device import Device # avoid circular import if self._device_id is None: # Get the stream context first @@ -192,8 +253,7 @@ def device(self) -> Device: @property def context(self) -> Context: - # Inverse look-up to find in which CUDA context this stream instance - # was created + """Return the :obj:`Context` associated with this stream.""" if self._ctx_handle is None: self._ctx_handle = handle_return( cuda.cuStreamGetCtx(self._handle)) @@ -203,6 +263,28 @@ def context(self) -> Context: @staticmethod def from_handle(handle: int) -> Stream: + """Create a new :obj:`Stream` object from a foreign stream handle. + + Uses a cudaStream_t pointer address represented as a Python int + to create a new :obj:`Stream` object. + + Note + ---- + Stream lifetime is not managed, foreign object must remain + alive while this steam is active. + + Parameters + ---------- + handle : int + Stream handle representing the address of a foreign + stream object. + + Returns + ------- + :obj:`Stream` + Newly created stream object. + + """ class _stream_holder: @property def __cuda_stream__(self): @@ -235,6 +317,15 @@ def __init__(self): def default_stream(): + """Return the default CUDA :obj:`Stream`. + + The type of default stream returned depends on if the environment + variable CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM is set. + + If set, returns a per-thread default stream. Otherwise returns + the legacy stream. + + """ # TODO: flip the default use_ptds = int(os.environ.get('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', 0)) if use_ptds: diff --git a/cuda_core/docs/source/_templates/autosummary/class.rst b/cuda_core/docs/source/_templates/autosummary/class.rst new file mode 100644 index 00000000..b45a3fd5 --- /dev/null +++ b/cuda_core/docs/source/_templates/autosummary/class.rst @@ -0,0 +1,26 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + + {% block methods %} + {% if methods %} + .. rubric:: {{ _('Methods') }} + + {% for item in methods %} + .. automethod:: {{ item }} + {%- endfor %} + + {% endif %} + {% endblock %} + + {% block attributes %} + {% if attributes %} + .. rubric:: {{ _('Attributes') }} + + {% for item in attributes %} + .. autoattribute:: {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} diff --git a/cuda_core/docs/source/_templates/autosummary/dataclass.rst b/cuda_core/docs/source/_templates/autosummary/dataclass.rst new file mode 100644 index 00000000..b8c35324 --- /dev/null +++ b/cuda_core/docs/source/_templates/autosummary/dataclass.rst @@ -0,0 +1,10 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + + {% block methods %} + .. 
automethod:: __init__
+   {% endblock %}
+
diff --git a/cuda_core/docs/source/_templates/autosummary/namedtuple.rst b/cuda_core/docs/source/_templates/autosummary/namedtuple.rst
new file mode 100644
index 00000000..d3ad7d24
--- /dev/null
+++ b/cuda_core/docs/source/_templates/autosummary/namedtuple.rst
@@ -0,0 +1,8 @@
+{{ fullname | escape | underline}}
+
+.. currentmodule:: {{ module }}
+
+.. autoclass:: {{ objname }}
+   :members: __new__
+   :special-members: __new__
+   :exclude-members: count, index, __reduce__, __reduce_ex__, __repr__, __hash__, __str__, __getnewargs__
\ No newline at end of file
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index 756ed776..1cb9811b 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -3,6 +3,11 @@
 ``cuda.core.experimental`` API Reference
 ========================================

+All of the APIs listed (or cross-referenced from) below are considered *experimental*
+and subject to future changes without deprecation notice. Once stabilized, they will be
+moved out of the ``experimental`` namespace.
+
+
 CUDA runtime
 ------------

@@ -10,6 +15,14 @@ CUDA runtime
    :toctree: generated/

    Device
+   launch
+
+   :template: dataclass.rst
+
+   EventOptions
+   StreamOptions
+   LaunchConfig
+

 CUDA compilation toolchain
 --------------------------
diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst
new file mode 100644
index 00000000..f100eb7c
--- /dev/null
+++ b/cuda_core/docs/source/api_private.rst
@@ -0,0 +1,28 @@
+:orphan:
+
+.. This page is to generate documentation for private classes exposed to users,
+   i.e., users cannot instantiate them by themselves but may use their properties
+   or methods via returned values from public APIs. These classes must be referred
+   to in public APIs returning their instances.
+
+.. currentmodule:: cuda.core.experimental
+
+CUDA runtime
+------------
+
+.. autosummary::
+   :toctree: generated/
+
+   _memory.Buffer
+   _stream.Stream
+   _event.Event
+
+
+CUDA compilation toolchain
+--------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   _module.Kernel
+   _module.ObjectCode
diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py
index 4be77656..5b28d331 100644
--- a/cuda_core/docs/source/conf.py
+++ b/cuda_core/docs/source/conf.py
@@ -34,7 +34,8 @@
     'sphinx.ext.autosummary',
     'sphinx.ext.napoleon',
     'myst_nb',
-    'enum_tools.autoenum'
+    'enum_tools.autoenum',
+    'sphinx_copybutton',
 ]

 # Add any paths that contain templates here, relative to this directory.
@@ -77,3 +78,6 @@
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
 html_static_path = ['_static']
+
+# skip cmdline prompts
+copybutton_exclude = '.linenos, .gp'
diff --git a/cuda_core/docs/source/install.md b/cuda_core/docs/source/install.md
index 593f7225..e13f37df 100644
--- a/cuda_core/docs/source/install.md
+++ b/cuda_core/docs/source/install.md
@@ -5,25 +5,19 @@
 `cuda.core` is supported on all platforms that CUDA is supported on. Specific
 dependencies are as follows:

-* Driver: Linux (450.80.02 or later) Windows (456.38 or later)
-* CUDA Toolkit 12.0 to 12.6
+|                   | CUDA 11      | CUDA 12     |
+|------------------ | ------------ | ----------- |
+| CUDA Toolkit [^1] | 11.2 - 11.8  | 12.0 - 12.6 |
+| Driver            | 450.80.02+ (Linux), 452.39+ (Windows) | 525.60.13+ (Linux), 527.41+ (Windows) |

-
-## Installing from PyPI
-
-Coming soon!
-
-
-## Installing from Conda
-
-Coming soon!
+[^1]: Including `cuda-python`.

 ## Installing from Source

-```shell
+```console
 $ git clone https://github.com/NVIDIA/cuda-python
 $ cd cuda-python/cuda_core
 $ pip install .
 ```

-For now `cuda-python` (`cuda-bindings` later) is a required dependency.
+For now `cuda-python` (`cuda-bindings` later) 11.x or 12.x is a required dependency.
diff --git a/cuda_core/docs/source/release/0.1.0-notes.md b/cuda_core/docs/source/release/0.1.0-notes.md
index 2247e0cb..1ebb41f9 100644
--- a/cuda_core/docs/source/release/0.1.0-notes.md
+++ b/cuda_core/docs/source/release/0.1.0-notes.md
@@ -1,15 +1,17 @@
-# ``cuda.core`` Release notes
+# `cuda.core` Release notes

-Released on Oct XX, 2024
+Released on Nov XX, 2024

 ## Highlights

-- Initial beta 1 release
+- Initial EA1 (early access) release
 - Supports all platforms that CUDA is supported on
-- Supports all CUDA 12.x drivers
-- Supports all CUDA 12.x Toolkits
+- Supports all CUDA 11.x/12.x drivers
+- Supports all CUDA 11.x/12.x Toolkits
 - Pythonic CUDA runtime and other core functionalities

 ## Limitations

-- Source code release only; Python packages coming in a future release
-- Support for CUDA 11.x coming in the next release
+- All APIs are currently *experimental* and subject to change without deprecation notice.
+  Please kindly share your feedback with us so that we can make `cuda.core` better!
+- Source code release only; `pip`/`conda` support is coming in a future release
+- Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206)
diff --git a/cuda_python/docs/environment-docs.yml b/cuda_python/docs/environment-docs.yml
index 2a3a8ad3..bc9588fd 100644
--- a/cuda_python/docs/environment-docs.yml
+++ b/cuda_python/docs/environment-docs.yml
@@ -11,6 +11,7 @@ dependencies:
   - pytest
   - scipy
   - sphinx
+  - sphinx-copybutton
   - pip:
     - furo
     - myst-nb