diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index a1f93e18..09a237a4 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -77,118 +77,92 @@ def _lazy_init(): @dataclass class LinkerOptions: - """Customizable :obj:`LinkerOptions` for nvJitLink or driver API. Some options are only available - whenusing the cuda.bindings.nvjitlink backend. Some options are only available when using newer - or older versions of cuda. + """Customizable :obj:`Linker` options. + Since the linker would choose to use nvJitLink or the driver APIs as the linking backend, + not all options are applicable. Attributes ---------- arch : str - Pass SM architecture value. Can use compute_ value instead if only generating PTX. + Pass the SM architecture value, such as ``-arch=sm_`` (for generating CUBIN) or + ``compute_`` (for generating PTX). This is a required option. - Acceptable value type: str - Maps to: -arch=sm_ max_register_count : int, optional Maximum register count. - Default: None - Acceptable value type: int - Maps to: -maxrregcount= + Maps to: ``-maxrregcount=``. time : bool, optional - Print timing information to InfoLog. - Default: False - Acceptable value type: bool - Maps to: -time + Print timing information to the info log. + Maps to ``-time``. + Default: False. verbose : bool, optional - Print verbose messages to InfoLog. - Default: False - Acceptable value type: bool - Maps to: -verbose + Print verbose messages to the info log. + Maps to ``-verbose``. + Default: False. link_time_optimization : bool, optional Perform link time optimization. - Default: False - Acceptable value type: bool - Maps to: -lto + Maps to: ``-lto``. + Default: False. ptx : bool, optional - Emit PTX after linking instead of CUBIN; only supported with -lto. - Default: False - Acceptable value type: bool - Maps to: -ptx + Emit PTX after linking instead of CUBIN; only supported with ``-lto``. 
+ Maps to ``-ptx``. + Default: False. optimization_level : int, optional Set optimization level. Only 0 and 3 are accepted. - Default: None - Acceptable value type: int - Maps to: -O + Maps to ``-O``. debug : bool, optional Generate debug information. - Default: False - Acceptable value type: bool - Maps to: -g + Maps to ``-g``. + Default: False. lineinfo : bool, optional Generate line information. - Default: False - Acceptable value type: bool - Maps to: -lineinfo + Maps to ``-lineinfo``. + Default: False. ftz : bool, optional Flush denormal values to zero. - Default: False - Acceptable value type: bool - Maps to: -ftz= + Maps to ``-ftz=``. + Default: False. prec_div : bool, optional Use precise division. - Default: True - Acceptable value type: bool - Maps to: -prec-div= + Maps to ``-prec-div=``. + Default: True. prec_sqrt : bool, optional Use precise square root. - Default: True - Acceptable value type: bool - Maps to: -prec-sqrt= + Maps to ``-prec-sqrt=``. + Default: True. fma : bool, optional Use fast multiply-add. - Default: True - Acceptable value type: bool - Maps to: -fma= + Maps to ``-fma=``. + Default: True. kernels_used : List[str], optional Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple times. - Default: None - Acceptable value type: list of str - Maps to: -kernels-used= + Maps to ``-kernels-used=``. variables_used : List[str], optional - Pass list of variables that are used; any not in the list can be removed. This option can be specified multiple - times. - Default: None - Acceptable value type: list of str - Maps to: -variables-used= + Pass a list of variables that are used; any not in the list can be removed. + Maps to ``-variables-used=``. optimize_unused_variables : bool, optional Assume that if a variable is not referenced in device code, it can be removed. 
- Default: False - Acceptable value type: bool - Maps to: -optimize-unused-variables + Maps to: ``-optimize-unused-variables``. + Default: False. xptxas : List[str], optional - Pass options to PTXAS. This option can be called multiple times. - Default: None - Acceptable value type: list of str - Maps to: -Xptxas= + Pass options to PTXAS. + Maps to: ``-Xptxas=``. split_compile : int, optional Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split compilation (default). - Default: 1 - Acceptable value type: int - Maps to: -split-compile= + Maps to ``-split-compile=``. + Default: 1. split_compile_extended : int, optional A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This option can potentially impact performance of the compiled binary. - Default: 1 - Acceptable value type: int - Maps to: -split-compile-extended= + Maps to ``-split-compile-extended=``. + Default: 1. no_cache : bool, optional Do not cache the intermediate steps of nvJitLink. - Default: False - Acceptable value type: bool - Maps to: -no-cache + Maps to ``-no-cache``. + Default: False. """ arch: str @@ -351,8 +325,11 @@ def _exception_manager(self): class Linker: - """ - Linker class for managing the linking of object codes with specified options. + """Represent a linking machinery to link one or multiple object codes into + :obj:`~cuda.core.experimental._module.ObjectCode` with the specified options. + + This object provides a unified interface to multiple underlying + linker libraries (such as nvJitLink or cuLink* from the CUDA driver). 
Parameters ---------- @@ -442,7 +419,7 @@ def link(self, target_type) -> ObjectCode: Note ------ - See nvrtc compiler options documnetation to ensure the input ObjectCodes are + See nvrtc compiler options documentation to ensure the input object codes are correctly compiled for linking. """ if target_type not in ("cubin", "ptx"): @@ -470,7 +447,8 @@ def get_error_log(self) -> str: Returns ------- - The error log. + str + The error log. """ if _nvjitlink: log_size = _nvjitlink.get_error_log_size(self._mnff.handle) @@ -485,7 +463,8 @@ def get_info_log(self) -> str: Returns ------- - The info log. + str + The info log. """ if _nvjitlink: log_size = _nvjitlink.get_info_log_size(self._mnff.handle) diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md index cd3530b9..34cad7d1 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.md +++ b/cuda_core/docs/source/release/0.1.1-notes.md @@ -1,13 +1,14 @@ # `cuda.core` Release notes -Released on Nov , 2024 +Released on Dec XX, 2024 ## Hightlights - Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete implementation of DLPack & CUDA Array Interface supports. -- Addition of the Linker class which gives object oriented and pythonic access to the nvJitLink or cuLink API - depending on your CUDA version. +- Add `Linker` that can link one or multiple `ObjectCode` instances generated by `Program`s. Under + the hood, it uses either the nvJitLink or cuLink APIs depending on the CUDA version detected + in the current environment. - Support TCC devices with a default synchronous memory resource to avoid the use of memory pools @@ -15,6 +16,6 @@ Released on Nov , 2024 - All APIs are currently *experimental* and subject to change without deprecation notice. Please kindly share your feedbacks with us so that we can make `cuda.core` better! -- Some LinkerOptions are only available when using a modern version of CUDA. 
When using CUDA <12, +- Some `LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12, the backend is the cuLink api which supports only a subset of the options that nvjitlink does. Further, some options aren't available on CUDA versions <12.6