From 9d8ecbc19d3cef81ab877890b00e912a548ddd6b Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:06:26 -0800 Subject: [PATCH 001/111] integrate ruff changes --- cuda_core/cuda/core/experimental/__init__.py | 1 + cuda_core/cuda/core/experimental/_linker.py | 285 +++++++++++++++++++ cuda_core/docs/source/api.rst | 5 + cuda_core/docs/source/release.md | 1 + cuda_core/docs/source/release/0.1.0-notes.md | 4 +- cuda_core/docs/source/release/0.2.0-notes.md | 11 + cuda_core/tests/test_linker.py | 101 +++++++ 7 files changed, 406 insertions(+), 2 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_linker.py create mode 100644 cuda_core/docs/source/release/0.2.0-notes.md create mode 100644 cuda_core/tests/test_linker.py diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 9b978398..12fed225 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -5,5 +5,6 @@ from cuda.core.experimental._device import Device from cuda.core.experimental._event import EventOptions from cuda.core.experimental._launcher import LaunchConfig, launch +from cuda.core.experimental._linker import Linker, LinkerOptions from cuda.core.experimental._program import Program from cuda.core.experimental._stream import Stream, StreamOptions diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py new file mode 100644 index 00000000..e80bfe61 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -0,0 +1,285 @@ +from dataclasses import dataclass +from typing import List, Optional + +from cuda.bindings import nvjitlink +from cuda.core.experimental._module import ObjectCode +from cuda.core.experimental._utils import check_or_create_options + + +@dataclass +class LinkerOptions: + """Customizable :obj:`LinkerOptions` for nvJitLink. + + Attributes + ---------- + arch : str + Pass SM architecture value. Can use compute_ value instead if only generating PTX. + This is a required option. + Acceptable value type: str + Maps to: -arch=sm_ + max_register_count : int, optional + Maximum register count. + Default: None + Acceptable value type: int + Maps to: -maxrregcount= + time : bool, optional + Print timing information to InfoLog. + Default: False + Acceptable value type: bool + Maps to: -time + verbose : bool, optional + Print verbose messages to InfoLog. + Default: False + Acceptable value type: bool + Maps to: -verbose + link_time_optimization : bool, optional + Perform link time optimization. + Default: False + Acceptable value type: bool + Maps to: -lto + ptx : bool, optional + Emit PTX after linking instead of CUBIN; only supported with -lto. + Default: False + Acceptable value type: bool + Maps to: -ptx + optimization_level : int, optional + Set optimization level. Only 0 and 3 are accepted. + Default: None + Acceptable value type: int + Maps to: -O + debug : bool, optional + Generate debug information. + Default: False + Acceptable value type: bool + Maps to: -g + lineinfo : bool, optional + Generate line information. + Default: False + Acceptable value type: bool + Maps to: -lineinfo + ftz : bool, optional + Flush denormal values to zero. + Default: False + Acceptable value type: bool + Maps to: -ftz= + prec_div : bool, optional + Use precise division. + Default: True + Acceptable value type: bool + Maps to: -prec-div= + prec_sqrt : bool, optional + Use precise square root. 
+ Default: True + Acceptable value type: bool + Maps to: -prec-sqrt= + fma : bool, optional + Use fast multiply-add. + Default: True + Acceptable value type: bool + Maps to: -fma= + kernels_used : List[str], optional + Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple + times. + Default: None + Acceptable value type: list of str + Maps to: -kernels-used= + variables_used : List[str], optional + Pass list of variables that are used; any not in the list can be removed. This option can be specified multiple + times. + Default: None + Acceptable value type: list of str + Maps to: -variables-used= + optimize_unused_variables : bool, optional + Assume that if a variable is not referenced in device code, it can be removed. + Default: False + Acceptable value type: bool + Maps to: -optimize-unused-variables + xptxas : List[str], optional + Pass options to PTXAS. This option can be called multiple times. + Default: None + Acceptable value type: list of str + Maps to: -Xptxas= + split_compile : int, optional + Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split + compilation (default). + Default: 1 + Acceptable value type: int + Maps to: -split-compile= + split_compile_extended : int, optional + A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. + Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This + option can potentially impact performance of the compiled binary. + Default: 1 + Acceptable value type: int + Maps to: -split-compile-extended= + jump_table_density : int, optional + When doing LTO, specify the case density percentage in switch statements, and use it as a minimal threshold to + determine whether jump table (brx.idx instruction) will be used to implement a switch statement. Default value + is 101. The percentage ranges from 0 to 101 inclusively. + Default: 101 + Acceptable value type: int + Maps to: -jump-table-density= + no_cache : bool, optional + Do not cache the intermediate steps of nvJitLink. + Default: False + Acceptable value type: bool + Maps to: -no-cache + device_stack_protector : bool, optional + Enable stack canaries in device code. Stack canaries make it more difficult to exploit certain types of memory + safety bugs involving stack-local variables. The compiler uses heuristics to assess the risk of such a bug in + each function. Only those functions which are deemed high-risk make use of a stack canary. 
+ Default: False + Acceptable value type: bool + Maps to: -device-stack-protector + """ + + arch: str + max_register_count: Optional[int] = None + time: Optional[bool] = None + verbose: Optional[bool] = None + link_time_optimization: Optional[bool] = None + ptx: Optional[bool] = None + optimization_level: Optional[int] = None + debug: Optional[bool] = None + lineinfo: Optional[bool] = None + ftz: Optional[bool] = None + prec_div: Optional[bool] = None + prec_sqrt: Optional[bool] = None + fma: Optional[bool] = None + kernels_used: Optional[List[str]] = None + variables_used: Optional[List[str]] = None + optimize_unused_variables: Optional[bool] = None + xptxas: Optional[List[str]] = None + split_compile: Optional[int] = None + split_compile_extended: Optional[int] = None + jump_table_density: Optional[int] = None + no_cache: Optional[bool] = None + device_stack_protector: Optional[bool] = None + + def __post_init__(self): + self.formatted_options = [] + if self.arch is not None: + self.formatted_options.append(f"-arch={self.arch}") + if self.max_register_count is not None: + self.formatted_options.append(f"-maxrregcount={self.max_register_count}") + if self.time is not None: + self.formatted_options.append("-time") + if self.verbose is not None: + self.formatted_options.append("-verbose") + if self.link_time_optimization is not None: + self.formatted_options.append("-lto") + if self.ptx is not None: + self.formatted_options.append("-ptx") + if self.optimization_level is not None: + self.formatted_options.append(f"-O{self.optimization_level}") + if self.debug is not None: + self.formatted_options.append("-g") + if self.lineinfo is not None: + self.formatted_options.append("-lineinfo") + if self.ftz is not None: + self.formatted_options.append(f"-ftz={'true' if self.ftz else 'false'}") + if self.prec_div is not None: + self.formatted_options.append(f"-prec-div={'true' if self.prec_div else 'false'}") + if self.prec_sqrt is not None: + self.formatted_options.append(f"-prec-sqrt={'true' if self.prec_sqrt else 'false'}") + if self.fma is not None: + self.formatted_options.append(f"-fma={'true' if self.fma else 'false'}") + if self.kernels_used is not None: + for kernel in self.kernels_used: + self.formatted_options.append(f"-kernels-used={kernel}") + if self.variables_used is not None: + for variable in self.variables_used: + self.formatted_options.append(f"-variables-used={variable}") + if self.optimize_unused_variables is not None: + self.formatted_options.append("-optimize-unused-variables") + if self.xptxas is not None: + for opt in self.xptxas: + self.formatted_options.append(f"-Xptxas={opt}") + if self.split_compile is not None: + self.formatted_options.append(f"-split-compile={self.split_compile}") + if self.split_compile_extended is not None: + self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") + if self.jump_table_density is not None: + self.formatted_options.append(f"-jump-table-density={self.jump_table_density}") + if self.no_cache is not None: + self.formatted_options.append("-no-cache") + if self.device_stack_protector is not None: + self.formatted_options.append("-device-stack-protector") + + +class Linker: + __slots__ = "_handle" + + def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): + self._handle = None + options = check_or_create_options(LinkerOptions, options, "Linker options") + self._handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) + + if object_codes is not None: + for code 
in object_codes: + assert isinstance(code, ObjectCode) + self._add_code_object(code) + + def _add_code_object(self, object_code: ObjectCode): + data = object_code._module + assert isinstance(data, bytes) + nvjitlink.add_data( + self._handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}", + ) + + def link(self, target_type) -> ObjectCode: + nvjitlink.complete(self._handle) + if target_type not in ["cubin", "ptx"]: + raise ValueError(f"Unsupported target type: {target_type}") + code = None + if target_type == "cubin": + cubin_size = nvjitlink.get_linked_cubin_size(self._handle) + code = bytearray(cubin_size) + nvjitlink.get_linked_cubin(self._handle, code) + else: + ptx_size = nvjitlink.get_linked_ptx_size(self._handle) + code = bytearray(ptx_size) + nvjitlink.get_linked_ptx(self._handle, code) + + return ObjectCode(bytes(code), target_type) + + def get_error_log(self) -> str: + log_size = nvjitlink.get_error_log_size(self._handle) + log = bytearray(log_size) + nvjitlink.get_error_log(self._handle, log) + return log.decode() + + def get_info_log(self) -> str: + log_size = nvjitlink.get_info_log_size(self._handle) + log = bytearray(log_size) + nvjitlink.get_info_log(self._handle, log) + return log.decode() + + def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: + # this list is based on the supported values for code_type in the ObjectCode class definition. + # nvjitlink supports other options for input type + if code_type == "ptx": + return nvjitlink.InputType.PTX + elif code_type == "cubin": + return nvjitlink.InputType.CUBIN + elif code_type == "fatbin": + return nvjitlink.InputType.FATBIN + elif code_type == "ltoir": + return nvjitlink.InputType.LTOIR + elif code_type == "object": + return nvjitlink.InputType.OBJECT + else: + raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") + + @property + def handle(self) -> int: + return self._handle + + def __del__(self): + if self._handle is not None: + nvjitlink.destroy(self._handle) + self._handle = None diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 1cb9811b..e10b36a8 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -31,3 +31,8 @@ CUDA compilation toolchain :toctree: generated/ Program + Linker + + :template: dataclass.rst + + LinkerOptions \ No newline at end of file diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 48e24786..4c615eb3 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -6,4 +6,5 @@ maxdepth: 3 --- 0.1.0 + 0.2.0 ``` diff --git a/cuda_core/docs/source/release/0.1.0-notes.md b/cuda_core/docs/source/release/0.1.0-notes.md index 2131ed90..1ebb41f9 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.md +++ b/cuda_core/docs/source/release/0.1.0-notes.md @@ -1,9 +1,9 @@ # `cuda.core` Release notes -Released on Nov 8, 2024 +Released on Nov XX, 2024 ## Hightlights -- Initial beta release +- Initial EA1 (early access) release - Supports all platforms that CUDA is supported - Supports all CUDA 11.x/12.x drivers - Supports all CUDA 11.x/12.x Toolkits diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.md new file mode 100644 index 00000000..1a047511 --- /dev/null +++ b/cuda_core/docs/source/release/0.2.0-notes.md @@ -0,0 +1,11 @@ +# `cuda.core` Release notes + +Released on Nov , 2024 + +## Hightlights +- Addition of 
the Linker class which gives object oriented and pythonic access to the nvJitLink API. + +## Limitations + +-The Linker class only supports cuda >=12. For cuda <12, use low level cuLink API. + diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py new file mode 100644 index 00000000..6011bf4f --- /dev/null +++ b/cuda_core/tests/test_linker.py @@ -0,0 +1,101 @@ +import pytest + +from cuda.core.experimental._linker import Linker, LinkerOptions +from cuda.core.experimental._module import ObjectCode +from cuda.core.experimental._program import Program + +ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper +empty_entrypoint_kernel = "__global__ void A() {}" +empty_kernel = "__device__ void B() {}" +addition_kernel = "__device__ int C(int a, int b) { return a + b; }" + + +@pytest.fixture(scope="module") +def compile_ptx_functions(init_cuda): + object_code_a_ptx = Program(empty_entrypoint_kernel, "c++").compile("ptx") + object_code_b_ptx = Program(empty_kernel, "c++").compile("ptx") + object_code_c_ptx = Program(addition_kernel, "c++").compile("ptx") + + return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx + + +@pytest.fixture(scope="module") +def compile_ltoir_functions(init_cuda): + object_code_a_ltoir = Program(empty_entrypoint_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_b_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_c_ltoir = Program(addition_kernel, "c++").compile("ltoir", options=("-dlto",)) + + return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir + + +@pytest.mark.parametrize( + "options", + [ + LinkerOptions(arch=ARCH), + LinkerOptions(arch=ARCH, max_register_count=32), + LinkerOptions(arch=ARCH, time=True), + LinkerOptions(arch=ARCH, verbose=True), + LinkerOptions(arch=ARCH, optimization_level=3), + LinkerOptions(arch=ARCH, debug=True), + LinkerOptions(arch=ARCH, lineinfo=True), + LinkerOptions(arch=ARCH, ftz=True), + LinkerOptions(arch=ARCH, prec_div=True), + LinkerOptions(arch=ARCH, prec_sqrt=True), + LinkerOptions(arch=ARCH, fma=True), + LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), + LinkerOptions(arch=ARCH, variables_used=["var1"]), + LinkerOptions(arch=ARCH, optimize_unused_variables=True), + LinkerOptions(arch=ARCH, xptxas=["-v"]), + LinkerOptions(arch=ARCH, split_compile=0), + LinkerOptions(arch=ARCH, split_compile_extended=1), + LinkerOptions(arch=ARCH, jump_table_density=100), + LinkerOptions(arch=ARCH, no_cache=True), + ], +) +def test_linker_init(compile_ptx_functions, options): + linker = Linker(*compile_ptx_functions, options=options) + object_code = linker.link("cubin") + assert isinstance(object_code, ObjectCode) + + +def test_linker_init_invalid_arch(): + options = LinkerOptions(arch=None) + with pytest.raises(TypeError): + Linker(options) + + +def test_linker_link_ptx(compile_ltoir_functions): + options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) + linker = Linker(*compile_ltoir_functions, options=options) + linked_code = linker.link("ptx") + assert isinstance(linked_code, ObjectCode) + + +def test_linker_link_cubin(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linked_code = linker.link("cubin") + assert isinstance(linked_code, ObjectCode) + + +def test_linker_link_invalid_target_type(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + with pytest.raises(ValueError): + 
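+        # Linker.link accepts only "cubin" or "ptx"; any other target type raises ValueError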
linker.link("invalid_target") + + +def test_linker_get_error_log(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linker.link("cubin") + log = linker.get_error_log() + assert isinstance(log, str) + + +def test_linker_get_info_log(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linker.link("cubin") + log = linker.get_info_log() + assert isinstance(log, str) From 1b5f01974d92e2fef030ecc9e1da701ae221cd30 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:09:37 -0800 Subject: [PATCH 002/111] fix commit --- cuda_core/cuda/core/experimental/_linker.py | 26 ++++++++++----------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index e80bfe61..3a47b439 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -1,3 +1,4 @@ +import weakref from dataclasses import dataclass from typing import List, Optional @@ -152,9 +153,7 @@ class LinkerOptions: xptxas: Optional[List[str]] = None split_compile: Optional[int] = None split_compile_extended: Optional[int] = None - jump_table_density: Optional[int] = None no_cache: Optional[bool] = None - device_stack_protector: Optional[bool] = None def __post_init__(self): self.formatted_options = [] @@ -199,26 +198,25 @@ def __post_init__(self): self.formatted_options.append(f"-split-compile={self.split_compile}") if self.split_compile_extended is not None: self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") - if self.jump_table_density is not None: - self.formatted_options.append(f"-jump-table-density={self.jump_table_density}") if self.no_cache is not None: self.formatted_options.append("-no-cache") - if self.device_stack_protector is not None: - self.formatted_options.append("-device-stack-protector") class Linker: - __slots__ = "_handle" + __slots__ = ("__weakref__", "_handle", "_options") def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): - self._handle = None options = check_or_create_options(LinkerOptions, options, "Linker options") self._handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) - if object_codes is not None: - for code in object_codes: - assert isinstance(code, ObjectCode) - self._add_code_object(code) + if len(object_codes) == 0: + raise ValueError("At least one ObjectCode object must be provided") + + for code in object_codes: + assert isinstance(code, ObjectCode) + self._add_code_object(code) + + weakref.finalize(self, self.close) def _add_code_object(self, object_code: ObjectCode): data = object_code._module @@ -233,7 +231,7 @@ def _add_code_object(self, object_code: ObjectCode): def link(self, target_type) -> ObjectCode: nvjitlink.complete(self._handle) - if target_type not in ["cubin", "ptx"]: + if target_type not in ("cubin", "ptx"): raise ValueError(f"Unsupported target type: {target_type}") code = None if target_type == "cubin": @@ -279,7 +277,7 @@ def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: def handle(self) -> int: return self._handle - def __del__(self): + def close(self): if self._handle is not None: nvjitlink.destroy(self._handle) self._handle = None From 58ce68f06841ebaae4bb6c4789c68fb8a16ec1e6 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:10:42 -0800 Subject: [PATCH 003/111] 
fix commit --- cuda_core/cuda/core/experimental/_linker.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 3a47b439..518c48d3 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -113,25 +113,11 @@ class LinkerOptions: Default: 1 Acceptable value type: int Maps to: -split-compile-extended= - jump_table_density : int, optional - When doing LTO, specify the case density percentage in switch statements, and use it as a minimal threshold to - determine whether jump table (brx.idx instruction) will be used to implement a switch statement. Default value - is 101. The percentage ranges from 0 to 101 inclusively. - Default: 101 - Acceptable value type: int - Maps to: -jump-table-density= no_cache : bool, optional Do not cache the intermediate steps of nvJitLink. Default: False Acceptable value type: bool Maps to: -no-cache - device_stack_protector : bool, optional - Enable stack canaries in device code. Stack canaries make it more difficult to exploit certain types of memory - safety bugs involving stack-local variables. The compiler uses heuristics to assess the risk of such a bug in - each function. Only those functions which are deemed high-risk make use of a stack canary. - Default: False - Acceptable value type: bool - Maps to: -device-stack-protector """ arch: str From ce8a47233786466d2e4d7335e518e0070dcf86ea Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:12:14 -0800 Subject: [PATCH 004/111] keep self._options for debugging --- cuda_core/cuda/core/experimental/_linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 518c48d3..cf4c6ccd 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -192,7 +192,7 @@ class Linker: __slots__ = ("__weakref__", "_handle", "_options") def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): - options = check_or_create_options(LinkerOptions, options, "Linker options") + self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") self._handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) if len(object_codes) == 0: From ab35b373ddda7b4177853d1c348a3b6027fb391f Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:13:41 -0800 Subject: [PATCH 005/111] revert release notes change --- cuda_core/docs/source/release/0.1.0-notes.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/docs/source/release/0.1.0-notes.md b/cuda_core/docs/source/release/0.1.0-notes.md index 1ebb41f9..2131ed90 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.md +++ b/cuda_core/docs/source/release/0.1.0-notes.md @@ -1,9 +1,9 @@ # `cuda.core` Release notes -Released on Nov XX, 2024 +Released on Nov 8, 2024 ## Hightlights -- Initial EA1 (early access) release +- Initial beta release - Supports all platforms that CUDA is supported - Supports all CUDA 11.x/12.x drivers - Supports all CUDA 11.x/12.x Toolkits From b82591fc70adb26023ddaf1ddc0fb2e5c4881b4c Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:14:31 -0800 Subject: [PATCH 006/111] update linker test --- cuda_core/tests/test_linker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cuda_core/tests/test_linker.py 
b/cuda_core/tests/test_linker.py index 6011bf4f..2dfac375 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -1,8 +1,7 @@ import pytest -from cuda.core.experimental._linker import Linker, LinkerOptions +from cuda.core.experimental import Linker, LinkerOptions, Program from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._program import Program ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper empty_entrypoint_kernel = "__global__ void A() {}" From 265ba01c7ef586177afb877e0f2bbea42c80528d Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:26:20 -0800 Subject: [PATCH 007/111] update the test --- cuda_core/tests/test_linker.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 2dfac375..7db6ed9f 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -9,7 +9,7 @@ addition_kernel = "__device__ int C(int a, int b) { return a + b; }" -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): object_code_a_ptx = Program(empty_entrypoint_kernel, "c++").compile("ptx") object_code_b_ptx = Program(empty_kernel, "c++").compile("ptx") @@ -18,7 +18,7 @@ def compile_ptx_functions(init_cuda): return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): object_code_a_ltoir = Program(empty_entrypoint_kernel, "c++").compile("ltoir", options=("-dlto",)) object_code_b_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) @@ -47,7 +47,6 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, xptxas=["-v"]), LinkerOptions(arch=ARCH, split_compile=0), LinkerOptions(arch=ARCH, split_compile_extended=1), - LinkerOptions(arch=ARCH, jump_table_density=100), LinkerOptions(arch=ARCH, no_cache=True), ], ) From 7d5c742d1d9a2ac1676811002b1d511ff655dc57 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 16:39:48 -0800 Subject: [PATCH 008/111] add the system class --- cuda_core/cuda/core/experimental/__init__.py | 1 + cuda_core/cuda/core/experimental/_system.py | 66 ++++++++++++++++++++ cuda_core/docs/source/api.rst | 1 + cuda_core/docs/source/api_private.rst | 1 + cuda_core/docs/source/release.md | 2 + cuda_core/docs/source/release/0.2.0-notes.md | 10 +++ cuda_core/tests/test_system.py | 37 +++++++++++ 7 files changed, 118 insertions(+) create mode 100644 cuda_core/cuda/core/experimental/_system.py create mode 100644 cuda_core/docs/source/release/0.2.0-notes.md create mode 100644 cuda_core/tests/test_system.py diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 9b978398..25f5f82c 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -7,3 +7,4 @@ from cuda.core.experimental._launcher import LaunchConfig, launch from cuda.core.experimental._program import Program from cuda.core.experimental._stream import Stream, StreamOptions +from cuda.core.experimental._system import system diff --git a/cuda_core/cuda/core/experimental/_system.py b/cuda_core/cuda/core/experimental/_system.py new file mode 100644 index 00000000..58fbd6ae --- /dev/null +++ b/cuda_core/cuda/core/experimental/_system.py @@ -0,0 +1,66 @@ +from typing import Tuple +from cuda import cuda, cudart +from cuda.core.experimental._device import 
Device +from cuda.core.experimental._utils import handle_return + +class System: + """ Provide information about the cuda system. + This class is a singleton and should not be instantiated directly. + """ + + _instance = None + + def __new__(cls): + if cls._instance is None: + cls._instance = super(System, cls).__new__(cls) + return cls._instance + + def __init__(self): + if hasattr(self, '_initialized') and self._initialized: + return + self._initialized = True + + @property + def driver_version(self) -> Tuple[int, int]: + """ + Query the CUDA driver version. + + Returns + ------- + tuple of int + A 2-tuple of (major, minor) version numbers. + """ + version = handle_return(cuda.cuDriverGetVersion()) + major = version // 1000 + minor = (version % 1000) // 10 + return (major, minor) + + @property + def num_devices(self) -> int: + """ + Query the number of available GPUs. + + Returns + ------- + int + The number of available GPU devices. + """ + return handle_return(cudart.cudaGetDeviceCount()) + + @property + def devices(self) -> tuple: + """ + Query the available device instances. + + Returns + ------- + tuple of Device + A tuple containing instances of available devices. + """ + total = self.num_devices + return tuple(Device(device_id) for device_id in range(total)) + +system = System() +system.__doc__ = """ +Singleton instance of the :obj:`~cuda.core.experimental._system.System` class. +""" diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 1cb9811b..3d2a8481 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -16,6 +16,7 @@ CUDA runtime Device launch + system :template: dataclass.rst diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index f100eb7c..a833d69c 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -16,6 +16,7 @@ CUDA runtime _memory.Buffer _stream.Stream _event.Event + _system.System CUDA compilation toolchain diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 48e24786..5cbaa7f2 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -6,4 +6,6 @@ maxdepth: 3 --- 0.1.0 + 0.2.0 + ``` diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.md new file mode 100644 index 00000000..e1a3c4ec --- /dev/null +++ b/cuda_core/docs/source/release/0.2.0-notes.md @@ -0,0 +1,10 @@ +# `cuda.core` Release notes + +Released on , 2024 + +## Hightlights +- Addition of the system singleton + +## Limitations + + diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py new file mode 100644 index 00000000..548e8685 --- /dev/null +++ b/cuda_core/tests/test_system.py @@ -0,0 +1,37 @@ +# test_System.py + +try: + from cuda.bindings import driver, runtime +except ImportError: + from cuda import cuda as driver + from cuda import cudart as runtime + +from cuda.core.experimental import Device, System + +from cuda.core.experimental import Device +from cuda.core.experimental._utils import handle_return + +def test_System_singleton(): + System1 = System + System2 = System + assert System1 is System2, "System is not a singleton" + +def test_driver_version(): + driver_version = System.driver_version + print(driver_version) + version = handle_return(driver.cuDriverGetVersion()) + expected_driver_version = (version // 1000, (version % 1000) // 10) + assert driver_version == expected_driver_version, "Driver version does not match expected 
value" + +def test_num_devices(): + num_devices = System.num_devices + expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) + assert num_devices == expected_num_devices, "Number of devices does not match expected value" + +def test_devices(): + devices = System.devices + expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) + expected_devices = tuple(Device(device_id) for device_id in range(expected_num_devices)) + assert len(devices) == len(expected_devices), "Number of devices does not match expected value" + for device, expected_device in zip(devices, expected_devices): + assert device.device_id == expected_device.device_id, "Device ID does not match expected value" From 4c4acef6f840ebce13dcf41317f447d448420ae6 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Thu, 28 Nov 2024 16:48:59 -0800 Subject: [PATCH 009/111] fix old test change --- cuda_core/tests/test_system.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py index 548e8685..400d5e22 100644 --- a/cuda_core/tests/test_system.py +++ b/cuda_core/tests/test_system.py @@ -1,35 +1,33 @@ -# test_System.py - try: from cuda.bindings import driver, runtime except ImportError: from cuda import cuda as driver from cuda import cudart as runtime -from cuda.core.experimental import Device, System +from cuda.core.experimental import Device, system from cuda.core.experimental import Device from cuda.core.experimental._utils import handle_return -def test_System_singleton(): - System1 = System - System2 = System - assert System1 is System2, "System is not a singleton" +def test_system_singleton(): + system1 = system + system2 = system + assert system1 is system2, "system is not a singleton" def test_driver_version(): - driver_version = System.driver_version + driver_version = system.driver_version print(driver_version) version = handle_return(driver.cuDriverGetVersion()) expected_driver_version = (version // 1000, (version % 1000) // 10) assert driver_version == expected_driver_version, "Driver version does not match expected value" def test_num_devices(): - num_devices = System.num_devices + num_devices = system.num_devices expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) assert num_devices == expected_num_devices, "Number of devices does not match expected value" def test_devices(): - devices = System.devices + devices = system.devices expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) expected_devices = tuple(Device(device_id) for device_id in range(expected_num_devices)) assert len(devices) == len(expected_devices), "Number of devices does not match expected value" From 36f045c6a1a834fc28f8652d348ac281ca827a15 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Thu, 28 Nov 2024 16:50:18 -0800 Subject: [PATCH 010/111] run ruff manually --- cuda_core/cuda/core/experimental/_system.py | 4 +++- cuda_core/tests/test_system.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_system.py b/cuda_core/cuda/core/experimental/_system.py index 58fbd6ae..c1ce9402 100644 --- a/cuda_core/cuda/core/experimental/_system.py +++ b/cuda_core/cuda/core/experimental/_system.py @@ -1,8 +1,10 @@ from typing import Tuple + from cuda import cuda, cudart from cuda.core.experimental._device import Device from cuda.core.experimental._utils import handle_return + class System: """ Provide information about the cuda system. 
This class is a singleton and should not be instantiated directly. @@ -12,7 +14,7 @@ class System: def __new__(cls): if cls._instance is None: - cls._instance = super(System, cls).__new__(cls) + cls._instance = super().__new__(cls) return cls._instance def __init__(self): diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py index 400d5e22..a093dc94 100644 --- a/cuda_core/tests/test_system.py +++ b/cuda_core/tests/test_system.py @@ -5,10 +5,9 @@ from cuda import cudart as runtime from cuda.core.experimental import Device, system - -from cuda.core.experimental import Device from cuda.core.experimental._utils import handle_return + def test_system_singleton(): system1 = system system2 = system From 319a372b75b0530e7f4600bbdc34197db2bf420c Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 29 Nov 2024 11:17:00 -0800 Subject: [PATCH 011/111] merge with main for ruff --- cuda_core/cuda/core/experimental/_device.py | 12 +++++-- cuda_core/cuda/core/experimental/_memory.py | 37 ++++++++++++++++++++- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 0c03c789..a5cd4bc7 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -7,7 +7,7 @@ from cuda import cuda, cudart from cuda.core.experimental._context import Context, ContextOptions -from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool +from cuda.core.experimental._memory import Buffer, MemoryResource, _AsyncMemoryResource, _DefaultAsyncMempool from cuda.core.experimental._stream import Stream, StreamOptions, default_stream from cuda.core.experimental._utils import ComputeCapability, CUDAError, handle_return, precondition @@ -62,7 +62,13 @@ def __new__(cls, device_id=None): for dev_id in range(total): dev = super().__new__(cls) dev._id = dev_id - dev._mr = _DefaultAsyncMempool(dev_id) + # If the device is in TCC mode, or does not support memory pools for some other reason, + # use the AsyncMemoryResource which does not use memory pools. + if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 0: + dev._mr = _AsyncMemoryResource(dev_id) + else: + dev._mr = _DefaultAsyncMempool(dev_id) + dev._has_inited = False _tls.devices.append(dev) @@ -70,7 +76,7 @@ def __new__(cls, device_id=None): def _check_context_initialized(self, *args, **kwargs): if not self._has_inited: - raise CUDAError("the device is not yet initialized, perhaps you forgot to call .set_current() first?") + raise CUDAError("the device is not yet initialized, " "perhaps you forgot to call .set_current() first?") @property def device_id(self) -> int: diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 415b5151..50f8a260 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -42,7 +42,11 @@ class Buffer: """ # TODO: handle ownership? 
(_mr could be None) - __slots__ = ("_ptr", "_size", "_mr") + __slots__ = ( + "_ptr", + "_size", + "_mr", + ) def __init__(self, ptr, size, mr: MemoryResource = None): self._ptr = ptr @@ -286,3 +290,34 @@ def is_host_accessible(self) -> bool: @property def device_id(self) -> int: raise RuntimeError("the pinned memory resource is not bound to any GPU") + + +class _AsyncMemoryResource(MemoryResource): + __slots__ = ("_dev_id",) + + def __init__(self, dev_id): + self._handle = None + self._dev_id = dev_id + + def allocate(self, size, stream=None) -> Buffer: + if stream is None: + stream = default_stream() + ptr = handle_return(cuda.cuMemAllocAsync(size, stream._handle)) + return Buffer(ptr, size, self) + + def deallocate(self, ptr, size, stream=None): + if stream is None: + stream = default_stream() + handle_return(cuda.cuMemFreeAsync(ptr, stream._handle)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return False + + @property + def device_id(self) -> int: + return self._dev_id From 19e3a4f4b54a4b9562742d8575ad1f8ca7e6e0a7 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 29 Nov 2024 11:18:04 -0800 Subject: [PATCH 012/111] fix tuple reformat --- cuda_core/cuda/core/experimental/_memory.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 50f8a260..26a9dd82 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -42,11 +42,7 @@ class Buffer: """ # TODO: handle ownership? (_mr could be None) - __slots__ = ( - "_ptr", - "_size", - "_mr", - ) + __slots__ = ("_ptr", "_size", "_mr") def __init__(self, ptr, size, mr: MemoryResource = None): self._ptr = ptr From 5e84da7cf888214ba940176a28089467f2afb055 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 29 Nov 2024 11:18:45 -0800 Subject: [PATCH 013/111] fix tuple reformat --- cuda_core/cuda/core/experimental/_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index a5cd4bc7..a15eef36 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -76,7 +76,7 @@ def __new__(cls, device_id=None): def _check_context_initialized(self, *args, **kwargs): if not self._has_inited: - raise CUDAError("the device is not yet initialized, " "perhaps you forgot to call .set_current() first?") + raise CUDAError("the device is not yet initialized, perhaps you forgot to call .set_current() first?") @property def device_id(self) -> int: From 122d25c01f4b8bbc02239bb1c2e58005c4bdb506 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 09:25:39 -0800 Subject: [PATCH 014/111] switch to sync alloc and free --- cuda_core/cuda/core/experimental/_device.py | 6 +++--- cuda_core/cuda/core/experimental/_memory.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index a15eef36..889c20a0 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -7,7 +7,7 @@ from cuda import cuda, cudart from cuda.core.experimental._context import Context, ContextOptions -from cuda.core.experimental._memory import Buffer, MemoryResource, _AsyncMemoryResource, _DefaultAsyncMempool +from cuda.core.experimental._memory 
import Buffer, MemoryResource, _AsyncMemoryResource, _DefaultAsyncMempool
+from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource
 from cuda.core.experimental._stream import Stream, StreamOptions, default_stream
 from cuda.core.experimental._utils import ComputeCapability, CUDAError, handle_return, precondition
@@ -63,9 +63,9 @@ def __new__(cls, device_id=None):
             dev = super().__new__(cls)
             dev._id = dev_id
             # If the device is in TCC mode, or does not support memory pools for some other reason,
-            # use the AsyncMemoryResource which does not use memory pools.
+            # use the SynchronousMemoryResource which does not use memory pools.
             if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 0:
-                dev._mr = _AsyncMemoryResource(dev_id)
+                dev._mr = _SynchronousMemoryResource(dev_id)
             else:
                 dev._mr = _DefaultAsyncMempool(dev_id)
diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py
index 26a9dd82..16dd97d7 100644
--- a/cuda_core/cuda/core/experimental/_memory.py
+++ b/cuda_core/cuda/core/experimental/_memory.py
@@ -288,7 +288,7 @@ def device_id(self) -> int:
         raise RuntimeError("the pinned memory resource is not bound to any GPU")
 
 
-class _AsyncMemoryResource(MemoryResource):
+class _SynchronousMemoryResource(MemoryResource):
     __slots__ = ("_dev_id",)
 
     def __init__(self, dev_id):
@@ -298,13 +298,13 @@ def __init__(self, dev_id):
     def allocate(self, size, stream=None) -> Buffer:
         if stream is None:
             stream = default_stream()
-        ptr = handle_return(cuda.cuMemAllocAsync(size, stream._handle))
+        ptr = handle_return(cuda.cuMemAlloc(size, stream._handle))
         return Buffer(ptr, size, self)
 
     def deallocate(self, ptr, size, stream=None):
         if stream is None:
             stream = default_stream()
-        handle_return(cuda.cuMemFreeAsync(ptr, stream._handle))
+        handle_return(cuda.cuMemFree(ptr, stream._handle))
 
     @property
     def is_device_accessible(self) -> bool:

From 5f8ff802ee9efba50492870410d14d8633471cde Mon Sep 17 00:00:00 2001
From: Keenan Simpson
Date: Mon, 2 Dec 2024 09:28:56 -0800
Subject: [PATCH 015/111] Update cuda_core/docs/source/release/0.2.0-notes.md

Co-authored-by: Leo Fang
---
 cuda_core/docs/source/release/0.2.0-notes.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.md
index e1a3c4ec..57a3254d 100644
--- a/cuda_core/docs/source/release/0.2.0-notes.md
+++ b/cuda_core/docs/source/release/0.2.0-notes.md
@@ -3,7 +3,7 @@
 Released on , 2024
 
 ## Hightlights
-- Addition of the system singleton
+- Add a `cuda.core.experimental.system` module for querying system- or process- wide information.
## Limitations From d1d6928d6be107087f534a7dc37bf9c8dbdc9463 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Mon, 2 Dec 2024 09:44:28 -0800 Subject: [PATCH 016/111] Update cuda_core/docs/source/release.md Co-authored-by: Leo Fang --- cuda_core/docs/source/release.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 5cbaa7f2..8c810273 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -6,6 +6,6 @@ maxdepth: 3 --- 0.1.0 - 0.2.0 + 0.1.1 ``` From cfa9d167bcd39504fa8a6963f1fafd836cda2623 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 09:56:23 -0800 Subject: [PATCH 017/111] address comments --- cuda_core/docs/source/release/0.2.0-notes.md | 10 ---------- cuda_core/tests/test_system.py | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) delete mode 100644 cuda_core/docs/source/release/0.2.0-notes.md diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.md deleted file mode 100644 index e1a3c4ec..00000000 --- a/cuda_core/docs/source/release/0.2.0-notes.md +++ /dev/null @@ -1,10 +0,0 @@ -# `cuda.core` Release notes - -Released on , 2024 - -## Hightlights -- Addition of the system singleton - -## Limitations - - diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py index a093dc94..893d1206 100644 --- a/cuda_core/tests/test_system.py +++ b/cuda_core/tests/test_system.py @@ -11,7 +11,7 @@ def test_system_singleton(): system1 = system system2 = system - assert system1 is system2, "system is not a singleton" + assert id(system1) == id(system2), "system is not a singleton" def test_driver_version(): driver_version = system.driver_version From 8e43cd26b30d0b34526260c5cd60bdadeecb3e4d Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 09:57:16 -0800 Subject: [PATCH 018/111] rename release file --- cuda_core/docs/source/release/0.1.1-notes.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 cuda_core/docs/source/release/0.1.1-notes.md diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md new file mode 100644 index 00000000..404ecb85 --- /dev/null +++ b/cuda_core/docs/source/release/0.1.1-notes.md @@ -0,0 +1,7 @@ +# `cuda.core` Release notes + +Released on , 2024 + +## Hightlights +- Add a `cuda.core.experimental.system` module for querying system- or process- wide information. + From bff2627fa70a446c337fc987d8165f78987feae9 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 10:33:30 -0800 Subject: [PATCH 019/111] update link style to match other PR --- cuda_core/cuda/core/experimental/_system.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_system.py b/cuda_core/cuda/core/experimental/_system.py index c1ce9402..2cecbd98 100644 --- a/cuda_core/cuda/core/experimental/_system.py +++ b/cuda_core/cuda/core/experimental/_system.py @@ -64,5 +64,5 @@ def devices(self) -> tuple: system = System() system.__doc__ = """ -Singleton instance of the :obj:`~cuda.core.experimental._system.System` class. +Singleton instance of the :obj:`_system.System` class. 
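+It exposes the CUDA driver version, the device count, and the available :obj:`Device` instances.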
""" From c8a8dcb0a682ab754e1d036c68dc312a0b97608d Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 13:24:50 -0800 Subject: [PATCH 020/111] save --- cuda_core/cuda/core/experimental/_linker.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index cf4c6ccd..d7dd273c 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import weakref from dataclasses import dataclass from typing import List, Optional From 5e3bdcd97e6ccc13513f15849f106addab9fb72a Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 13:26:15 -0800 Subject: [PATCH 021/111] add copyright header --- cuda_core/cuda/core/experimental/_system.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_system.py b/cuda_core/cuda/core/experimental/_system.py index 2cecbd98..258f9bcd 100644 --- a/cuda_core/cuda/core/experimental/_system.py +++ b/cuda_core/cuda/core/experimental/_system.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from typing import Tuple from cuda import cuda, cudart From e9661895fff2a5d928be73b521aee30e4960935e Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 13:33:31 -0800 Subject: [PATCH 022/111] add docstring, copyright header, and switch finalizer pattern --- cuda_core/cuda/core/experimental/_linker.py | 68 ++++++++++++++++----- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index d7dd273c..1a99f355 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -193,11 +193,49 @@ def __post_init__(self): class Linker: - __slots__ = ("__weakref__", "_handle", "_options") + """ + Linker class for managing the linking of object codes with specified options. + + Parameters + ---------- + object_codes : ObjectCode + One or more ObjectCode objects to be linked. + options : LinkerOptions, optional + Options for the linker. If not provided, default options will be used. + + Attributes + ---------- + _options : LinkerOptions + The options used for the linker. + _handle : handle + The handle to the linker created by nvjitlink. + + Methods + ------- + _add_code_object(object_code) + Adds an object code to the linker. + close() + Closes the linker and releases resources. 
+ """ + + class _MembersNeededForFinalize: + __slots__ = ("handle",) + + def __init__(self, program_obj, handle): + self.handle = handle + weakref.finalize(program_obj, self.close) + + def close(self): + if self.handle is not None: + nvjitlink.destroy(self.handle) + self.handle = None + + __slots__ = ("__weakref__", "_mnff", "_options") def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") - self._handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) + self._mnff.handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) + self._mnff = Linker._MembersNeededForFinalize(self, None) if len(object_codes) == 0: raise ValueError("At least one ObjectCode object must be provided") @@ -212,7 +250,7 @@ def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) nvjitlink.add_data( - self._handle, + self._mnff.handle, self._input_type_from_code_type(object_code._code_type), data, len(data), @@ -220,31 +258,31 @@ def _add_code_object(self, object_code: ObjectCode): ) def link(self, target_type) -> ObjectCode: - nvjitlink.complete(self._handle) + nvjitlink.complete(self._mnff.handle) if target_type not in ("cubin", "ptx"): raise ValueError(f"Unsupported target type: {target_type}") code = None if target_type == "cubin": - cubin_size = nvjitlink.get_linked_cubin_size(self._handle) + cubin_size = nvjitlink.get_linked_cubin_size(self._mnff.handle) code = bytearray(cubin_size) - nvjitlink.get_linked_cubin(self._handle, code) + nvjitlink.get_linked_cubin(self._mnff.handle, code) else: - ptx_size = nvjitlink.get_linked_ptx_size(self._handle) + ptx_size = nvjitlink.get_linked_ptx_size(self._mnff.handle) code = bytearray(ptx_size) - nvjitlink.get_linked_ptx(self._handle, code) + nvjitlink.get_linked_ptx(self._mnff.handle, code) return ObjectCode(bytes(code), target_type) def get_error_log(self) -> str: - log_size = nvjitlink.get_error_log_size(self._handle) + log_size = nvjitlink.get_error_log_size(self._mnff.handle) log = bytearray(log_size) - nvjitlink.get_error_log(self._handle, log) + nvjitlink.get_error_log(self._mnff.handle, log) return log.decode() def get_info_log(self) -> str: - log_size = nvjitlink.get_info_log_size(self._handle) + log_size = nvjitlink.get_info_log_size(self._mnff.handle) log = bytearray(log_size) - nvjitlink.get_info_log(self._handle, log) + nvjitlink.get_info_log(self._mnff.handle, log) return log.decode() def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: @@ -265,9 +303,7 @@ def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: @property def handle(self) -> int: - return self._handle + return self._mnff.handle def close(self): - if self._handle is not None: - nvjitlink.destroy(self._handle) - self._handle = None + self._mnff.close() From c626b956bc7ca1cc963b89bceafc3dfc3b0f84aa Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Mon, 23 Sep 2024 18:03:30 +0530 Subject: [PATCH 023/111] Adding support for CI testing. 
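
Roughly, the test action added here funnels into the same entrypoint wrapper
as the build step; a CI run amounts to (paths as laid out in this series):

    ./continuous_integration/scripts/entrypoint \
        ./continuous_integration/scripts/test ci

where "ci" is the test-options value forwarded from the workflow's build-type.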
--- .github/actions/build/action.yml | 2 +- .github/actions/test/action.yml | 37 ++++++++++ .github/workflows/gh-build-and-test.yml | 23 ++++++- .github/workflows/gh-test.yml | 80 ++++++++++++++++++++++ continuous_integration/scripts/setup-utils | 23 +++++++ continuous_integration/scripts/test | 33 +++++++++ 6 files changed, 195 insertions(+), 3 deletions(-) create mode 100644 .github/actions/test/action.yml create mode 100644 .github/workflows/gh-test.yml create mode 100755 continuous_integration/scripts/test diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index 952fb9cd..b6741343 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -54,7 +54,7 @@ runs: --rm "${{ inputs.docker-image }}" \ /bin/bash -c "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint ${{ env.REPO_DIR }}/continuous_integration/scripts/build ${{ inputs.build-type}} ${{ inputs.target-device }}" - - if: ${{ !inputs.use-container }} + - if: ${{ !inputs.use-container && steps.cache-build.outputs.cache-hit != 'true'}} name: Build (without container) shell: bash --noprofile --norc -xeuo pipefail {0} run: | diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml new file mode 100644 index 00000000..a11a9938 --- /dev/null +++ b/.github/actions/test/action.yml @@ -0,0 +1,37 @@ +name: test + +description: Run tests in specified project + +inputs: + test-options: + required: true + type: string + runner-has-gpu: + required: true + type: boolean + description: "The runner has GPU(s)." + +runs: + using: composite + steps: + - if: ${{ inputs.runner-has-gpu == true }} + name: Run nvidia-smi to make sure GPU is working + shell: bash --noprofile --norc -xeuo pipefail {0} + run: nvidia-smi + + - name: Download build artifacts + uses: actions/download-artifact@v4 + with: + name: ${{ env.ARTIFACT_NAME }} + path: ${{ env.ARTIFACTS_DIR }} + + - name: Display structure of downloaded artifacts + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + pwd + ls -lahR $ARTIFACTS_DIR + + - name: Run test / analysis + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint" "${{ env.REPO_DIR }}/continuous_integration/scripts/test" ${{ inputs.test-options }} diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 430fbf5b..1df308ab 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -27,9 +27,28 @@ jobs: with: client-repo: ${{ github.event.repository.name }} target-device: ${{ inputs.target-device }} - runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu16') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} + runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu8') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} + build-type: ${{ inputs.build-type }} + use-container: false + host-platform: ${{ inputs.host-platform }} + dependencies-file: "" + build-mode: ${{ inputs.build-mode }} + upload-enabled: ${{ inputs.upload-enabled }} + secrets: inherit + + test: + if: ${{ github.repository_owner == 'nvidia' }} + needs: + - build + uses: + ./.github/workflows/gh-test.yml + with: + client-repo: ${{ github.event.repository.name }} + target-device: ${{ inputs.target-device }} + test-options: ${{ inputs.build-type }} + 
runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} + runner-has-gpu: ${{ inputs.host-platform == 'linux-x64' }} build-type: ${{ inputs.build-type }} - use-container: ${{ inputs.host-platform == 'linux-x64' || inputs.host-platform == 'linux-aarch64'}} host-platform: ${{ inputs.host-platform }} dependencies-file: "" build-mode: ${{ inputs.build-mode }} diff --git a/.github/workflows/gh-test.yml b/.github/workflows/gh-test.yml new file mode 100644 index 00000000..74f1c520 --- /dev/null +++ b/.github/workflows/gh-test.yml @@ -0,0 +1,80 @@ +name: Test + +on: + workflow_call: + inputs: + client-repo: + required: true + type: string + target-device: + required: true + type: string + test-options: + required: true + type: string + runs-on: + required: true + type: string + runner-has-gpu: + required: true + type: boolean + description: "The runner has GPU(s)." + build-type: + required: true + type: string + description: One of ci / release + host-platform: + required: true + type: string + dependencies-file: + required: true + type: string + description: path to versions.json relative to the target repo dir + build-mode: + required: true + type: string + upload-enabled: + required: true + type: boolean + python-version: + required: false + type: string + +jobs: + build: + name: Test (${{ inputs.host-platform }}, ${{ inputs.target-device }}, ${{ inputs.build-type }}, CMake build-mode=${{ inputs.build-mode }}, Python "${{ inputs.python-version }}", Use container=${{ inputs.use-container }} ) + + permissions: + id-token: write # This is required for configure-aws-credentials + contents: read # This is required for actions/checkout + + runs-on: ${{ inputs.runs-on }} + + container: + options: -u root --security-opt seccomp=unconfined --privileged --shm-size 16g + image: condaforge/miniforge3:latest + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + + steps: + - name: Checkout ${{ inputs.client-repo }} + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup + uses: ./.github/actions/setup + with: + client-repo: ${{ inputs.client-repo }} + build-type: ${{ inputs.build-type }} + target-device: "${{ inputs.target-device }}" + host-platform: ${{ inputs.host-platform }} + build-mode: ${{ inputs.build-mode }} + upload-enabled: ${{ inputs.upload-enabled }} + python-version: ${{ inputs.python-version }} + + - name: Call test action + uses: ./.github/actions/test + with: + test-options: ${{ inputs.test-options }} + runner-has-gpu: ${{ inputs.runner-has-gpu }} diff --git a/continuous_integration/scripts/setup-utils b/continuous_integration/scripts/setup-utils index 62579e63..f8faefa4 100755 --- a/continuous_integration/scripts/setup-utils +++ b/continuous_integration/scripts/setup-utils @@ -151,6 +151,29 @@ init_build_env() { make-conda-env "$BUILD_TYPE"; + activate_conda_env; + conda_info; +} + +init_test_env() { + set -x; + + . 
conda-utils; + + export TEST_TYPE=$1 + + set -xeuo pipefail; + + set_base_defs; + + cd "$PREBUILD_DIR" + + # setup_test_env; + + cd "$REPO_DIR"; + + make-conda-env "$TEST_TYPE"; + activate_conda_env; conda_info; } \ No newline at end of file diff --git a/continuous_integration/scripts/test b/continuous_integration/scripts/test new file mode 100755 index 00000000..e8c56c52 --- /dev/null +++ b/continuous_integration/scripts/test @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +test_ci() { + set -xeou pipefail + + cd "${ARTIFACTS_DIR}" + + activate_conda_env; + + pip install *.whl + + cd "${REPO_DIR}" + + python -m pytest +} + +test_project() { + set -xeou pipefail + + export PYTHONUNBUFFERED=1 + + . setup-utils; + init_test_env "$@"; + + git config --global --add safe.directory "$REPO_DIR/.git" + + case "${TEST_TYPE}" in + ci) test_ci;; + *) return 1;; + esac +} + +(test_project "$@"); From 5467b5284c3467f8f3a41570a7df108049389f42 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Wed, 27 Nov 2024 04:18:20 +0530 Subject: [PATCH 024/111] Supply python-version. --- .github/workflows/gh-build-and-test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 1df308ab..adf8477a 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -34,6 +34,7 @@ jobs: dependencies-file: "" build-mode: ${{ inputs.build-mode }} upload-enabled: ${{ inputs.upload-enabled }} + python-version: ${{ inputs.python-version }} secrets: inherit test: From c78ebfdcaa92ea40ade3014a3da952e5a11dc8e6 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Wed, 27 Nov 2024 04:36:17 +0530 Subject: [PATCH 025/111] Update test driver to test bindings and core separately. --- .github/actions/test/action.yml | 22 +++++++++++++++++----- continuous_integration/scripts/test | 13 +++++++++---- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index a11a9938..018db9aa 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -19,17 +19,29 @@ runs: shell: bash --noprofile --norc -xeuo pipefail {0} run: nvidia-smi - - name: Download build artifacts + - name: Download bindings build artifacts uses: actions/download-artifact@v4 with: - name: ${{ env.ARTIFACT_NAME }} - path: ${{ env.ARTIFACTS_DIR }} + name: ${{ env.BINDINGS_ARTIFACT_NAME }} + path: ${{ env.BINDINGS_ARTIFACTS_DIR }} - - name: Display structure of downloaded artifacts + - name: Display structure of downloaded bindings artifacts shell: bash --noprofile --norc -xeuo pipefail {0} run: | pwd - ls -lahR $ARTIFACTS_DIR + ls -lahR $BINDINGS_ARTIFACTS_DIR + + - name: Download core build artifacts + uses: actions/download-artifact@v4 + with: + name: ${{ env.CORE_ARTIFACT_NAME }} + path: ${{ env.CORE_ARTIFACTS_DIR }} + + - name: Display structure of downloaded core build artifacts + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + pwd + ls -lahR $CORE_ARTIFACTS_DIR - name: Run test / analysis shell: bash --noprofile --norc -xeuo pipefail {0} diff --git a/continuous_integration/scripts/test b/continuous_integration/scripts/test index e8c56c52..96bdf8d5 100755 --- a/continuous_integration/scripts/test +++ b/continuous_integration/scripts/test @@ -3,15 +3,20 @@ test_ci() { set -xeou pipefail - cd "${ARTIFACTS_DIR}" - activate_conda_env; + cd "${BINDINGS_ARTIFACTS_DIR}" + pip install *.whl + + cd "${CORE_ARTIFACTS_DIR}" pip install *.whl - cd "${REPO_DIR}" + cd 
"${REPO_DIR}/cuda_python/cuda_bindings" + python -m pytest tests/ + + cd "${REPO_DIR}/cuda_python/cuda_core" + python -m pytest tests/ - python -m pytest } test_project() { From e5bf104ddf6ec94ec36ed74b68d148003fe8b6da Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Mon, 23 Sep 2024 18:03:30 +0530 Subject: [PATCH 026/111] Adding support for CI testing. --- .github/workflows/gh-build-and-test.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index adf8477a..acf7e509 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -56,3 +56,21 @@ jobs: upload-enabled: ${{ inputs.upload-enabled }} python-version: ${{ inputs.python-version }} secrets: inherit + + test: + if: ${{ github.repository_owner == 'nvidia' }} + uses: + ./.github/workflows/gh-build.yml + with: + client-repo: ${{ github.event.repository.name }} + target-device: ${{ inputs.target-device }} + test-options: ${{ inputs.build-type }} + runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} + runner-has-gpu: ${{ inputs.host-platform == 'linux-x64' }} + build-type: ${{ inputs.build-type }} + use-container: false + host-platform: ${{ inputs.host-platform }} + dependencies-file: "" + build-mode: ${{ inputs.build-mode }} + upload-enabled: ${{ inputs.upload-enabled }} + secrets: inherit From 360e1b2d23f064eec19e3cd0c87d5bd823a41901 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Mon, 23 Sep 2024 18:05:07 +0530 Subject: [PATCH 027/111] Adding support for CI testing. Attempt 2. --- .github/workflows/gh-build-and-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index acf7e509..1d9bb4ea 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -60,7 +60,7 @@ jobs: test: if: ${{ github.repository_owner == 'nvidia' }} uses: - ./.github/workflows/gh-build.yml + ./.github/workflows/gh-test.yml with: client-repo: ${{ github.event.repository.name }} target-device: ${{ inputs.target-device }} From 67b7aed7ad1efbbbf017c221a5ef8223bed0c032 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Mon, 23 Sep 2024 18:06:10 +0530 Subject: [PATCH 028/111] Adding support for CI testing. Attempt 3. --- .github/workflows/gh-build-and-test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 1d9bb4ea..65a4a72a 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -59,6 +59,8 @@ jobs: test: if: ${{ github.repository_owner == 'nvidia' }} + needs: + - build uses: ./.github/workflows/gh-test.yml with: From 6fab977584c4f4a5a5cf2f1f1cef3719fe8ed4d5 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Mon, 23 Sep 2024 18:15:07 +0530 Subject: [PATCH 029/111] Use container for tests on the GPU runner. 
--- .github/workflows/gh-build-and-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 65a4a72a..167a5546 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -70,7 +70,7 @@ jobs: runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} runner-has-gpu: ${{ inputs.host-platform == 'linux-x64' }} build-type: ${{ inputs.build-type }} - use-container: false + use-container: ${{ inputs.host-platform == 'linux-x64' }} host-platform: ${{ inputs.host-platform }} dependencies-file: "" build-mode: ${{ inputs.build-mode }} From f2a0939aadf87d92c59e84584ded24f6a77b077a Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Mon, 23 Sep 2024 18:27:47 +0530 Subject: [PATCH 030/111] Use container for tests on the GPU runner. Attempt 2. --- .github/workflows/gh-build-and-test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 167a5546..185ede7a 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -70,7 +70,6 @@ jobs: runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} runner-has-gpu: ${{ inputs.host-platform == 'linux-x64' }} build-type: ${{ inputs.build-type }} - use-container: ${{ inputs.host-platform == 'linux-x64' }} host-platform: ${{ inputs.host-platform }} dependencies-file: "" build-mode: ${{ inputs.build-mode }} From 508a83c072b2ec46750d174926d32684a3207092 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Fri, 15 Nov 2024 20:23:05 +0530 Subject: [PATCH 031/111] Remove build caching. --- .github/actions/build/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index b6741343..952fb9cd 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -54,7 +54,7 @@ runs: --rm "${{ inputs.docker-image }}" \ /bin/bash -c "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint ${{ env.REPO_DIR }}/continuous_integration/scripts/build ${{ inputs.build-type}} ${{ inputs.target-device }}" - - if: ${{ !inputs.use-container && steps.cache-build.outputs.cache-hit != 'true'}} + - if: ${{ !inputs.use-container }} name: Build (without container) shell: bash --noprofile --norc -xeuo pipefail {0} run: | From 72062aa1ef77f6c76e6f6be8ac1bfa480d9abe4b Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Fri, 15 Nov 2024 20:32:58 +0530 Subject: [PATCH 032/111] Hard select Build (without container). 
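Composite-action inputs are always strings in GitHub Actions, so boolean-looking
conditions on `use-container` are easy to get wrong; the step below is temporarily
hard-selected and the input echoed to see what the expression actually receives.
A rough Python analogue of the underlying trap (a sketch, nothing beyond the
standard library):

    def is_enabled(value: str) -> bool:
        # bool("false") is True because any non-empty string is truthy;
        # compare against the literal instead of relying on truthiness.
        return value.strip().lower() == "true"

    assert is_enabled("true")
    assert not is_enabled("false")
    assert bool("false")  # the trap: non-empty strings are truthy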
--- .github/actions/build/action.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index 952fb9cd..583f5775 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -54,10 +54,11 @@ runs: --rm "${{ inputs.docker-image }}" \ /bin/bash -c "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint ${{ env.REPO_DIR }}/continuous_integration/scripts/build ${{ inputs.build-type}} ${{ inputs.target-device }}" - - if: ${{ !inputs.use-container }} - name: Build (without container) + #- if: ${{ inputs.use-container == false }} + - name: Build (without container) shell: bash --noprofile --norc -xeuo pipefail {0} run: | + echo "inputs.use-container=${{ inputs.use-container }}" "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint" "${{ env.REPO_DIR }}/continuous_integration/scripts/build" "${{ inputs.build-type}}" "${{ inputs.target-device }}" - name: Display structure of the bindings artifacts folder (post build) From 32ca908e36e261008ad8cec93b094e0db3b5a8cd Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Fri, 15 Nov 2024 20:41:03 +0530 Subject: [PATCH 033/111] Use container with preinstalled conda for build. --- .github/workflows/gh-build-and-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 185ede7a..e6e40624 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -29,7 +29,7 @@ jobs: target-device: ${{ inputs.target-device }} runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu8') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} build-type: ${{ inputs.build-type }} - use-container: false + use-container: ${{ inputs.host-platform == 'linux-x64' || inputs.host-platform == 'linux-aarch64'}} host-platform: ${{ inputs.host-platform }} dependencies-file: "" build-mode: ${{ inputs.build-mode }} From 970a8e5c4b43be8c6331904993674487b039c3eb Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Fri, 15 Nov 2024 20:42:04 +0530 Subject: [PATCH 034/111] Use container with preinstalled conda for build. Attempt 2. --- .github/actions/build/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index 583f5775..7a09ed14 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -54,8 +54,8 @@ runs: --rm "${{ inputs.docker-image }}" \ /bin/bash -c "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint ${{ env.REPO_DIR }}/continuous_integration/scripts/build ${{ inputs.build-type}} ${{ inputs.target-device }}" - #- if: ${{ inputs.use-container == false }} - - name: Build (without container) + - if: ${{ inputs.use-container == false }} + name: Build (without container) shell: bash --noprofile --norc -xeuo pipefail {0} run: | echo "inputs.use-container=${{ inputs.use-container }}" From be969e595ad6b66d16f9d32e96f6df550890bd70 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Fri, 15 Nov 2024 20:44:07 +0530 Subject: [PATCH 035/111] Use container with preinstalled conda for build. Attempt 3. 
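This attempt returns to `${{ !inputs.use-container }}` for the no-container path.
For reference, the per-platform routing used throughout these workflows reduces to
a small lookup; a Python sketch (runner labels copied from the expressions in this
series, function name hypothetical):

    def pick_runner(host_platform: str, for_tests: bool) -> str:
        # Only linux-x64 tests need the V100 GPU runner; builds and the
        # other platforms stay on CPU-only runners.
        runners = {
            "linux-x64": "linux-amd64-gpu-v100-latest-1" if for_tests else "linux-amd64-cpu8",
            "linux-aarch64": "linux-arm64-cpu16",
            "mac": "macos-latest",
        }
        return runners[host_platform]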
--- .github/actions/build/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index 7a09ed14..e5f67202 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -54,7 +54,7 @@ runs: --rm "${{ inputs.docker-image }}" \ /bin/bash -c "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint ${{ env.REPO_DIR }}/continuous_integration/scripts/build ${{ inputs.build-type}} ${{ inputs.target-device }}" - - if: ${{ inputs.use-container == false }} + - if: ${{ !inputs.use-container }} name: Build (without container) shell: bash --noprofile --norc -xeuo pipefail {0} run: | From 3382d68b42b17b39535dcbe3b5e58e54b4695f11 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Fri, 29 Nov 2024 09:12:32 +0530 Subject: [PATCH 036/111] Updated paths. --- continuous_integration/scripts/test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/continuous_integration/scripts/test b/continuous_integration/scripts/test index 96bdf8d5..cbee6998 100755 --- a/continuous_integration/scripts/test +++ b/continuous_integration/scripts/test @@ -11,10 +11,10 @@ test_ci() { cd "${CORE_ARTIFACTS_DIR}" pip install *.whl - cd "${REPO_DIR}/cuda_python/cuda_bindings" + cd "${REPO_DIR}/cuda_bindings" python -m pytest tests/ - cd "${REPO_DIR}/cuda_python/cuda_core" + cd "${REPO_DIR}/cuda_core" python -m pytest tests/ } From a9ed0c6a038bbcb605f5016df584aff081d378cf Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Fri, 29 Nov 2024 09:16:00 +0530 Subject: [PATCH 037/111] Removed duplicate tests section. --- .github/workflows/gh-build-and-test.yml | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index e6e40624..9b414a22 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -56,22 +56,3 @@ jobs: upload-enabled: ${{ inputs.upload-enabled }} python-version: ${{ inputs.python-version }} secrets: inherit - - test: - if: ${{ github.repository_owner == 'nvidia' }} - needs: - - build - uses: - ./.github/workflows/gh-test.yml - with: - client-repo: ${{ github.event.repository.name }} - target-device: ${{ inputs.target-device }} - test-options: ${{ inputs.build-type }} - runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} - runner-has-gpu: ${{ inputs.host-platform == 'linux-x64' }} - build-type: ${{ inputs.build-type }} - host-platform: ${{ inputs.host-platform }} - dependencies-file: "" - build-mode: ${{ inputs.build-mode }} - upload-enabled: ${{ inputs.upload-enabled }} - secrets: inherit From 17c3e106ef82d8a5dcc8bae0c2c8ea484ccc2dda Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 09:15:46 -0800 Subject: [PATCH 038/111] address comments --- cuda_core/cuda/core/experimental/_linker.py | 52 +++++++++++---------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 1a99f355..bb66adde 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -234,8 +234,9 @@ def close(self): def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): self._options = options = check_or_create_options(LinkerOptions, options, "Linker 
options") - self._mnff.handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) - self._mnff = Linker._MembersNeededForFinalize(self, None) + self._mnff = Linker._MembersNeededForFinalize( + self, nvjitlink.create(len(options.formatted_options), options.formatted_options) + ) if len(object_codes) == 0: raise ValueError("At least one ObjectCode object must be provided") @@ -244,8 +245,6 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): assert isinstance(code, ObjectCode) self._add_code_object(code) - weakref.finalize(self, self.close) - def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) @@ -257,19 +256,21 @@ def _add_code_object(self, object_code: ObjectCode): f"{object_code._handle}_{object_code._code_type}", ) + _get_linked_methods = { + "cubin": (nvjitlink.get_linked_cubin_size, nvjitlink.get_linked_cubin), + "ptx": (nvjitlink.get_linked_ptx_size, nvjitlink.get_linked_ptx), + } + def link(self, target_type) -> ObjectCode: nvjitlink.complete(self._mnff.handle) - if target_type not in ("cubin", "ptx"): + get_linked = self._get_linked_methods.get(target_type) + if get_linked is None: raise ValueError(f"Unsupported target type: {target_type}") - code = None - if target_type == "cubin": - cubin_size = nvjitlink.get_linked_cubin_size(self._mnff.handle) - code = bytearray(cubin_size) - nvjitlink.get_linked_cubin(self._mnff.handle, code) - else: - ptx_size = nvjitlink.get_linked_ptx_size(self._mnff.handle) - code = bytearray(ptx_size) - nvjitlink.get_linked_ptx(self._mnff.handle, code) + + get_size, get_code = get_linked + size = get_size(self._mnff.handle) + code = bytearray(size) + get_code(self._mnff.handle, code) return ObjectCode(bytes(code), target_type) @@ -285,21 +286,22 @@ def get_info_log(self) -> str: nvjitlink.get_info_log(self._mnff.handle, log) return log.decode() + _input_types = { + "ptx": nvjitlink.InputType.PTX, + "cubin": nvjitlink.InputType.CUBIN, + "fatbin": nvjitlink.InputType.FATBIN, + "ltoir": nvjitlink.InputType.LTOIR, + "object": nvjitlink.InputType.OBJECT, + } + def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: # this list is based on the supported values for code_type in the ObjectCode class definition. 
# nvjitlink supports other options for input type - if code_type == "ptx": - return nvjitlink.InputType.PTX - elif code_type == "cubin": - return nvjitlink.InputType.CUBIN - elif code_type == "fatbin": - return nvjitlink.InputType.FATBIN - elif code_type == "ltoir": - return nvjitlink.InputType.LTOIR - elif code_type == "object": - return nvjitlink.InputType.OBJECT - else: + input_type = self._input_types.get(code_type) + + if input_type is None: raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") + return input_type @property def handle(self) -> int: From 7f846263d9feffe601948eb0b82b3668b6855713 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 09:18:44 -0800 Subject: [PATCH 039/111] rename release notes --- cuda_core/docs/source/release/{0.2.0-notes.md => 0.1.1-notes.md} | 1 - 1 file changed, 1 deletion(-) rename cuda_core/docs/source/release/{0.2.0-notes.md => 0.1.1-notes.md} (93%) diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md similarity index 93% rename from cuda_core/docs/source/release/0.2.0-notes.md rename to cuda_core/docs/source/release/0.1.1-notes.md index 1a047511..0dbd49ce 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.md +++ b/cuda_core/docs/source/release/0.1.1-notes.md @@ -8,4 +8,3 @@ Released on Nov , 2024 ## Limitations -The Linker class only supports cuda >=12. For cuda <12, use low level cuLink API. - From 5207558076d366abf483e72daedc7fd6dce378e6 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 09:42:12 -0800 Subject: [PATCH 040/111] rename release notes --- cuda_core/docs/source/release.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 4c615eb3..55090b0b 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -5,6 +5,6 @@ maxdepth: 3 --- + 0.1.1 0.1.0 - 0.2.0 ``` From 14b9c6766160bcb23227bc303117d9137f8569e0 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 10:53:20 -0800 Subject: [PATCH 041/111] fix the test to not use a global function, which was causing swallowed link errors --- cuda_core/tests/test_linker.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 7db6ed9f..1cb444fb 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,15 +4,15 @@ from cuda.core.experimental._module import ObjectCode ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper -empty_entrypoint_kernel = "__global__ void A() {}" -empty_kernel = "__device__ void B() {}" +empty_kernel = "__device__ void A() {}" +basic_kernel = "__device__ int B() { return 0; }" addition_kernel = "__device__ int C(int a, int b) { return a + b; }" @pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): - object_code_a_ptx = Program(empty_entrypoint_kernel, "c++").compile("ptx") - object_code_b_ptx = Program(empty_kernel, "c++").compile("ptx") + object_code_a_ptx = Program(empty_kernel, "c++").compile("ptx") + object_code_b_ptx = Program(basic_kernel, "c++").compile("ptx") object_code_c_ptx = Program(addition_kernel, "c++").compile("ptx") return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx @@ -20,8 +20,8 @@ def compile_ptx_functions(init_cuda): @pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): - object_code_a_ltoir = Program(empty_entrypoint_kernel, "c++").compile("ltoir", 
options=("-dlto",)) - object_code_b_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_a_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_b_ltoir = Program(basic_kernel, "c++").compile("ltoir", options=("-dlto",)) object_code_c_ltoir = Program(addition_kernel, "c++").compile("ltoir", options=("-dlto",)) return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir From 27ec6d3dabd76238da0974c945c65b7c81ae7c22 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 13:26:01 -0800 Subject: [PATCH 042/111] add release notes --- cuda_core/docs/source/release.md | 1 + cuda_core/docs/source/release/0.1.1-notes.md | 0 2 files changed, 1 insertion(+) create mode 100644 cuda_core/docs/source/release/0.1.1-notes.md diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 48e24786..55090b0b 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -5,5 +5,6 @@ maxdepth: 3 --- + 0.1.1 0.1.0 ``` diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md new file mode 100644 index 00000000..e69de29b From 42c4b45241f2c5f08ca96c7560788fec769ef1c0 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 13:28:58 -0800 Subject: [PATCH 043/111] make true the default path --- cuda_core/cuda/core/experimental/_device.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 889c20a0..88676cf6 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -64,10 +64,10 @@ def __new__(cls, device_id=None): dev._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. 
-        if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 0:
-            dev._mr = _SynchronousMemoryResource(dev_id)
-        else:
+        if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 1:
             dev._mr = _DefaultAsyncMempool(dev_id)
+        else:
+            dev._mr = _SynchronousMemoryResource(dev_id)
         dev._has_inited = False
         _tls.devices.append(dev)

From 64b1f22e9fae282739c6cf9aaf4005a0f289914b Mon Sep 17 00:00:00 2001
From: ksimpson
Date: Tue, 3 Dec 2024 15:38:41 -0800
Subject: [PATCH 044/111] minor rewording

---
 cuda_core/docs/source/release/0.1.1-notes.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md
index e69de29b..d80e6ef4 100644
--- a/cuda_core/docs/source/release/0.1.1-notes.md
+++ b/cuda_core/docs/source/release/0.1.1-notes.md
@@ -0,0 +1,7 @@
+# `cuda.core` Release notes
+
+Released on Dec X, 2024
+
+## Highlights
+- Support TCC devices with a default synchronous memory resource to avoid the use of memory pools
+

From a7f8c309ad84245b26333062c473baf5326ae191 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Wed, 4 Dec 2024 01:14:28 +0000
Subject: [PATCH 045/111] WIP: enable cuLink APIs from driver

---
 cuda_core/cuda/core/experimental/_linker.py | 253 +++++++++++++++-----
 cuda_core/tests/test_linker.py              |  32 +--
 2 files changed, 209 insertions(+), 76 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
index bb66adde..57a10866 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -2,13 +2,64 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

+import ctypes
 import weakref
 from dataclasses import dataclass
 from typing import List, Optional

-from cuda.bindings import nvjitlink
+from cuda import cuda
 from cuda.core.experimental._module import ObjectCode
-from cuda.core.experimental._utils import check_or_create_options
+from cuda.core.experimental._utils import check_or_create_options, handle_return
+
+# TODO: revisit this treatment for py313t builds
+_driver = None  # populated if nvJitLink cannot be used
+_driver_input_types = None  # populated if nvJitLink cannot be used
+_driver_ver = None
+_inited = False
+_nvjitlink = None  # populated if nvJitLink can be used
+_nvjitlink_input_types = None  # populated if nvJitLink cannot be used
+
+
+def _lazy_init():
+    global _inited
+    if _inited:
+        return
+
+    global _driver, _driver_input_types, _driver_ver, _nvjitlink, _nvjitlink_input_types
+    _driver_ver = handle_return(cuda.cuDriverGetVersion())
+    _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10)
+    try:
+        from cuda.bindings import nvjitlink
+        from cuda.bindings._internal import nvjitlink as inner_nvjitlink
+    except ImportError:
+        # binding is not available
+        nvjitlink = None
+    else:
+        if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0:
+            # binding is available, but nvJitLink is not installed
+            nvjitlink = None
+        elif _driver_ver > nvjitlink.version():
+            # TODO: nvJitLink is not new enough, warn?
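+            # (the tuple comparison above works because _driver_ver was
+            # normalized to a (major, minor) pair, matching what
+            # nvjitlink.version() returns, as the comparison implies)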
+ pass + if nvjitlink: + _nvjitlink = nvjitlink + _nvjitlink_input_types = { + "ptx": _nvjitlink.InputType.PTX, + "cubin": _nvjitlink.InputType.CUBIN, + "fatbin": _nvjitlink.InputType.FATBIN, + "ltoir": _nvjitlink.InputType.LTOIR, + "object": _nvjitlink.InputType.OBJECT, + } + else: + from cuda import cuda as _driver + + _driver_input_types = { + "ptx": _driver.CUjitInputType.CU_JIT_INPUT_PTX, + "cubin": _driver.CUjitInputType.CU_JIT_INPUT_CUBIN, + "fatbin": _driver.CUjitInputType.CU_JIT_INPUT_FATBINARY, + "object": _driver.CUjitInputType.CU_JIT_INPUT_OBJECT, + } + _inited = True @dataclass @@ -146,7 +197,14 @@ class LinkerOptions: no_cache: Optional[bool] = None def __post_init__(self): + _lazy_init() self.formatted_options = [] + if _nvjitlink: + self._init_nvjitlink() + else: + self._init_driver() + + def _init_nvjitlink(self): if self.arch is not None: self.formatted_options.append(f"-arch={self.arch}") if self.max_register_count is not None: @@ -191,6 +249,67 @@ def __post_init__(self): if self.no_cache is not None: self.formatted_options.append("-no-cache") + def _init_driver(self): + self.option_keys = [] + # allocate 4 KiB each for info/error logs + size = 4194304 + self.formatted_options.extend((bytearray(size), size, bytearray(size), size)) + self.option_keys.extend( + ( + _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER, + _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER, + _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + ) + ) + + if self.arch is not None: + arch = self.arch.split("_")[-1].upper() + self.formatted_options.append(getattr(_driver.CUjit_target, f"CU_TARGET_COMPUTE_{arch}")) + self.option_keys.append(_driver.CUjit_option.CU_JIT_TARGET) + # if self.max_register_count is not None: + # self.formatted_options.append(f"-maxrregcount={self.max_register_count}") + # if self.time is not None: + # self.formatted_options.append("-time") + if self.verbose is not None: + self.formatted_options.append(1) # ctypes.c_int32(1)) + self.option_keys.append(_driver.CUjit_option.CU_JIT_LOG_VERBOSE) + # if self.link_time_optimization is not None: + # self.formatted_options.append("-lto") + # if self.ptx is not None: + # self.formatted_options.append("-ptx") + # if self.optimization_level is not None: + # self.formatted_options.append(f"-O{self.optimization_level}") + # if self.debug is not None: + # self.formatted_options.append("-g") + # if self.lineinfo is not None: + # self.formatted_options.append("-lineinfo") + # if self.ftz is not None: + # self.formatted_options.append(f"-ftz={'true' if self.ftz else 'false'}") + # if self.prec_div is not None: + # self.formatted_options.append(f"-prec-div={'true' if self.prec_div else 'false'}") + # if self.prec_sqrt is not None: + # self.formatted_options.append(f"-prec-sqrt={'true' if self.prec_sqrt else 'false'}") + # if self.fma is not None: + # self.formatted_options.append(f"-fma={'true' if self.fma else 'false'}") + # if self.kernels_used is not None: + # for kernel in self.kernels_used: + # self.formatted_options.append(f"-kernels-used={kernel}") + # if self.variables_used is not None: + # for variable in self.variables_used: + # self.formatted_options.append(f"-variables-used={variable}") + # if self.optimize_unused_variables is not None: + # self.formatted_options.append("-optimize-unused-variables") + # if self.xptxas is not None: + # for opt in self.xptxas: + # self.formatted_options.append(f"-Xptxas={opt}") + # if self.split_compile is not None: + # 
self.formatted_options.append(f"-split-compile={self.split_compile}") + # if self.split_compile_extended is not None: + # self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") + # if self.no_cache is not None: + # self.formatted_options.append("-no-cache") + class Linker: """ @@ -202,45 +321,41 @@ class Linker: One or more ObjectCode objects to be linked. options : LinkerOptions, optional Options for the linker. If not provided, default options will be used. - - Attributes - ---------- - _options : LinkerOptions - The options used for the linker. - _handle : handle - The handle to the linker created by nvjitlink. - - Methods - ------- - _add_code_object(object_code) - Adds an object code to the linker. - close() - Closes the linker and releases resources. """ class _MembersNeededForFinalize: - __slots__ = ("handle",) + __slots__ = ("handle", "use_nvjitlink") - def __init__(self, program_obj, handle): + def __init__(self, program_obj, handle, use_nvjitlink): self.handle = handle + self.use_nvjitlink = use_nvjitlink weakref.finalize(program_obj, self.close) def close(self): if self.handle is not None: - nvjitlink.destroy(self.handle) + if self.use_nvjitlink: + _nvjitlink.destroy(self.handle) + else: + handle_return(_driver.cuLinkDestroy(self.handle)) self.handle = None __slots__ = ("__weakref__", "_mnff", "_options") def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): - self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") - self._mnff = Linker._MembersNeededForFinalize( - self, nvjitlink.create(len(options.formatted_options), options.formatted_options) - ) - if len(object_codes) == 0: raise ValueError("At least one ObjectCode object must be provided") + self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") + if _nvjitlink: + handle = _nvjitlink.create(len(options.formatted_options), options.formatted_options) + use_nvjitlink = True + else: + handle = handle_return( + _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options) + ) + use_nvjitlink = False + self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink) + for code in object_codes: assert isinstance(code, ObjectCode) self._add_code_object(code) @@ -248,56 +363,74 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) - nvjitlink.add_data( - self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), - data, - len(data), - f"{object_code._handle}_{object_code._code_type}", - ) - - _get_linked_methods = { - "cubin": (nvjitlink.get_linked_cubin_size, nvjitlink.get_linked_cubin), - "ptx": (nvjitlink.get_linked_ptx_size, nvjitlink.get_linked_ptx), - } + if _nvjitlink: + _nvjitlink.add_data( + self._mnff.handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}", + ) + else: + handle_return( + _driver.cuLinkAddData( + self._mnff.handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}".encode(), + 0, + None, + None, + ) + ) def link(self, target_type) -> ObjectCode: - nvjitlink.complete(self._mnff.handle) - get_linked = self._get_linked_methods.get(target_type) - if get_linked is None: + if target_type not in ("cubin", "ptx"): raise 
ValueError(f"Unsupported target type: {target_type}") + if _nvjitlink: + _nvjitlink.complete(self._mnff.handle) + if target_type == "cubin": + get_size = _nvjitlink.get_linked_cubin_size + get_code = _nvjitlink.get_linked_cubin + else: + get_size = _nvjitlink.get_linked_ptx_size + get_code = _nvjitlink.get_linked_ptx - get_size, get_code = get_linked - size = get_size(self._mnff.handle) - code = bytearray(size) - get_code(self._mnff.handle, code) + size = get_size(self._mnff.handle) + code = bytearray(size) + get_code(self._mnff.handle, code) + else: + addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) + code = (ctypes.c_char * size).from_address(addr) return ObjectCode(bytes(code), target_type) def get_error_log(self) -> str: - log_size = nvjitlink.get_error_log_size(self._mnff.handle) - log = bytearray(log_size) - nvjitlink.get_error_log(self._mnff.handle, log) + if _nvjitlink: + log_size = _nvjitlink.get_error_log_size(self._mnff.handle) + log = bytearray(log_size) + _nvjitlink.get_error_log(self._mnff.handle, log) + else: + log = self._options.formatted_options[2] return log.decode() def get_info_log(self) -> str: - log_size = nvjitlink.get_info_log_size(self._mnff.handle) - log = bytearray(log_size) - nvjitlink.get_info_log(self._mnff.handle, log) + if _nvjitlink: + log_size = _nvjitlink.get_info_log_size(self._mnff.handle) + log = bytearray(log_size) + _nvjitlink.get_info_log(self._mnff.handle, log) + else: + log = self._options.formatted_options[0] return log.decode() - _input_types = { - "ptx": nvjitlink.InputType.PTX, - "cubin": nvjitlink.InputType.CUBIN, - "fatbin": nvjitlink.InputType.FATBIN, - "ltoir": nvjitlink.InputType.LTOIR, - "object": nvjitlink.InputType.OBJECT, - } - - def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: + def _input_type_from_code_type(self, code_type: str): # this list is based on the supported values for code_type in the ObjectCode class definition. 
- # nvjitlink supports other options for input type - input_type = self._input_types.get(code_type) + # nvJitLink/driver support other options for input type + if _nvjitlink: + input_type = _nvjitlink_input_types.get(code_type) + else: + input_type = _driver_input_types.get(code_type) if input_type is None: raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 7db6ed9f..4d10f423 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -31,23 +31,23 @@ def compile_ltoir_functions(init_cuda): "options", [ LinkerOptions(arch=ARCH), - LinkerOptions(arch=ARCH, max_register_count=32), - LinkerOptions(arch=ARCH, time=True), + # LinkerOptions(arch=ARCH, max_register_count=32), + # LinkerOptions(arch=ARCH, time=True), LinkerOptions(arch=ARCH, verbose=True), - LinkerOptions(arch=ARCH, optimization_level=3), - LinkerOptions(arch=ARCH, debug=True), - LinkerOptions(arch=ARCH, lineinfo=True), - LinkerOptions(arch=ARCH, ftz=True), - LinkerOptions(arch=ARCH, prec_div=True), - LinkerOptions(arch=ARCH, prec_sqrt=True), - LinkerOptions(arch=ARCH, fma=True), - LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), - LinkerOptions(arch=ARCH, variables_used=["var1"]), - LinkerOptions(arch=ARCH, optimize_unused_variables=True), - LinkerOptions(arch=ARCH, xptxas=["-v"]), - LinkerOptions(arch=ARCH, split_compile=0), - LinkerOptions(arch=ARCH, split_compile_extended=1), - LinkerOptions(arch=ARCH, no_cache=True), + # LinkerOptions(arch=ARCH, optimization_level=3), + # LinkerOptions(arch=ARCH, debug=True), + # LinkerOptions(arch=ARCH, lineinfo=True), + # LinkerOptions(arch=ARCH, ftz=True), + # LinkerOptions(arch=ARCH, prec_div=True), + # LinkerOptions(arch=ARCH, prec_sqrt=True), + # LinkerOptions(arch=ARCH, fma=True), + # LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), + # LinkerOptions(arch=ARCH, variables_used=["var1"]), + # LinkerOptions(arch=ARCH, optimize_unused_variables=True), + # LinkerOptions(arch=ARCH, xptxas=["-v"]), + # LinkerOptions(arch=ARCH, split_compile=0), + # LinkerOptions(arch=ARCH, split_compile_extended=1), + # LinkerOptions(arch=ARCH, no_cache=True), ], ) def test_linker_init(compile_ptx_functions, options): From 028a5c234b4a40e6298ea0e0a4d950013e20ebf5 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 18:23:31 -0800 Subject: [PATCH 046/111] save progress to remote --- cuda_core/cuda/core/experimental/_linker.py | 104 +++++++++++--------- cuda_core/tests/test_linker.py | 32 +++--- 2 files changed, 76 insertions(+), 60 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 57a10866..304b3771 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -29,6 +29,7 @@ def _lazy_init(): _driver_ver = handle_return(cuda.cuDriverGetVersion()) _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10) try: + raise ImportError from cuda.bindings import nvjitlink from cuda.bindings._internal import nvjitlink as inner_nvjitlink except ImportError: @@ -267,48 +268,66 @@ def _init_driver(self): arch = self.arch.split("_")[-1].upper() self.formatted_options.append(getattr(_driver.CUjit_target, f"CU_TARGET_COMPUTE_{arch}")) self.option_keys.append(_driver.CUjit_option.CU_JIT_TARGET) - # if self.max_register_count is not None: - # self.formatted_options.append(f"-maxrregcount={self.max_register_count}") - # if self.time is not None: - # 
self.formatted_options.append("-time") + if self.max_register_count is not None: + self.formatted_options.append(self.max_register_count) + self.option_keys.append(_driver.CUjit_option.CU_JIT_MAX_REGISTERS) + if self.time is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_WALL_TIME) if self.verbose is not None: - self.formatted_options.append(1) # ctypes.c_int32(1)) + self.formatted_options.append(1) # ctypes.c_int32(1) self.option_keys.append(_driver.CUjit_option.CU_JIT_LOG_VERBOSE) - # if self.link_time_optimization is not None: - # self.formatted_options.append("-lto") - # if self.ptx is not None: - # self.formatted_options.append("-ptx") - # if self.optimization_level is not None: - # self.formatted_options.append(f"-O{self.optimization_level}") - # if self.debug is not None: - # self.formatted_options.append("-g") - # if self.lineinfo is not None: - # self.formatted_options.append("-lineinfo") - # if self.ftz is not None: - # self.formatted_options.append(f"-ftz={'true' if self.ftz else 'false'}") - # if self.prec_div is not None: - # self.formatted_options.append(f"-prec-div={'true' if self.prec_div else 'false'}") - # if self.prec_sqrt is not None: - # self.formatted_options.append(f"-prec-sqrt={'true' if self.prec_sqrt else 'false'}") - # if self.fma is not None: - # self.formatted_options.append(f"-fma={'true' if self.fma else 'false'}") - # if self.kernels_used is not None: - # for kernel in self.kernels_used: - # self.formatted_options.append(f"-kernels-used={kernel}") - # if self.variables_used is not None: - # for variable in self.variables_used: - # self.formatted_options.append(f"-variables-used={variable}") - # if self.optimize_unused_variables is not None: - # self.formatted_options.append("-optimize-unused-variables") - # if self.xptxas is not None: - # for opt in self.xptxas: - # self.formatted_options.append(f"-Xptxas={opt}") - # if self.split_compile is not None: - # self.formatted_options.append(f"-split-compile={self.split_compile}") - # if self.split_compile_extended is not None: - # self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") - # if self.no_cache is not None: - # self.formatted_options.append("-no-cache") + if self.link_time_optimization is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_LTO) + if self.ptx is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO) + if self.optimization_level is not None: + self.formatted_options.append(self.optimization_level) + self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZATION_LEVEL) + if self.debug is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_DEBUG_INFO) + if self.lineinfo is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO) + if self.ftz is not None: + self.formatted_options.append(1 if self.ftz else 0) + self.option_keys.append(_driver.CUjit_option.CU_JIT_FTZ) + if self.prec_div is not None: + self.formatted_options.append(1 if self.prec_div else 0) + self.option_keys.append(_driver.CUjit_option.CU_JIT_PREC_DIV) + if self.prec_sqrt is not None: + self.formatted_options.append(1 if self.prec_sqrt else 0) + self.option_keys.append(_driver.CUjit_option.CU_JIT_PREC_SQRT) + if 
self.fma is not None: + self.formatted_options.append(1 if self.fma else 0) + self.option_keys.append(_driver.CUjit_option.CU_JIT_FMA) + if self.kernels_used is not None: + for kernel in self.kernels_used: + self.formatted_options.append(kernel) + self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_KERNEL_NAMES) + if self.variables_used is not None: + for variable in self.variables_used: + self.formatted_options.append(variable) + self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_VARIABLE_NAMES) + if self.optimize_unused_variables is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES) + if self.xptxas is not None: + for opt in self.xptxas: + self.formatted_options.append(opt) + self.option_keys.append(_driver.CUjit_option.CU_JIT_FAST_COMPILE) + if self.split_compile is not None: + self.formatted_options.append(self.split_compile) + self.option_keys.append(_driver.CUjit_option.CU_JIT_THREADS_PER_BLOCK) + if self.split_compile_extended is not None: + self.formatted_options.append(self.split_compile_extended) + self.option_keys.append(_driver.CUjit_option.CU_JIT_MIN_CTA_PER_SM) + if self.no_cache is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) class Linker: @@ -427,10 +446,7 @@ def get_info_log(self) -> str: def _input_type_from_code_type(self, code_type: str): # this list is based on the supported values for code_type in the ObjectCode class definition. # nvJitLink/driver support other options for input type - if _nvjitlink: - input_type = _nvjitlink_input_types.get(code_type) - else: - input_type = _driver_input_types.get(code_type) + input_type = _nvjitlink_input_types.get(code_type) if _nvjitlink else _driver_input_types.get(code_type) if input_type is None: raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index ac7a5012..1851c7ba 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -31,22 +31,22 @@ def compile_ltoir_functions(init_cuda): "options", [ LinkerOptions(arch=ARCH), - # LinkerOptions(arch=ARCH, max_register_count=32), - # LinkerOptions(arch=ARCH, time=True), + LinkerOptions(arch=ARCH, max_register_count=32), + LinkerOptions(arch=ARCH, time=True), LinkerOptions(arch=ARCH, verbose=True), - # LinkerOptions(arch=ARCH, optimization_level=3), - # LinkerOptions(arch=ARCH, debug=True), - # LinkerOptions(arch=ARCH, lineinfo=True), - # LinkerOptions(arch=ARCH, ftz=True), - # LinkerOptions(arch=ARCH, prec_div=True), - # LinkerOptions(arch=ARCH, prec_sqrt=True), - # LinkerOptions(arch=ARCH, fma=True), + LinkerOptions(arch=ARCH, optimization_level=3), + LinkerOptions(arch=ARCH, debug=True), + LinkerOptions(arch=ARCH, lineinfo=True), + LinkerOptions(arch=ARCH, ftz=True), + LinkerOptions(arch=ARCH, prec_div=True), + LinkerOptions(arch=ARCH, prec_sqrt=True), + LinkerOptions(arch=ARCH, fma=True), # LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), # LinkerOptions(arch=ARCH, variables_used=["var1"]), - # LinkerOptions(arch=ARCH, optimize_unused_variables=True), + LinkerOptions(arch=ARCH, optimize_unused_variables=True), # LinkerOptions(arch=ARCH, xptxas=["-v"]), # LinkerOptions(arch=ARCH, split_compile=0), - # LinkerOptions(arch=ARCH, split_compile_extended=1), + LinkerOptions(arch=ARCH, split_compile_extended=1), # LinkerOptions(arch=ARCH, 
no_cache=True), ], ) @@ -62,11 +62,11 @@ def test_linker_init_invalid_arch(): Linker(options) -def test_linker_link_ptx(compile_ltoir_functions): - options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) - linker = Linker(*compile_ltoir_functions, options=options) - linked_code = linker.link("ptx") - assert isinstance(linked_code, ObjectCode) +# def test_linker_link_ptx(compile_ltoir_functions): +# options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) +# linker = Linker(*compile_ltoir_functions, options=options) +# linked_code = linker.link("ptx") +# assert isinstance(linked_code, ObjectCode) def test_linker_link_cubin(compile_ptx_functions): From d7bf4cb304404d6b001fa0e5df479a6d1f9fd514 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 18:28:38 -0800 Subject: [PATCH 047/111] save progress to remote --- cuda_core/cuda/core/experimental/_linker.py | 10 +++------- cuda_core/tests/test_linker.py | 4 ++-- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 304b3771..79328583 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -306,22 +306,18 @@ def _init_driver(self): self.option_keys.append(_driver.CUjit_option.CU_JIT_FMA) if self.kernels_used is not None: for kernel in self.kernels_used: - self.formatted_options.append(kernel) + self.formatted_options.append(kernel.encode()) self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_KERNEL_NAMES) if self.variables_used is not None: for variable in self.variables_used: - self.formatted_options.append(variable) + self.formatted_options.append(variable.encode()) self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_VARIABLE_NAMES) if self.optimize_unused_variables is not None: self.formatted_options.append(1) # ctypes.c_int32(1) self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES) if self.xptxas is not None: for opt in self.xptxas: - self.formatted_options.append(opt) - self.option_keys.append(_driver.CUjit_option.CU_JIT_FAST_COMPILE) - if self.split_compile is not None: - self.formatted_options.append(self.split_compile) - self.option_keys.append(_driver.CUjit_option.CU_JIT_THREADS_PER_BLOCK) + raise NotImplementedError("TODO: implement xptxas option") if self.split_compile_extended is not None: self.formatted_options.append(self.split_compile_extended) self.option_keys.append(_driver.CUjit_option.CU_JIT_MIN_CTA_PER_SM) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 1851c7ba..3937c878 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -41,8 +41,8 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, prec_div=True), LinkerOptions(arch=ARCH, prec_sqrt=True), LinkerOptions(arch=ARCH, fma=True), - # LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), - # LinkerOptions(arch=ARCH, variables_used=["var1"]), + LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), + LinkerOptions(arch=ARCH, variables_used=["var1"]), LinkerOptions(arch=ARCH, optimize_unused_variables=True), # LinkerOptions(arch=ARCH, xptxas=["-v"]), # LinkerOptions(arch=ARCH, split_compile=0), From 84124b590f62c089503b5ab44a9608a83a151ac4 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Wed, 4 Dec 2024 09:49:20 +0530 Subject: [PATCH 048/111] Run cuda_core tests before cuda_binding. 
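The driver script change below runs the cuda_core suite before cuda_bindings.
A minimal Python sketch of the same ordering, assuming the wheels are already
installed:

    import subprocess

    # check=True aborts on the first failing suite, much like the script's
    # `set -xeou pipefail`.
    for project in ("cuda_core", "cuda_bindings"):
        subprocess.run(["python", "-m", "pytest", "tests/"], cwd=project, check=True)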
--- continuous_integration/scripts/test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/continuous_integration/scripts/test b/continuous_integration/scripts/test index cbee6998..3a705c3c 100755 --- a/continuous_integration/scripts/test +++ b/continuous_integration/scripts/test @@ -11,10 +11,10 @@ test_ci() { cd "${CORE_ARTIFACTS_DIR}" pip install *.whl - cd "${REPO_DIR}/cuda_bindings" + cd "${REPO_DIR}/cuda_core" python -m pytest tests/ - cd "${REPO_DIR}/cuda_core" + cd "${REPO_DIR}/cuda_bindings" python -m pytest tests/ } From 1d80ca70f1ef4d0435d4aa49ee97d9ec8b254588 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 08:48:38 -0800 Subject: [PATCH 049/111] fix some known issues before colossus test --- cuda_core/cuda/core/experimental/_device.py | 6 +++++- cuda_core/cuda/core/experimental/_memory.py | 8 ++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 88676cf6..db5f57cf 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -64,7 +64,11 @@ def __new__(cls, device_id=None): dev._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. - if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 1: + if ( + handle_return( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0) + ) + ) == 1: dev._mr = _DefaultAsyncMempool(dev_id) else: dev._mr = _SynchronousMemoryResource(dev_id) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index ac6a78fe..5ff00ba2 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -303,15 +303,11 @@ def __init__(self, dev_id): self._dev_id = dev_id def allocate(self, size, stream=None) -> Buffer: - if stream is None: - stream = default_stream() - ptr = handle_return(cuda.cuMemAlloc(size, stream._handle)) + ptr = handle_return(cuda.cuMemAlloc(size)) return Buffer(ptr, size, self) def deallocate(self, ptr, size, stream=None): - if stream is None: - stream = default_stream() - handle_return(cuda.cuMemFree(ptr, stream._handle)) + handle_return(cuda.cuMemFree(ptr)) @property def is_device_accessible(self) -> bool: From cd7f146bf0f8f13a0327fa8d2b0a315410819350 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 4 Dec 2024 10:23:20 -0800 Subject: [PATCH 050/111] convert line endings from CRLF to LF --- .gitattributes | 2 + .pre-commit-config.yaml | 24 +- .../benchmarks/test_launch_latency.py | 682 +++++++++--------- cuda_bindings/tests/test_nvjitlink.py | 336 ++++----- .../example_tests/test_basic_examples.py | 50 +- cuda_core/tests/example_tests/utils.py | 112 +-- cuda_core/tests/test_device.py | 160 ++-- cuda_core/tests/test_event.py | 92 +-- cuda_core/tests/test_launcher.py | 136 ++-- cuda_core/tests/test_memory.py | 426 +++++------ cuda_core/tests/test_module.py | 96 +-- cuda_core/tests/test_program.py | 132 ++-- cuda_core/tests/test_stream.py | 230 +++--- 13 files changed, 1240 insertions(+), 1238 deletions(-) diff --git a/.gitattributes b/.gitattributes index 00407cdc..aeb32006 100644 --- a/.gitattributes +++ b/.gitattributes @@ -5,3 +5,5 @@ cuda/_version.py export-subst # we do not own any headers checked in, don't touch them *.h binary *.hpp binary +# git should not convert line endings in 
PNG files +*.png binary diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 431bb7c5..c2d246aa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,12 +1,12 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -repos: - - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.4 - hooks: - - id: ruff - args: [--fix, --show-fixes] - - id: ruff-format - -default_language_version: - python: python3 +# Copyright (c) 2024, NVIDIA CORPORATION. + +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.4 + hooks: + - id: ruff + args: [--fix, --show-fixes] + - id: ruff-format + +default_language_version: + python: python3 diff --git a/cuda_bindings/benchmarks/test_launch_latency.py b/cuda_bindings/benchmarks/test_launch_latency.py index 8d70bfe2..f16e971a 100755 --- a/cuda_bindings/benchmarks/test_launch_latency.py +++ b/cuda_bindings/benchmarks/test_launch_latency.py @@ -1,341 +1,341 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. -import ctypes - -import pytest - -from cuda import cuda - -from .kernels import kernel_string -from .perf_test_utils import ASSERT_DRV - - -def launch(kernel, stream, args=(), arg_types=()): - cuda.cuLaunchKernel( - kernel, - 1, - 1, - 1, # grid dim - 1, - 1, - 1, # block dim - 0, - stream, # shared mem and stream - (args, arg_types), - 0, - ) # arguments - - -def launch_packed(kernel, stream, params): - cuda.cuLaunchKernel( - kernel, - 1, - 1, - 1, # grid dim - 1, - 1, - 1, # block dim - 0, - stream, # shared mem and stream - params, - 0, - ) # arguments - - -# Measure launch latency with no parmaeters -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"empty_kernel") - ASSERT_DRV(err) - - benchmark(launch, func, stream) - - cuda.cuCtxSynchronize() - - -# Measure launch latency with a single parameter -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel") - ASSERT_DRV(err) - - err, f = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_float)) - ASSERT_DRV(err) - - benchmark(launch, func, stream, args=(f,), arg_types=(None,)) - - cuda.cuCtxSynchronize() - - (err,) = cuda.cuMemFree(f) - ASSERT_DRV(err) - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") - ASSERT_DRV(err) - - args = [] - arg_types = [None] * 512 - for _ in arg_types: - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - args.append(p) - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - 
cuda.cuCtxSynchronize() - - for p in args: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_bools") - ASSERT_DRV(err) - - args = [True] * 512 - arg_types = [ctypes.c_bool] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_doubles") - ASSERT_DRV(err) - - args = [1.2345] * 512 - arg_types = [ctypes.c_double] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_ints") - ASSERT_DRV(err) - - args = [123] * 512 - arg_types = [ctypes.c_int] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_chars") - ASSERT_DRV(err) - - args = [127] * 512 - arg_types = [ctypes.c_byte] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_longlongs") - ASSERT_DRV(err) - - args = [9223372036854775806] * 512 - arg_types = [ctypes.c_longlong] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_256_args") - ASSERT_DRV(err) - - args = [] - arg_types = [None] * 256 - for _ in arg_types: - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - args.append(p) - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - for p in args: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -# Measure launch latency with many parameters using builtin parameter 
packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_16_args") - ASSERT_DRV(err) - - args = [] - arg_types = [None] * 16 - for _ in arg_types: - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - args.append(p) - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - for p in args: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -# Measure launch latency with many parameters, excluding parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") - ASSERT_DRV(err) - - vals = [] - val_ps = [] - for i in range(512): - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - vals.append(p) - val_ps.append(ctypes.c_void_p(int(vals[i]))) - - packagedParams = (ctypes.c_void_p * 512)() - for i in range(512): - packagedParams[i] = ctypes.addressof(val_ps[i]) - - benchmark(launch_packed, func, stream, packagedParams) - - cuda.cuCtxSynchronize() - - for p in vals: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -def pack_and_launch(kernel, stream, params): - packed_params = (ctypes.c_void_p * len(params))() - ptrs = [0] * len(params) - for i in range(len(params)): - ptrs[i] = ctypes.c_void_p(int(params[i])) - packed_params[i] = ctypes.addressof(ptrs[i]) - - cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_params, 0) - - -# Measure launch latency plus parameter packing using ctypes -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") - ASSERT_DRV(err) - - vals = [] - for i in range(512): - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - vals.append(p) - - benchmark(pack_and_launch, func, stream, vals) - - cuda.cuCtxSynchronize() - - for p in vals: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -# Measure launch latency with a single large struct parameter -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_2048B(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_2048B") - ASSERT_DRV(err) - - class struct_2048B(ctypes.Structure): - _fields_ = [("values", ctypes.c_uint8 * 2048)] - - benchmark(launch, func, stream, args=(struct_2048B(),), arg_types=(None,)) - - cuda.cuCtxSynchronize() +# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. 
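# The benchmarks below exercise the two ways cuda.cuLaunchKernel accepts
# kernel arguments: a (args, arg_types) tuple that the binding packs itself,
# or a pre-built ctypes array of pointers. A minimal sketch of both forms,
# assuming `func` and `stream` are a valid CUfunction/CUstream (illustrative
# values only):
#
#     import ctypes
#     from cuda import cuda
#
#     # builtin packing: per-argument values and ctypes types (None = as-is)
#     cuda.cuLaunchKernel(func, 1, 1, 1, 1, 1, 1, 0, stream,
#                         ((42,), (ctypes.c_int,)), 0)
#
#     # manual packing: an array of pointers to each argument's storage
#     arg = ctypes.c_int(42)
#     params = (ctypes.c_void_p * 1)(ctypes.addressof(arg))
#     cuda.cuLaunchKernel(func, 1, 1, 1, 1, 1, 1, 0, stream, params, 0)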
+import ctypes + +import pytest + +from cuda import cuda + +from .kernels import kernel_string +from .perf_test_utils import ASSERT_DRV + + +def launch(kernel, stream, args=(), arg_types=()): + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + (args, arg_types), + 0, + ) # arguments + + +def launch_packed(kernel, stream, params): + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + params, + 0, + ) # arguments + + +# Measure launch latency with no parmaeters +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"empty_kernel") + ASSERT_DRV(err) + + benchmark(launch, func, stream) + + cuda.cuCtxSynchronize() + + +# Measure launch latency with a single parameter +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel") + ASSERT_DRV(err) + + err, f = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_float)) + ASSERT_DRV(err) + + benchmark(launch, func, stream, args=(f,), arg_types=(None,)) + + cuda.cuCtxSynchronize() + + (err,) = cuda.cuMemFree(f) + ASSERT_DRV(err) + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") + ASSERT_DRV(err) + + args = [] + arg_types = [None] * 512 + for _ in arg_types: + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + args.append(p) + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + for p in args: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_bools") + ASSERT_DRV(err) + + args = [True] * 512 + arg_types = [ctypes.c_bool] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_doubles") + ASSERT_DRV(err) + + args = [1.2345] * 512 + arg_types = [ctypes.c_double] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, 
func = cuda.cuModuleGetFunction(module, b"small_kernel_512_ints") + ASSERT_DRV(err) + + args = [123] * 512 + arg_types = [ctypes.c_int] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_chars") + ASSERT_DRV(err) + + args = [127] * 512 + arg_types = [ctypes.c_byte] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_longlongs") + ASSERT_DRV(err) + + args = [9223372036854775806] * 512 + arg_types = [ctypes.c_longlong] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_256_args") + ASSERT_DRV(err) + + args = [] + arg_types = [None] * 256 + for _ in arg_types: + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + args.append(p) + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + for p in args: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_16_args") + ASSERT_DRV(err) + + args = [] + arg_types = [None] * 16 + for _ in arg_types: + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + args.append(p) + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + for p in args: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +# Measure launch latency with many parameters, excluding parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") + ASSERT_DRV(err) + + vals = [] + val_ps = [] + for i in range(512): + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + vals.append(p) + val_ps.append(ctypes.c_void_p(int(vals[i]))) + + packagedParams = (ctypes.c_void_p * 512)() + for i in range(512): + packagedParams[i] = 
ctypes.addressof(val_ps[i]) + + benchmark(launch_packed, func, stream, packagedParams) + + cuda.cuCtxSynchronize() + + for p in vals: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +def pack_and_launch(kernel, stream, params): + packed_params = (ctypes.c_void_p * len(params))() + ptrs = [0] * len(params) + for i in range(len(params)): + ptrs[i] = ctypes.c_void_p(int(params[i])) + packed_params[i] = ctypes.addressof(ptrs[i]) + + cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_params, 0) + + +# Measure launch latency plus parameter packing using ctypes +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") + ASSERT_DRV(err) + + vals = [] + for i in range(512): + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + vals.append(p) + + benchmark(pack_and_launch, func, stream, vals) + + cuda.cuCtxSynchronize() + + for p in vals: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +# Measure launch latency with a single large struct parameter +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_2048B(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_2048B") + ASSERT_DRV(err) + + class struct_2048B(ctypes.Structure): + _fields_ = [("values", ctypes.c_uint8 * 2048)] + + benchmark(launch, func, stream, args=(struct_2048B(),), arg_types=(None,)) + + cuda.cuCtxSynchronize() diff --git a/cuda_bindings/tests/test_nvjitlink.py b/cuda_bindings/tests/test_nvjitlink.py index d92a3ca7..839c7be1 100644 --- a/cuda_bindings/tests/test_nvjitlink.py +++ b/cuda_bindings/tests/test_nvjitlink.py @@ -1,168 +1,168 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
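# The nvjitlink tests below drive the full linking workflow exposed by
# cuda.bindings.nvjitlink. The core sequence they cover, sketched under the
# assumption that `ptx_bytes` holds valid PTX for the chosen arch:
#
#     from cuda.bindings import nvjitlink
#
#     handle = nvjitlink.create(1, ["-arch=sm_80"])   # num options, options
#     nvjitlink.add_data(handle, nvjitlink.InputType.ANY,
#                        ptx_bytes, len(ptx_bytes), "example")
#     nvjitlink.complete(handle)                      # perform the link
#     cubin = bytearray(nvjitlink.get_linked_cubin_size(handle))
#     nvjitlink.get_linked_cubin(handle, cubin)       # fetch the linked CUBIN
#     nvjitlink.destroy(handle)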
-# -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import pytest - -from cuda.bindings import nvjitlink, nvrtc - -# Establish a handful of compatible architectures and PTX versions to test with -ARCHITECTURES = ["sm_60", "sm_75", "sm_80", "sm_90"] -PTX_VERSIONS = ["5.0", "6.4", "7.0", "8.5"] - - -def ptx_header(version, arch): - return f""" -.version {version} -.target {arch} -.address_size 64 -""" - - -ptx_kernel = """ -.visible .entry _Z6kernelPi( - .param .u64 _Z6kernelPi_param_0 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<3>; - .reg .b64 %rd<3>; - - ld.param.u64 %rd1, [_Z6kernelPi_param_0]; - cvta.to.global.u64 %rd2, %rd1; - mov.u32 %r1, %tid.x; - st.global.u32 [%rd2+0], %r1; - ret; -} -""" - -minimal_ptx_kernel = """ -.func _MinimalKernel() -{ - ret; -} -""" - -ptx_kernel_bytes = [ - (ptx_header(version, arch) + ptx_kernel).encode("utf-8") for version, arch in zip(PTX_VERSIONS, ARCHITECTURES) -] -minimal_ptx_kernel_bytes = [ - (ptx_header(version, arch) + minimal_ptx_kernel).encode("utf-8") - for version, arch in zip(PTX_VERSIONS, ARCHITECTURES) -] - - -# create a valid LTOIR input for testing -@pytest.fixture -def get_dummy_ltoir(): - def CHECK_NVRTC(err): - if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: - raise RuntimeError(f"Nvrtc Error: {err}") - - empty_cplusplus_kernel = "__global__ void A() {}" - err, program_handle = nvrtc.nvrtcCreateProgram(empty_cplusplus_kernel.encode(), b"", 0, [], []) - CHECK_NVRTC(err) - nvrtc.nvrtcCompileProgram(program_handle, 1, [b"-dlto"]) - err, size = nvrtc.nvrtcGetLTOIRSize(program_handle) - CHECK_NVRTC(err) - empty_kernel_ltoir = b" " * size - (err,) = nvrtc.nvrtcGetLTOIR(program_handle, empty_kernel_ltoir) - CHECK_NVRTC(err) - (err,) = nvrtc.nvrtcDestroyProgram(program_handle) - CHECK_NVRTC(err) - return empty_kernel_ltoir - - -def test_unrecognized_option_error(): - with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"): - nvjitlink.create(1, ["-fictitious_option"]) - - -def test_invalid_arch_error(): - with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"): - nvjitlink.create(1, ["-arch=sm_XX"]) - - -@pytest.mark.parametrize("option", ARCHITECTURES) -def test_create_and_destroy(option): - handle = nvjitlink.create(1, [f"-arch={option}"]) - assert handle != 0 - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option", ARCHITECTURES) -def test_complete_empty(option): - handle = nvjitlink.create(1, [f"-arch={option}"]) - nvjitlink.complete(handle) - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) -def test_add_data(option, ptx_bytes): - handle = nvjitlink.create(1, [f"-arch={option}"]) - nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") - nvjitlink.complete(handle) - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) -def test_add_file(option, ptx_bytes, tmp_path): - handle = nvjitlink.create(1, [f"-arch={option}"]) - file_path = tmp_path / "test_file.cubin" - file_path.write_bytes(ptx_bytes) - nvjitlink.add_file(handle, nvjitlink.InputType.ANY, str(file_path)) - nvjitlink.complete(handle) - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option", ARCHITECTURES) -def test_get_error_log(option): - handle = nvjitlink.create(1, [f"-arch={option}"]) - nvjitlink.complete(handle) - log_size = nvjitlink.get_error_log_size(handle) - log = bytearray(log_size) - nvjitlink.get_error_log(handle, log) - assert 
len(log) == log_size - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) -def test_get_info_log(option, ptx_bytes): - handle = nvjitlink.create(1, [f"-arch={option}"]) - nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") - nvjitlink.complete(handle) - log_size = nvjitlink.get_info_log_size(handle) - log = bytearray(log_size) - nvjitlink.get_info_log(handle, log) - assert len(log) == log_size - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) -def test_get_linked_cubin(option, ptx_bytes): - handle = nvjitlink.create(1, [f"-arch={option}"]) - nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") - nvjitlink.complete(handle) - cubin_size = nvjitlink.get_linked_cubin_size(handle) - cubin = bytearray(cubin_size) - nvjitlink.get_linked_cubin(handle, cubin) - assert len(cubin) == cubin_size - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option", ARCHITECTURES) -def test_get_linked_ptx(option, get_dummy_ltoir): - handle = nvjitlink.create(3, [f"-arch={option}", "-lto", "-ptx"]) - nvjitlink.add_data(handle, nvjitlink.InputType.LTOIR, get_dummy_ltoir, len(get_dummy_ltoir), "test_data") - nvjitlink.complete(handle) - ptx_size = nvjitlink.get_linked_ptx_size(handle) - ptx = bytearray(ptx_size) - nvjitlink.get_linked_ptx(handle, ptx) - assert len(ptx) == ptx_size - nvjitlink.destroy(handle) - - -def test_package_version(): - ver = nvjitlink.version() - assert len(ver) == 2 - assert ver >= (12, 0) +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import pytest + +from cuda.bindings import nvjitlink, nvrtc + +# Establish a handful of compatible architectures and PTX versions to test with +ARCHITECTURES = ["sm_60", "sm_75", "sm_80", "sm_90"] +PTX_VERSIONS = ["5.0", "6.4", "7.0", "8.5"] + + +def ptx_header(version, arch): + return f""" +.version {version} +.target {arch} +.address_size 64 +""" + + +ptx_kernel = """ +.visible .entry _Z6kernelPi( + .param .u64 _Z6kernelPi_param_0 +) +{ + .reg .pred %p<2>; + .reg .b32 %r<3>; + .reg .b64 %rd<3>; + + ld.param.u64 %rd1, [_Z6kernelPi_param_0]; + cvta.to.global.u64 %rd2, %rd1; + mov.u32 %r1, %tid.x; + st.global.u32 [%rd2+0], %r1; + ret; +} +""" + +minimal_ptx_kernel = """ +.func _MinimalKernel() +{ + ret; +} +""" + +ptx_kernel_bytes = [ + (ptx_header(version, arch) + ptx_kernel).encode("utf-8") for version, arch in zip(PTX_VERSIONS, ARCHITECTURES) +] +minimal_ptx_kernel_bytes = [ + (ptx_header(version, arch) + minimal_ptx_kernel).encode("utf-8") + for version, arch in zip(PTX_VERSIONS, ARCHITECTURES) +] + + +# create a valid LTOIR input for testing +@pytest.fixture +def get_dummy_ltoir(): + def CHECK_NVRTC(err): + if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: + raise RuntimeError(f"Nvrtc Error: {err}") + + empty_cplusplus_kernel = "__global__ void A() {}" + err, program_handle = nvrtc.nvrtcCreateProgram(empty_cplusplus_kernel.encode(), b"", 0, [], []) + CHECK_NVRTC(err) + nvrtc.nvrtcCompileProgram(program_handle, 1, [b"-dlto"]) + err, size = nvrtc.nvrtcGetLTOIRSize(program_handle) + CHECK_NVRTC(err) + empty_kernel_ltoir = b" " * size + (err,) = nvrtc.nvrtcGetLTOIR(program_handle, empty_kernel_ltoir) + CHECK_NVRTC(err) + (err,) = nvrtc.nvrtcDestroyProgram(program_handle) + CHECK_NVRTC(err) + return empty_kernel_ltoir + + +def 
test_unrecognized_option_error(): + with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"): + nvjitlink.create(1, ["-fictitious_option"]) + + +def test_invalid_arch_error(): + with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"): + nvjitlink.create(1, ["-arch=sm_XX"]) + + +@pytest.mark.parametrize("option", ARCHITECTURES) +def test_create_and_destroy(option): + handle = nvjitlink.create(1, [f"-arch={option}"]) + assert handle != 0 + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option", ARCHITECTURES) +def test_complete_empty(option): + handle = nvjitlink.create(1, [f"-arch={option}"]) + nvjitlink.complete(handle) + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) +def test_add_data(option, ptx_bytes): + handle = nvjitlink.create(1, [f"-arch={option}"]) + nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") + nvjitlink.complete(handle) + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) +def test_add_file(option, ptx_bytes, tmp_path): + handle = nvjitlink.create(1, [f"-arch={option}"]) + file_path = tmp_path / "test_file.cubin" + file_path.write_bytes(ptx_bytes) + nvjitlink.add_file(handle, nvjitlink.InputType.ANY, str(file_path)) + nvjitlink.complete(handle) + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option", ARCHITECTURES) +def test_get_error_log(option): + handle = nvjitlink.create(1, [f"-arch={option}"]) + nvjitlink.complete(handle) + log_size = nvjitlink.get_error_log_size(handle) + log = bytearray(log_size) + nvjitlink.get_error_log(handle, log) + assert len(log) == log_size + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) +def test_get_info_log(option, ptx_bytes): + handle = nvjitlink.create(1, [f"-arch={option}"]) + nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") + nvjitlink.complete(handle) + log_size = nvjitlink.get_info_log_size(handle) + log = bytearray(log_size) + nvjitlink.get_info_log(handle, log) + assert len(log) == log_size + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) +def test_get_linked_cubin(option, ptx_bytes): + handle = nvjitlink.create(1, [f"-arch={option}"]) + nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") + nvjitlink.complete(handle) + cubin_size = nvjitlink.get_linked_cubin_size(handle) + cubin = bytearray(cubin_size) + nvjitlink.get_linked_cubin(handle, cubin) + assert len(cubin) == cubin_size + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option", ARCHITECTURES) +def test_get_linked_ptx(option, get_dummy_ltoir): + handle = nvjitlink.create(3, [f"-arch={option}", "-lto", "-ptx"]) + nvjitlink.add_data(handle, nvjitlink.InputType.LTOIR, get_dummy_ltoir, len(get_dummy_ltoir), "test_data") + nvjitlink.complete(handle) + ptx_size = nvjitlink.get_linked_ptx_size(handle) + ptx = bytearray(ptx_size) + nvjitlink.get_linked_ptx(handle, ptx) + assert len(ptx) == ptx_size + nvjitlink.destroy(handle) + + +def test_package_version(): + ver = nvjitlink.version() + assert len(ver) == 2 + assert ver >= (12, 0) diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py index 9b94ecd3..9a9432cb 100644 --- 
a/cuda_core/tests/example_tests/test_basic_examples.py +++ b/cuda_core/tests/example_tests/test_basic_examples.py @@ -1,25 +1,25 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -# If we have subcategories of examples in the future, this file can be split along those lines - -import glob -import os - -import pytest - -from .utils import run_example - -samples_path = os.path.join(os.path.dirname(__file__), "..", "..", "examples") -sample_files = glob.glob(samples_path + "**/*.py", recursive=True) - - -@pytest.mark.parametrize("example", sample_files) -class TestExamples: - def test_example(self, example, deinit_cuda): - run_example(samples_path, example) +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +# If we have subcategories of examples in the future, this file can be split along those lines + +import glob +import os + +import pytest + +from .utils import run_example + +samples_path = os.path.join(os.path.dirname(__file__), "..", "..", "examples") +sample_files = glob.glob(samples_path + "**/*.py", recursive=True) + + +@pytest.mark.parametrize("example", sample_files) +class TestExamples: + def test_example(self, example, deinit_cuda): + run_example(samples_path, example) diff --git a/cuda_core/tests/example_tests/utils.py b/cuda_core/tests/example_tests/utils.py index f6ac3e15..3d218a91 100644 --- a/cuda_core/tests/example_tests/utils.py +++ b/cuda_core/tests/example_tests/utils.py @@ -1,56 +1,56 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
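# The run_example helper below executes each sample in-process: it swaps in
# the sample's argv and sys.path, exec()s the source, and skips the test when
# an optional dependency such as cupy is missing. Typical use from a test
# (the sample name here is hypothetical):
#
#     run_example(samples_path, "vector_add.py")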
- -import gc -import os -import sys - -import cupy as cp -import pytest - - -class SampleTestError(Exception): - pass - - -def parse_python_script(filepath): - if not filepath.endswith(".py"): - raise ValueError(f"{filepath} not supported") - with open(filepath, encoding="utf-8") as f: - script = f.read() - return script - - -def run_example(samples_path, filename, env=None): - fullpath = os.path.join(samples_path, filename) - script = parse_python_script(fullpath) - try: - old_argv = sys.argv - sys.argv = [fullpath] - old_sys_path = sys.path.copy() - sys.path.append(samples_path) - exec(script, env if env else {}) - except ImportError as e: - # for samples requiring any of optional dependencies - for m in ("cupy",): - if f"No module named '{m}'" in str(e): - pytest.skip(f"{m} not installed, skipping related tests") - break - else: - raise - except Exception as e: - msg = "\n" - msg += f"Got error ({filename}):\n" - msg += str(e) - raise SampleTestError(msg) from e - finally: - sys.path = old_sys_path - sys.argv = old_argv - # further reduce the memory watermark - gc.collect() - cp.get_default_memory_pool().free_all_blocks() +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import gc +import os +import sys + +import cupy as cp +import pytest + + +class SampleTestError(Exception): + pass + + +def parse_python_script(filepath): + if not filepath.endswith(".py"): + raise ValueError(f"{filepath} not supported") + with open(filepath, encoding="utf-8") as f: + script = f.read() + return script + + +def run_example(samples_path, filename, env=None): + fullpath = os.path.join(samples_path, filename) + script = parse_python_script(fullpath) + try: + old_argv = sys.argv + sys.argv = [fullpath] + old_sys_path = sys.path.copy() + sys.path.append(samples_path) + exec(script, env if env else {}) + except ImportError as e: + # for samples requiring any of optional dependencies + for m in ("cupy",): + if f"No module named '{m}'" in str(e): + pytest.skip(f"{m} not installed, skipping related tests") + break + else: + raise + except Exception as e: + msg = "\n" + msg += f"Got error ({filename}):\n" + msg += str(e) + raise SampleTestError(msg) from e + finally: + sys.path = old_sys_path + sys.argv = old_argv + # further reduce the memory watermark + gc.collect() + cp.get_default_memory_pool().free_all_blocks() diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py index afc3ed5b..876299f3 100644 --- a/cuda_core/tests/test_device.py +++ b/cuda_core/tests/test_device.py @@ -1,80 +1,80 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
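# The device tests below validate Device properties against direct driver and
# runtime queries. The underlying per-device attribute pattern, sketched with
# an assumed device id `dev_id` (any cudaDeviceAttr works the same way):
#
#     from cuda import cudart
#     from cuda.core.experimental._utils import handle_return
#
#     pools = handle_return(cudart.cudaDeviceGetAttribute(
#         cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, dev_id))
#     # pools == 1 means stream-ordered memory pools are supported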
- -try: - from cuda.bindings import driver, runtime -except ImportError: - from cuda import cuda as driver - from cuda import cudart as runtime - -from cuda.core.experimental import Device -from cuda.core.experimental._utils import ComputeCapability, handle_return - - -def test_device_set_current(deinit_cuda): - device = Device() - device.set_current() - assert handle_return(driver.cuCtxGetCurrent()) is not None - - -def test_device_repr(): - device = Device(0) - assert str(device).startswith("= 11040: - uuid = handle_return(driver.cuDeviceGetUuid_v2(device.device_id)) - else: - uuid = handle_return(driver.cuDeviceGetUuid(device.device_id)) - uuid = uuid.bytes.hex() - expected_uuid = f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}" - assert device.uuid == expected_uuid - - -def test_name(): - device = Device() - name = handle_return(driver.cuDeviceGetName(128, device.device_id)) - name = name.split(b"\0")[0] - assert device.name == name.decode() - - -def test_compute_capability(): - device = Device() - major = handle_return( - runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device.device_id) - ) - minor = handle_return( - runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device.device_id) - ) - expected_cc = ComputeCapability(major, minor) - assert device.compute_capability == expected_cc +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +try: + from cuda.bindings import driver, runtime +except ImportError: + from cuda import cuda as driver + from cuda import cudart as runtime + +from cuda.core.experimental import Device +from cuda.core.experimental._utils import ComputeCapability, handle_return + + +def test_device_set_current(deinit_cuda): + device = Device() + device.set_current() + assert handle_return(driver.cuCtxGetCurrent()) is not None + + +def test_device_repr(): + device = Device(0) + assert str(device).startswith("= 11040: + uuid = handle_return(driver.cuDeviceGetUuid_v2(device.device_id)) + else: + uuid = handle_return(driver.cuDeviceGetUuid(device.device_id)) + uuid = uuid.bytes.hex() + expected_uuid = f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}" + assert device.uuid == expected_uuid + + +def test_name(): + device = Device() + name = handle_return(driver.cuDeviceGetName(128, device.device_id)) + name = name.split(b"\0")[0] + assert device.name == name.decode() + + +def test_compute_capability(): + device = Device() + major = handle_return( + runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device.device_id) + ) + minor = handle_return( + runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device.device_id) + ) + expected_cc = ComputeCapability(major, minor) + assert device.compute_capability == expected_cc diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 21548078..0d650b4f 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -1,46 +1,46 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. 
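# The event tests below cover EventOptions: enable_timing toggles timestamp
# recording, and busy_waited_sync requests spin-waiting during sync. The
# basic record/sync pattern, in brief:
#
#     from cuda.core.experimental import Device, EventOptions
#
#     stream = Device().create_stream()
#     event = stream.record(options=EventOptions(enable_timing=False))
#     event.sync()          # blocks until the recorded work completes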
-# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -import pytest - -from cuda.core.experimental import Device, EventOptions - - -@pytest.mark.parametrize("enable_timing", [True, False, None]) -def test_timing(init_cuda, enable_timing): - options = EventOptions(enable_timing=enable_timing) - stream = Device().create_stream() - event = stream.record(options=options) - assert event.is_timing_disabled == (not enable_timing if enable_timing is not None else True) - - -def test_is_sync_busy_waited(init_cuda): - options = EventOptions(enable_timing=False, busy_waited_sync=True) - stream = Device().create_stream() - event = stream.record(options=options) - assert event.is_sync_busy_waited is True - - options = EventOptions(enable_timing=False) - stream = Device().create_stream() - event = stream.record(options=options) - assert event.is_sync_busy_waited is False - - -def test_sync(init_cuda): - options = EventOptions(enable_timing=False) - stream = Device().create_stream() - event = stream.record(options=options) - event.sync() - assert event.is_done is True - - -def test_is_done(init_cuda): - options = EventOptions(enable_timing=False) - stream = Device().create_stream() - event = stream.record(options=options) - assert event.is_done is True +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import pytest + +from cuda.core.experimental import Device, EventOptions + + +@pytest.mark.parametrize("enable_timing", [True, False, None]) +def test_timing(init_cuda, enable_timing): + options = EventOptions(enable_timing=enable_timing) + stream = Device().create_stream() + event = stream.record(options=options) + assert event.is_timing_disabled == (not enable_timing if enable_timing is not None else True) + + +def test_is_sync_busy_waited(init_cuda): + options = EventOptions(enable_timing=False, busy_waited_sync=True) + stream = Device().create_stream() + event = stream.record(options=options) + assert event.is_sync_busy_waited is True + + options = EventOptions(enable_timing=False) + stream = Device().create_stream() + event = stream.record(options=options) + assert event.is_sync_busy_waited is False + + +def test_sync(init_cuda): + options = EventOptions(enable_timing=False) + stream = Device().create_stream() + event = stream.record(options=options) + event.sync() + assert event.is_done is True + + +def test_is_done(init_cuda): + options = EventOptions(enable_timing=False) + stream = Device().create_stream() + event = stream.record(options=options) + assert event.is_done is True diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index 874d7f07..08f7e6d3 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -1,68 +1,68 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. 
-# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -import pytest - -from cuda.core.experimental import Device, LaunchConfig, Stream - - -def test_launch_config_init(init_cuda): - config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=0) - assert config.grid == (1, 1, 1) - assert config.block == (1, 1, 1) - assert config.stream is None - assert config.shmem_size == 0 - - config = LaunchConfig(grid=(2, 2, 2), block=(2, 2, 2), stream=Device().create_stream(), shmem_size=1024) - assert config.grid == (2, 2, 2) - assert config.block == (2, 2, 2) - assert isinstance(config.stream, Stream) - assert config.shmem_size == 1024 - - -def test_launch_config_cast_to_3_tuple(): - config = LaunchConfig(grid=1, block=1) - assert config._cast_to_3_tuple(1) == (1, 1, 1) - assert config._cast_to_3_tuple((1, 2)) == (1, 2, 1) - assert config._cast_to_3_tuple((1, 2, 3)) == (1, 2, 3) - - # Edge cases - assert config._cast_to_3_tuple(999) == (999, 1, 1) - assert config._cast_to_3_tuple((999, 888)) == (999, 888, 1) - assert config._cast_to_3_tuple((999, 888, 777)) == (999, 888, 777) - - -def test_launch_config_invalid_values(): - with pytest.raises(ValueError): - LaunchConfig(grid=0, block=1) - - with pytest.raises(ValueError): - LaunchConfig(grid=(0, 1), block=1) - - with pytest.raises(ValueError): - LaunchConfig(grid=(1, 1, 1), block=0) - - with pytest.raises(ValueError): - LaunchConfig(grid=(1, 1, 1), block=(0, 1)) - - -def test_launch_config_stream(init_cuda): - stream = Device().create_stream() - config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=stream, shmem_size=0) - assert config.stream == stream - - with pytest.raises(ValueError): - LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream="invalid_stream", shmem_size=0) - - -def test_launch_config_shmem_size(): - config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=2048) - assert config.shmem_size == 2048 - - config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None) - assert config.shmem_size == 0 +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. 
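# The launcher tests below cover LaunchConfig normalization: grid and block
# accept an int or a tuple of up to three extents and are padded to three
# dimensions. A minimal sketch (extents are illustrative):
#
#     from cuda.core.experimental import Device, LaunchConfig
#
#     cfg = LaunchConfig(grid=(32, 4), block=256,
#                        stream=Device().create_stream(), shmem_size=0)
#     # grid normalizes to (32, 4, 1) and block to (256, 1, 1)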
+ +import pytest + +from cuda.core.experimental import Device, LaunchConfig, Stream + + +def test_launch_config_init(init_cuda): + config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=0) + assert config.grid == (1, 1, 1) + assert config.block == (1, 1, 1) + assert config.stream is None + assert config.shmem_size == 0 + + config = LaunchConfig(grid=(2, 2, 2), block=(2, 2, 2), stream=Device().create_stream(), shmem_size=1024) + assert config.grid == (2, 2, 2) + assert config.block == (2, 2, 2) + assert isinstance(config.stream, Stream) + assert config.shmem_size == 1024 + + +def test_launch_config_cast_to_3_tuple(): + config = LaunchConfig(grid=1, block=1) + assert config._cast_to_3_tuple(1) == (1, 1, 1) + assert config._cast_to_3_tuple((1, 2)) == (1, 2, 1) + assert config._cast_to_3_tuple((1, 2, 3)) == (1, 2, 3) + + # Edge cases + assert config._cast_to_3_tuple(999) == (999, 1, 1) + assert config._cast_to_3_tuple((999, 888)) == (999, 888, 1) + assert config._cast_to_3_tuple((999, 888, 777)) == (999, 888, 777) + + +def test_launch_config_invalid_values(): + with pytest.raises(ValueError): + LaunchConfig(grid=0, block=1) + + with pytest.raises(ValueError): + LaunchConfig(grid=(0, 1), block=1) + + with pytest.raises(ValueError): + LaunchConfig(grid=(1, 1, 1), block=0) + + with pytest.raises(ValueError): + LaunchConfig(grid=(1, 1, 1), block=(0, 1)) + + +def test_launch_config_stream(init_cuda): + stream = Device().create_stream() + config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=stream, shmem_size=0) + assert config.stream == stream + + with pytest.raises(ValueError): + LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream="invalid_stream", shmem_size=0) + + +def test_launch_config_shmem_size(): + config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=2048) + assert config.shmem_size == 2048 + + config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None) + assert config.shmem_size == 0 diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index c78b5673..a48db69b 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1,213 +1,213 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
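# The dummy resources below all implement the MemoryResource protocol that
# cuda.core buffers build on: allocate()/deallocate() plus the
# is_device_accessible, is_host_accessible and device_id properties. The
# minimal shape of a conforming resource, sketched for plain device memory:
#
#     class MyDeviceResource(MemoryResource):
#         def allocate(self, size, stream=None) -> Buffer:
#             return Buffer(ptr=handle_return(driver.cuMemAlloc(size)),
#                           size=size, mr=self)
#         def deallocate(self, ptr, size, stream=None):
#             handle_return(driver.cuMemFree(ptr))
#         @property
#         def is_device_accessible(self):
#             return True
#         # is_host_accessible and device_id follow the same property pattern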
- -try: - from cuda.bindings import driver -except ImportError: - from cuda import cuda as driver - -import ctypes - -from cuda.core.experimental import Device -from cuda.core.experimental._memory import Buffer, MemoryResource -from cuda.core.experimental._utils import handle_return - - -class DummyDeviceMemoryResource(MemoryResource): - def __init__(self, device): - self.device = device - - def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(driver.cuMemAlloc(size)) - return Buffer(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - handle_return(driver.cuMemFree(ptr)) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return False - - @property - def device_id(self) -> int: - return 0 - - -class DummyHostMemoryResource(MemoryResource): - def __init__(self): - pass - - def allocate(self, size, stream=None) -> Buffer: - # Allocate a ctypes buffer of size `size` - ptr = (ctypes.c_byte * size)() - return Buffer(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - # the memory is deallocated per the ctypes deallocation at garbage collection time - pass - - @property - def is_device_accessible(self) -> bool: - return False - - @property - def is_host_accessible(self) -> bool: - return True - - @property - def device_id(self) -> int: - raise RuntimeError("the pinned memory resource is not bound to any GPU") - - -class DummyUnifiedMemoryResource(MemoryResource): - def __init__(self, device): - self.device = device - - def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)) - return Buffer(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - handle_return(driver.cuMemFree(ptr)) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return True - - @property - def device_id(self) -> int: - return 0 - - -class DummyPinnedMemoryResource(MemoryResource): - def __init__(self, device): - self.device = device - - def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(driver.cuMemAllocHost(size)) - return Buffer(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - handle_return(driver.cuMemFreeHost(ptr)) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return True - - @property - def device_id(self) -> int: - raise RuntimeError("the pinned memory resource is not bound to any GPU") - - -def buffer_initialization(dummy_mr: MemoryResource): - buffer = dummy_mr.allocate(size=1024) - assert buffer.handle != 0 - assert buffer.size == 1024 - assert buffer.memory_resource == dummy_mr - assert buffer.is_device_accessible == dummy_mr.is_device_accessible - assert buffer.is_host_accessible == dummy_mr.is_host_accessible - buffer.close() - - -def test_buffer_initialization(): - device = Device() - device.set_current() - buffer_initialization(DummyDeviceMemoryResource(device)) - buffer_initialization(DummyHostMemoryResource()) - buffer_initialization(DummyUnifiedMemoryResource(device)) - buffer_initialization(DummyPinnedMemoryResource(device)) - - -def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False): - src_buffer = dummy_mr.allocate(size=1024) - dst_buffer = dummy_mr.allocate(size=1024) - stream = device.create_stream() - - if check: 
- src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) - for i in range(1024): - src_ptr[i] = ctypes.c_byte(i) - - src_buffer.copy_to(dst_buffer, stream=stream) - device.sync() - - if check: - dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte)) - - for i in range(10): - assert dst_ptr[i] == src_ptr[i] - - dst_buffer.close() - src_buffer.close() - - -def test_buffer_copy_to(): - device = Device() - device.set_current() - buffer_copy_to(DummyDeviceMemoryResource(device), device) - buffer_copy_to(DummyUnifiedMemoryResource(device), device) - buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True) - - -def buffer_copy_from(dummy_mr: MemoryResource, device, check=False): - src_buffer = dummy_mr.allocate(size=1024) - dst_buffer = dummy_mr.allocate(size=1024) - stream = device.create_stream() - - if check: - src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) - for i in range(1024): - src_ptr[i] = ctypes.c_byte(i) - - dst_buffer.copy_from(src_buffer, stream=stream) - device.sync() - - if check: - dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte)) - - for i in range(10): - assert dst_ptr[i] == src_ptr[i] - - dst_buffer.close() - src_buffer.close() - - -def test_buffer_copy_from(): - device = Device() - device.set_current() - buffer_copy_from(DummyDeviceMemoryResource(device), device) - buffer_copy_from(DummyUnifiedMemoryResource(device), device) - buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True) - - -def buffer_close(dummy_mr: MemoryResource): - buffer = dummy_mr.allocate(size=1024) - buffer.close() - assert buffer.handle == 0 - assert buffer.memory_resource is None - - -def test_buffer_close(): - device = Device() - device.set_current() - buffer_close(DummyDeviceMemoryResource(device)) - buffer_close(DummyHostMemoryResource()) - buffer_close(DummyUnifiedMemoryResource(device)) - buffer_close(DummyPinnedMemoryResource(device)) +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. 
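# The copy pattern these tests verify, in brief (both buffers must be the
# same size, and the device is synchronized before the result is inspected):
#
#     stream = device.create_stream()
#     src_buffer.copy_to(dst_buffer, stream=stream)   # or dst.copy_from(src)
#     device.sync()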
+ +try: + from cuda.bindings import driver +except ImportError: + from cuda import cuda as driver + +import ctypes + +from cuda.core.experimental import Device +from cuda.core.experimental._memory import Buffer, MemoryResource +from cuda.core.experimental._utils import handle_return + + +class DummyDeviceMemoryResource(MemoryResource): + def __init__(self, device): + self.device = device + + def allocate(self, size, stream=None) -> Buffer: + ptr = handle_return(driver.cuMemAlloc(size)) + return Buffer(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + handle_return(driver.cuMemFree(ptr)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return False + + @property + def device_id(self) -> int: + return 0 + + +class DummyHostMemoryResource(MemoryResource): + def __init__(self): + pass + + def allocate(self, size, stream=None) -> Buffer: + # Allocate a ctypes buffer of size `size` + ptr = (ctypes.c_byte * size)() + return Buffer(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + # the memory is deallocated per the ctypes deallocation at garbage collection time + pass + + @property + def is_device_accessible(self) -> bool: + return False + + @property + def is_host_accessible(self) -> bool: + return True + + @property + def device_id(self) -> int: + raise RuntimeError("the pinned memory resource is not bound to any GPU") + + +class DummyUnifiedMemoryResource(MemoryResource): + def __init__(self, device): + self.device = device + + def allocate(self, size, stream=None) -> Buffer: + ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)) + return Buffer(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + handle_return(driver.cuMemFree(ptr)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return True + + @property + def device_id(self) -> int: + return 0 + + +class DummyPinnedMemoryResource(MemoryResource): + def __init__(self, device): + self.device = device + + def allocate(self, size, stream=None) -> Buffer: + ptr = handle_return(driver.cuMemAllocHost(size)) + return Buffer(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + handle_return(driver.cuMemFreeHost(ptr)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return True + + @property + def device_id(self) -> int: + raise RuntimeError("the pinned memory resource is not bound to any GPU") + + +def buffer_initialization(dummy_mr: MemoryResource): + buffer = dummy_mr.allocate(size=1024) + assert buffer.handle != 0 + assert buffer.size == 1024 + assert buffer.memory_resource == dummy_mr + assert buffer.is_device_accessible == dummy_mr.is_device_accessible + assert buffer.is_host_accessible == dummy_mr.is_host_accessible + buffer.close() + + +def test_buffer_initialization(): + device = Device() + device.set_current() + buffer_initialization(DummyDeviceMemoryResource(device)) + buffer_initialization(DummyHostMemoryResource()) + buffer_initialization(DummyUnifiedMemoryResource(device)) + buffer_initialization(DummyPinnedMemoryResource(device)) + + +def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False): + src_buffer = dummy_mr.allocate(size=1024) + dst_buffer = dummy_mr.allocate(size=1024) + stream = device.create_stream() + + if check: 
+ src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) + for i in range(1024): + src_ptr[i] = ctypes.c_byte(i) + + src_buffer.copy_to(dst_buffer, stream=stream) + device.sync() + + if check: + dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte)) + + for i in range(10): + assert dst_ptr[i] == src_ptr[i] + + dst_buffer.close() + src_buffer.close() + + +def test_buffer_copy_to(): + device = Device() + device.set_current() + buffer_copy_to(DummyDeviceMemoryResource(device), device) + buffer_copy_to(DummyUnifiedMemoryResource(device), device) + buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True) + + +def buffer_copy_from(dummy_mr: MemoryResource, device, check=False): + src_buffer = dummy_mr.allocate(size=1024) + dst_buffer = dummy_mr.allocate(size=1024) + stream = device.create_stream() + + if check: + src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) + for i in range(1024): + src_ptr[i] = ctypes.c_byte(i) + + dst_buffer.copy_from(src_buffer, stream=stream) + device.sync() + + if check: + dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte)) + + for i in range(10): + assert dst_ptr[i] == src_ptr[i] + + dst_buffer.close() + src_buffer.close() + + +def test_buffer_copy_from(): + device = Device() + device.set_current() + buffer_copy_from(DummyDeviceMemoryResource(device), device) + buffer_copy_from(DummyUnifiedMemoryResource(device), device) + buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True) + + +def buffer_close(dummy_mr: MemoryResource): + buffer = dummy_mr.allocate(size=1024) + buffer.close() + assert buffer.handle == 0 + assert buffer.memory_resource is None + + +def test_buffer_close(): + device = Device() + device.set_current() + buffer_close(DummyDeviceMemoryResource(device)) + buffer_close(DummyHostMemoryResource()) + buffer_close(DummyUnifiedMemoryResource(device)) + buffer_close(DummyPinnedMemoryResource(device)) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 5f0b6056..a976726f 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -1,48 +1,48 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
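# The module tests below construct ObjectCode directly from bytes. Supported
# code types are "cubin", "ptx" and "fatbin"; any other type raises
# ValueError. In brief (dummy payload, as in the tests):
#
#     obj = ObjectCode(b"dummy_data", "ptx")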
- -import importlib - -import pytest - -from cuda.core.experimental._module import ObjectCode - - -@pytest.mark.skipif( - int(importlib.metadata.version("cuda-python").split(".")[0]) < 12, - reason="Module loading for older drivers validate require valid module code.", -) -def test_object_code_initialization(): - # Test with supported code types - for code_type in ["cubin", "ptx", "fatbin"]: - module_data = b"dummy_data" - obj_code = ObjectCode(module_data, code_type) - assert obj_code._code_type == code_type - assert obj_code._module == module_data - assert obj_code._handle is not None - - # Test with unsupported code type - with pytest.raises(ValueError): - ObjectCode(b"dummy_data", "unsupported_code_type") - - -# TODO add ObjectCode tests which provide the appropriate data for cuLibraryLoadFromFile -def test_object_code_initialization_with_str(): - assert True - - -def test_object_code_initialization_with_jit_options(): - assert True - - -def test_object_code_get_kernel(): - assert True - - -def test_kernel_from_obj(): - assert True +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import importlib + +import pytest + +from cuda.core.experimental._module import ObjectCode + + +@pytest.mark.skipif( + int(importlib.metadata.version("cuda-python").split(".")[0]) < 12, + reason="Module loading for older drivers validate require valid module code.", +) +def test_object_code_initialization(): + # Test with supported code types + for code_type in ["cubin", "ptx", "fatbin"]: + module_data = b"dummy_data" + obj_code = ObjectCode(module_data, code_type) + assert obj_code._code_type == code_type + assert obj_code._module == module_data + assert obj_code._handle is not None + + # Test with unsupported code type + with pytest.raises(ValueError): + ObjectCode(b"dummy_data", "unsupported_code_type") + + +# TODO add ObjectCode tests which provide the appropriate data for cuLibraryLoadFromFile +def test_object_code_initialization_with_str(): + assert True + + +def test_object_code_initialization_with_jit_options(): + assert True + + +def test_object_code_get_kernel(): + assert True + + +def test_kernel_from_obj(): + assert True diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index af94a7ba..95c4d377 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -1,66 +1,66 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
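# The program tests below cover the nvrtc-backed flow from C++ source to a
# loadable kernel. In brief:
#
#     prog = Program('extern "C" __global__ void my_kernel() {}', "c++")
#     mod = prog.compile("ptx")             # returns an ObjectCode
#     kernel = mod.get_kernel("my_kernel")  # returns a Kernel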
- -import pytest - -from cuda.core.experimental import Program -from cuda.core.experimental._module import Kernel, ObjectCode - - -def test_program_init_valid_code_type(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - assert program.backend == "nvrtc" - assert program.handle is not None - - -def test_program_init_invalid_code_type(): - code = 'extern "C" __global__ void my_kernel() {}' - with pytest.raises(NotImplementedError): - Program(code, "python") - - -def test_program_init_invalid_code_format(): - code = 12345 - with pytest.raises(TypeError): - Program(code, "c++") - - -def test_program_compile_valid_target_type(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - object_code = program.compile("ptx") - kernel = object_code.get_kernel("my_kernel") - assert isinstance(object_code, ObjectCode) - assert isinstance(kernel, Kernel) - - -def test_program_compile_invalid_target_type(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - with pytest.raises(NotImplementedError): - program.compile("invalid_target") - - -def test_program_backend_property(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - assert program.backend == "nvrtc" - - -def test_program_handle_property(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - assert program.handle is not None - - -def test_program_close(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - program.close() - assert program.handle is None +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. 
+ +import pytest + +from cuda.core.experimental import Program +from cuda.core.experimental._module import Kernel, ObjectCode + + +def test_program_init_valid_code_type(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + assert program.backend == "nvrtc" + assert program.handle is not None + + +def test_program_init_invalid_code_type(): + code = 'extern "C" __global__ void my_kernel() {}' + with pytest.raises(NotImplementedError): + Program(code, "python") + + +def test_program_init_invalid_code_format(): + code = 12345 + with pytest.raises(TypeError): + Program(code, "c++") + + +def test_program_compile_valid_target_type(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + object_code = program.compile("ptx") + kernel = object_code.get_kernel("my_kernel") + assert isinstance(object_code, ObjectCode) + assert isinstance(kernel, Kernel) + + +def test_program_compile_invalid_target_type(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + with pytest.raises(NotImplementedError): + program.compile("invalid_target") + + +def test_program_backend_property(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + assert program.backend == "nvrtc" + + +def test_program_handle_property(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + assert program.handle is not None + + +def test_program_close(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + program.close() + assert program.handle is None diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index 03cdd852..9c661192 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -1,115 +1,115 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
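# A condensed sketch of the compile-and-lookup flow the test_program.py tests
# above exercise. It needs a working NVIDIA driver and NVRTC at runtime; every
# name comes from the tests themselves, nothing here is new API.
from cuda.core.experimental import Device, Program

def compile_and_get_kernel():
    Device().set_current()  # the tests do this via the init_cuda fixture
    code = 'extern "C" __global__ void my_kernel() {}'
    program = Program(code, "c++")  # backend resolves to "nvrtc"
    object_code = program.compile("ptx")  # an ObjectCode instance
    return object_code.get_kernel("my_kernel")  # a launchable Kernel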
- -import pytest - -from cuda.core.experimental import Device, Stream, StreamOptions -from cuda.core.experimental._event import Event -from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM, default_stream - - -def test_stream_init(): - with pytest.raises(NotImplementedError): - Stream() - - -def test_stream_init_with_options(init_cuda): - stream = Device().create_stream(options=StreamOptions(nonblocking=True, priority=0)) - assert stream.is_nonblocking is True - assert stream.priority == 0 - - -def test_stream_handle(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - assert isinstance(stream.handle, int) - - -def test_stream_is_nonblocking(init_cuda): - stream = Device().create_stream(options=StreamOptions(nonblocking=True)) - assert stream.is_nonblocking is True - - -def test_stream_priority(init_cuda): - stream = Device().create_stream(options=StreamOptions(priority=0)) - assert stream.priority == 0 - stream = Device().create_stream(options=StreamOptions(priority=-1)) - assert stream.priority == -1 - with pytest.raises(ValueError): - stream = Device().create_stream(options=StreamOptions(priority=1)) - - -def test_stream_sync(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - stream.sync() # Should not raise any exceptions - - -def test_stream_record(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - event = stream.record() - assert isinstance(event, Event) - - -def test_stream_record_invalid_event(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - with pytest.raises(TypeError): - stream.record(event="invalid_event") - - -def test_stream_wait_event(init_cuda): - s1 = Device().create_stream() - s2 = Device().create_stream() - e1 = s1.record() - s2.wait(e1) # Should not raise any exceptions - s2.sync() - - -def test_stream_wait_invalid_event(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - with pytest.raises(ValueError): - stream.wait(event_or_stream="invalid_event") - - -def test_stream_device(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - device = stream.device - assert isinstance(device, Device) - - -def test_stream_context(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - context = stream.context - assert context is not None - - -def test_stream_from_foreign_stream(init_cuda): - device = Device() - other_stream = device.create_stream(options=StreamOptions()) - stream = device.create_stream(obj=other_stream) - assert other_stream.handle == stream.handle - device = stream.device - assert isinstance(device, Device) - context = stream.context - assert context is not None - - -def test_stream_from_handle(): - stream = Stream.from_handle(0) - assert isinstance(stream, Stream) - - -def test_legacy_default_stream(): - assert isinstance(LEGACY_DEFAULT_STREAM, Stream) - - -def test_per_thread_default_stream(): - assert isinstance(PER_THREAD_DEFAULT_STREAM, Stream) - - -def test_default_stream(): - stream = default_stream() - assert isinstance(stream, Stream) +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. 
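# Sketch of the cross-stream ordering idiom exercised by test_stream_wait_event
# in this file: record an event on one stream, make a second stream wait on it,
# then synchronize. Requires a live CUDA device; the calls mirror the test.
from cuda.core.experimental import Device

def ordered_streams():
    device = Device()
    device.set_current()
    s1 = device.create_stream()
    s2 = device.create_stream()
    e1 = s1.record()  # snapshot s1's position as an Event
    s2.wait(e1)       # work queued on s2 after this waits for e1
    s2.sync()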
+ +import pytest + +from cuda.core.experimental import Device, Stream, StreamOptions +from cuda.core.experimental._event import Event +from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM, default_stream + + +def test_stream_init(): + with pytest.raises(NotImplementedError): + Stream() + + +def test_stream_init_with_options(init_cuda): + stream = Device().create_stream(options=StreamOptions(nonblocking=True, priority=0)) + assert stream.is_nonblocking is True + assert stream.priority == 0 + + +def test_stream_handle(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + assert isinstance(stream.handle, int) + + +def test_stream_is_nonblocking(init_cuda): + stream = Device().create_stream(options=StreamOptions(nonblocking=True)) + assert stream.is_nonblocking is True + + +def test_stream_priority(init_cuda): + stream = Device().create_stream(options=StreamOptions(priority=0)) + assert stream.priority == 0 + stream = Device().create_stream(options=StreamOptions(priority=-1)) + assert stream.priority == -1 + with pytest.raises(ValueError): + stream = Device().create_stream(options=StreamOptions(priority=1)) + + +def test_stream_sync(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + stream.sync() # Should not raise any exceptions + + +def test_stream_record(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + event = stream.record() + assert isinstance(event, Event) + + +def test_stream_record_invalid_event(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + with pytest.raises(TypeError): + stream.record(event="invalid_event") + + +def test_stream_wait_event(init_cuda): + s1 = Device().create_stream() + s2 = Device().create_stream() + e1 = s1.record() + s2.wait(e1) # Should not raise any exceptions + s2.sync() + + +def test_stream_wait_invalid_event(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + with pytest.raises(ValueError): + stream.wait(event_or_stream="invalid_event") + + +def test_stream_device(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + device = stream.device + assert isinstance(device, Device) + + +def test_stream_context(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + context = stream.context + assert context is not None + + +def test_stream_from_foreign_stream(init_cuda): + device = Device() + other_stream = device.create_stream(options=StreamOptions()) + stream = device.create_stream(obj=other_stream) + assert other_stream.handle == stream.handle + device = stream.device + assert isinstance(device, Device) + context = stream.context + assert context is not None + + +def test_stream_from_handle(): + stream = Stream.from_handle(0) + assert isinstance(stream, Stream) + + +def test_legacy_default_stream(): + assert isinstance(LEGACY_DEFAULT_STREAM, Stream) + + +def test_per_thread_default_stream(): + assert isinstance(PER_THREAD_DEFAULT_STREAM, Stream) + + +def test_default_stream(): + stream = default_stream() + assert isinstance(stream, Stream) From 702fbaa550f1b40f14fa35a656bcfc5817b96ff9 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 11:54:23 -0800 Subject: [PATCH 051/111] handle culink and nvjitlink differences in the backend and test --- cuda_core/cuda/core/experimental/_linker.py | 50 ++++++++------------ cuda_core/tests/test_linker.py | 51 +++++++++++++++------ 2 files changed, 55 insertions(+), 46 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py 
b/cuda_core/cuda/core/experimental/_linker.py
index 79328583..39d6cd27 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -29,7 +29,6 @@ def _lazy_init():
     _driver_ver = handle_return(cuda.cuDriverGetVersion())
     _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10)
     try:
-        raise ImportError
         from cuda.bindings import nvjitlink
         from cuda.bindings._internal import nvjitlink as inner_nvjitlink
     except ImportError:
@@ -247,7 +246,7 @@ def _init_nvjitlink(self):
             self.formatted_options.append(f"-split-compile={self.split_compile}")
         if self.split_compile_extended is not None:
             self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}")
-        if self.no_cache is not None:
+        if self.no_cache is True:
             self.formatted_options.append("-no-cache")
 
     def _init_driver(self):
@@ -272,57 +271,46 @@ def _init_driver(self):
             self.formatted_options.append(self.max_register_count)
             self.option_keys.append(_driver.CUjit_option.CU_JIT_MAX_REGISTERS)
         if self.time is not None:
-            self.formatted_options.append(1)  # ctypes.c_int32(1)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_WALL_TIME)
+            raise ValueError("time option is not supported by the driver API")
         if self.verbose is not None:
-            self.formatted_options.append(1)  # ctypes.c_int32(1)
+            self.formatted_options.append(1)
             self.option_keys.append(_driver.CUjit_option.CU_JIT_LOG_VERBOSE)
         if self.link_time_optimization is not None:
-            self.formatted_options.append(1)  # ctypes.c_int32(1)
+            self.formatted_options.append(1)
             self.option_keys.append(_driver.CUjit_option.CU_JIT_LTO)
         if self.ptx is not None:
-            self.formatted_options.append(1)  # ctypes.c_int32(1)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO)
+            raise ValueError("ptx option is not supported by the driver API")
         if self.optimization_level is not None:
             self.formatted_options.append(self.optimization_level)
             self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZATION_LEVEL)
         if self.debug is not None:
-            self.formatted_options.append(1)  # ctypes.c_int32(1)
+            self.formatted_options.append(1)
             self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_DEBUG_INFO)
         if self.lineinfo is not None:
-            self.formatted_options.append(1)  # ctypes.c_int32(1)
+            self.formatted_options.append(1)
             self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO)
         if self.ftz is not None:
-            self.formatted_options.append(1 if self.ftz else 0)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_FTZ)
+            raise ValueError("ftz option is deprecated in the driver API")
         if self.prec_div is not None:
-            self.formatted_options.append(1 if self.prec_div else 0)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_PREC_DIV)
+            raise ValueError("prec_div option is deprecated in the driver API")
         if self.prec_sqrt is not None:
-            self.formatted_options.append(1 if self.prec_sqrt else 0)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_PREC_SQRT)
+            raise ValueError("prec_sqrt option is deprecated in the driver API")
         if self.fma is not None:
-            self.formatted_options.append(1 if self.fma else 0)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_FMA)
+            raise ValueError("fma option is deprecated in the driver API")
         if self.kernels_used is not None:
-            for kernel in self.kernels_used:
-                self.formatted_options.append(kernel.encode())
-                self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_KERNEL_NAMES)
+            raise ValueError("kernels_used is deprecated in the driver API")
         if self.variables_used is not None:
-            for
variable in self.variables_used: - self.formatted_options.append(variable.encode()) - self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_VARIABLE_NAMES) + raise ValueError("variables_used is deprecated in the driver API") if self.optimize_unused_variables is not None: - self.formatted_options.append(1) # ctypes.c_int32(1) - self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES) + raise ValueError("optimize_unused_variables is deprecated in the driver API") if self.xptxas is not None: - for opt in self.xptxas: - raise NotImplementedError("TODO: implement xptxas option") + raise ValueError("xptxas option is not supported by the driver API") + if self.split_compile is not None: + raise ValueError("split_compile option is not supported by the driver API") if self.split_compile_extended is not None: - self.formatted_options.append(self.split_compile_extended) - self.option_keys.append(_driver.CUjit_option.CU_JIT_MIN_CTA_PER_SM) + raise ValueError("split_compile_extended option is not supported by the driver API") if self.no_cache is not None: - self.formatted_options.append(1) # ctypes.c_int32(1) + self.formatted_options.append(_driver.CUjit_cacheMode.CU_JIT_CACHE_OPTION_NONE) self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 3937c878..db9ff657 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -8,6 +8,17 @@ basic_kernel = "__device__ int B() { return 0; }" addition_kernel = "__device__ int C(int a, int b) { return a + b; }" +try: + from cuda.bindings import nvjitlink # noqa F401 + from cuda.bindings._internal import nvjitlink as inner_nvjitlink +except ImportError: + # binding is not available + culink_backend = True +else: + if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: + # binding is available, but nvJitLink is not installed + culink_backend = True + @pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): @@ -27,27 +38,36 @@ def compile_ltoir_functions(init_cuda): return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir +culink_options = [ + LinkerOptions(arch=ARCH), + LinkerOptions(arch=ARCH, max_register_count=32), + LinkerOptions(arch=ARCH, verbose=True), + LinkerOptions(arch=ARCH, optimization_level=3), + LinkerOptions(arch=ARCH, debug=True), + LinkerOptions(arch=ARCH, lineinfo=True), + LinkerOptions(arch=ARCH, no_cache=True), +] + + @pytest.mark.parametrize( "options", - [ - LinkerOptions(arch=ARCH), - LinkerOptions(arch=ARCH, max_register_count=32), + culink_options + if culink_backend + else culink_options + + [ LinkerOptions(arch=ARCH, time=True), - LinkerOptions(arch=ARCH, verbose=True), - LinkerOptions(arch=ARCH, optimization_level=3), - LinkerOptions(arch=ARCH, debug=True), - LinkerOptions(arch=ARCH, lineinfo=True), LinkerOptions(arch=ARCH, ftz=True), LinkerOptions(arch=ARCH, prec_div=True), LinkerOptions(arch=ARCH, prec_sqrt=True), LinkerOptions(arch=ARCH, fma=True), LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), + LinkerOptions(arch=ARCH, kernels_used=["kernel1", "kernel2"]), LinkerOptions(arch=ARCH, variables_used=["var1"]), + LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]), LinkerOptions(arch=ARCH, optimize_unused_variables=True), - # LinkerOptions(arch=ARCH, xptxas=["-v"]), - # LinkerOptions(arch=ARCH, split_compile=0), + LinkerOptions(arch=ARCH, xptxas=["-v"]), + LinkerOptions(arch=ARCH, split_compile=0), LinkerOptions(arch=ARCH, 
split_compile_extended=1),
-        # LinkerOptions(arch=ARCH, no_cache=True),
     ],
 )
 def test_linker_init(compile_ptx_functions, options):
@@ -62,11 +82,12 @@ def test_linker_init_invalid_arch():
         Linker(options)
 
 
-# def test_linker_link_ptx(compile_ltoir_functions):
-#     options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True)
-#     linker = Linker(*compile_ltoir_functions, options=options)
-#     linked_code = linker.link("ptx")
-#     assert isinstance(linked_code, ObjectCode)
+@pytest.mark.skipif(culink_backend, reason="culink does not support ptx option")
+def test_linker_link_ptx(compile_ltoir_functions):
+    options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True)
+    linker = Linker(*compile_ltoir_functions, options=options)
+    linked_code = linker.link("ptx")
+    assert isinstance(linked_code, ObjectCode)
 
 
 def test_linker_link_cubin(compile_ptx_functions):
From 996ab39a58d1e9495e9ba946527164879fc648f8 Mon Sep 17 00:00:00 2001
From: ksimpson
Date: Wed, 4 Dec 2024 13:45:05 -0800
Subject: [PATCH 052/111] update line endings

---
 cuda_core/cuda/core/experimental/_linker.py | 888 ++++++++++----------
 cuda_core/tests/test_linker.py              | 240 +++---
 2 files changed, 564 insertions(+), 564 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
index 39d6cd27..7d95d371 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -1,444 +1,444 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
-
-import ctypes
-import weakref
-from dataclasses import dataclass
-from typing import List, Optional
-
-from cuda import cuda
-from cuda.core.experimental._module import ObjectCode
-from cuda.core.experimental._utils import check_or_create_options, handle_return
-
-# TODO: revisit this treatment for py313t builds
-_driver = None  # populated if nvJitLink cannot be used
-_driver_input_types = None  # populated if nvJitLink cannot be used
-_driver_ver = None
-_inited = False
-_nvjitlink = None  # populated if nvJitLink can be used
-_nvjitlink_input_types = None  # populated if nvJitLink can be used
-
-
-def _lazy_init():
-    global _inited
-    if _inited:
-        return
-
-    global _driver, _driver_input_types, _driver_ver, _nvjitlink, _nvjitlink_input_types
-    _driver_ver = handle_return(cuda.cuDriverGetVersion())
-    _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10)
-    try:
-        from cuda.bindings import nvjitlink
-        from cuda.bindings._internal import nvjitlink as inner_nvjitlink
-    except ImportError:
-        # binding is not available
-        nvjitlink = None
-    else:
-        if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0:
-            # binding is available, but nvJitLink is not installed
-            nvjitlink = None
-        elif _driver_ver > nvjitlink.version():
-            # TODO: nvJitLink is not new enough, warn?
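# Worked example (illustrative values only) of the version handling above:
# cuDriverGetVersion() returns 1000 * major + 10 * minor, so a CUDA 12.4
# driver comes back as 12040, and the tuple form makes comparisons against
# nvjitlink.version() lexicographic.
encoded = 12040  # assumed encoded value for a CUDA 12.4 driver
major, minor = encoded // 1000, (encoded % 1000) // 10
assert (major, minor) == (12, 4)
assert (12, 4) > (12, 3)  # plain tuple comparison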
- pass - if nvjitlink: - _nvjitlink = nvjitlink - _nvjitlink_input_types = { - "ptx": _nvjitlink.InputType.PTX, - "cubin": _nvjitlink.InputType.CUBIN, - "fatbin": _nvjitlink.InputType.FATBIN, - "ltoir": _nvjitlink.InputType.LTOIR, - "object": _nvjitlink.InputType.OBJECT, - } - else: - from cuda import cuda as _driver - - _driver_input_types = { - "ptx": _driver.CUjitInputType.CU_JIT_INPUT_PTX, - "cubin": _driver.CUjitInputType.CU_JIT_INPUT_CUBIN, - "fatbin": _driver.CUjitInputType.CU_JIT_INPUT_FATBINARY, - "object": _driver.CUjitInputType.CU_JIT_INPUT_OBJECT, - } - _inited = True - - -@dataclass -class LinkerOptions: - """Customizable :obj:`LinkerOptions` for nvJitLink. - - Attributes - ---------- - arch : str - Pass SM architecture value. Can use compute_ value instead if only generating PTX. - This is a required option. - Acceptable value type: str - Maps to: -arch=sm_ - max_register_count : int, optional - Maximum register count. - Default: None - Acceptable value type: int - Maps to: -maxrregcount= - time : bool, optional - Print timing information to InfoLog. - Default: False - Acceptable value type: bool - Maps to: -time - verbose : bool, optional - Print verbose messages to InfoLog. - Default: False - Acceptable value type: bool - Maps to: -verbose - link_time_optimization : bool, optional - Perform link time optimization. - Default: False - Acceptable value type: bool - Maps to: -lto - ptx : bool, optional - Emit PTX after linking instead of CUBIN; only supported with -lto. - Default: False - Acceptable value type: bool - Maps to: -ptx - optimization_level : int, optional - Set optimization level. Only 0 and 3 are accepted. - Default: None - Acceptable value type: int - Maps to: -O - debug : bool, optional - Generate debug information. - Default: False - Acceptable value type: bool - Maps to: -g - lineinfo : bool, optional - Generate line information. - Default: False - Acceptable value type: bool - Maps to: -lineinfo - ftz : bool, optional - Flush denormal values to zero. - Default: False - Acceptable value type: bool - Maps to: -ftz= - prec_div : bool, optional - Use precise division. - Default: True - Acceptable value type: bool - Maps to: -prec-div= - prec_sqrt : bool, optional - Use precise square root. - Default: True - Acceptable value type: bool - Maps to: -prec-sqrt= - fma : bool, optional - Use fast multiply-add. - Default: True - Acceptable value type: bool - Maps to: -fma= - kernels_used : List[str], optional - Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple - times. - Default: None - Acceptable value type: list of str - Maps to: -kernels-used= - variables_used : List[str], optional - Pass list of variables that are used; any not in the list can be removed. This option can be specified multiple - times. - Default: None - Acceptable value type: list of str - Maps to: -variables-used= - optimize_unused_variables : bool, optional - Assume that if a variable is not referenced in device code, it can be removed. - Default: False - Acceptable value type: bool - Maps to: -optimize-unused-variables - xptxas : List[str], optional - Pass options to PTXAS. This option can be called multiple times. - Default: None - Acceptable value type: list of str - Maps to: -Xptxas= - split_compile : int, optional - Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split - compilation (default). 
- Default: 1 - Acceptable value type: int - Maps to: -split-compile= - split_compile_extended : int, optional - A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. - Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This - option can potentially impact performance of the compiled binary. - Default: 1 - Acceptable value type: int - Maps to: -split-compile-extended= - no_cache : bool, optional - Do not cache the intermediate steps of nvJitLink. - Default: False - Acceptable value type: bool - Maps to: -no-cache - """ - - arch: str - max_register_count: Optional[int] = None - time: Optional[bool] = None - verbose: Optional[bool] = None - link_time_optimization: Optional[bool] = None - ptx: Optional[bool] = None - optimization_level: Optional[int] = None - debug: Optional[bool] = None - lineinfo: Optional[bool] = None - ftz: Optional[bool] = None - prec_div: Optional[bool] = None - prec_sqrt: Optional[bool] = None - fma: Optional[bool] = None - kernels_used: Optional[List[str]] = None - variables_used: Optional[List[str]] = None - optimize_unused_variables: Optional[bool] = None - xptxas: Optional[List[str]] = None - split_compile: Optional[int] = None - split_compile_extended: Optional[int] = None - no_cache: Optional[bool] = None - - def __post_init__(self): - _lazy_init() - self.formatted_options = [] - if _nvjitlink: - self._init_nvjitlink() - else: - self._init_driver() - - def _init_nvjitlink(self): - if self.arch is not None: - self.formatted_options.append(f"-arch={self.arch}") - if self.max_register_count is not None: - self.formatted_options.append(f"-maxrregcount={self.max_register_count}") - if self.time is not None: - self.formatted_options.append("-time") - if self.verbose is not None: - self.formatted_options.append("-verbose") - if self.link_time_optimization is not None: - self.formatted_options.append("-lto") - if self.ptx is not None: - self.formatted_options.append("-ptx") - if self.optimization_level is not None: - self.formatted_options.append(f"-O{self.optimization_level}") - if self.debug is not None: - self.formatted_options.append("-g") - if self.lineinfo is not None: - self.formatted_options.append("-lineinfo") - if self.ftz is not None: - self.formatted_options.append(f"-ftz={'true' if self.ftz else 'false'}") - if self.prec_div is not None: - self.formatted_options.append(f"-prec-div={'true' if self.prec_div else 'false'}") - if self.prec_sqrt is not None: - self.formatted_options.append(f"-prec-sqrt={'true' if self.prec_sqrt else 'false'}") - if self.fma is not None: - self.formatted_options.append(f"-fma={'true' if self.fma else 'false'}") - if self.kernels_used is not None: - for kernel in self.kernels_used: - self.formatted_options.append(f"-kernels-used={kernel}") - if self.variables_used is not None: - for variable in self.variables_used: - self.formatted_options.append(f"-variables-used={variable}") - if self.optimize_unused_variables is not None: - self.formatted_options.append("-optimize-unused-variables") - if self.xptxas is not None: - for opt in self.xptxas: - self.formatted_options.append(f"-Xptxas={opt}") - if self.split_compile is not None: - self.formatted_options.append(f"-split-compile={self.split_compile}") - if self.split_compile_extended is not None: - self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") - if self.no_cache is True: - self.formatted_options.append("-no-cache") - - def 
_init_driver(self):
-        self.option_keys = []
-        # allocate 4 MiB each for info/error logs
-        size = 4194304
-        self.formatted_options.extend((bytearray(size), size, bytearray(size), size))
-        self.option_keys.extend(
-            (
-                _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER,
-                _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
-                _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER,
-                _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
-            )
-        )
-
-        if self.arch is not None:
-            arch = self.arch.split("_")[-1].upper()
-            self.formatted_options.append(getattr(_driver.CUjit_target, f"CU_TARGET_COMPUTE_{arch}"))
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_TARGET)
-        if self.max_register_count is not None:
-            self.formatted_options.append(self.max_register_count)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_MAX_REGISTERS)
-        if self.time is not None:
-            raise ValueError("time option is not supported by the driver API")
-        if self.verbose is not None:
-            self.formatted_options.append(1)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_LOG_VERBOSE)
-        if self.link_time_optimization is not None:
-            self.formatted_options.append(1)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_LTO)
-        if self.ptx is not None:
-            raise ValueError("ptx option is not supported by the driver API")
-        if self.optimization_level is not None:
-            self.formatted_options.append(self.optimization_level)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZATION_LEVEL)
-        if self.debug is not None:
-            self.formatted_options.append(1)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_DEBUG_INFO)
-        if self.lineinfo is not None:
-            self.formatted_options.append(1)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO)
-        if self.ftz is not None:
-            raise ValueError("ftz option is deprecated in the driver API")
-        if self.prec_div is not None:
-            raise ValueError("prec_div option is deprecated in the driver API")
-        if self.prec_sqrt is not None:
-            raise ValueError("prec_sqrt option is deprecated in the driver API")
-        if self.fma is not None:
-            raise ValueError("fma option is deprecated in the driver API")
-        if self.kernels_used is not None:
-            raise ValueError("kernels_used is deprecated in the driver API")
-        if self.variables_used is not None:
-            raise ValueError("variables_used is deprecated in the driver API")
-        if self.optimize_unused_variables is not None:
-            raise ValueError("optimize_unused_variables is deprecated in the driver API")
-        if self.xptxas is not None:
-            raise ValueError("xptxas option is not supported by the driver API")
-        if self.split_compile is not None:
-            raise ValueError("split_compile option is not supported by the driver API")
-        if self.split_compile_extended is not None:
-            raise ValueError("split_compile_extended option is not supported by the driver API")
-        if self.no_cache is not None:
-            self.formatted_options.append(_driver.CUjit_cacheMode.CU_JIT_CACHE_OPTION_NONE)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE)
-
-
-class Linker:
-    """
-    Linker class for managing the linking of object codes with specified options.
-
-    Parameters
-    ----------
-    object_codes : ObjectCode
-        One or more ObjectCode objects to be linked.
-    options : LinkerOptions, optional
-        Options for the linker. If not provided, default options will be used.
- """ - - class _MembersNeededForFinalize: - __slots__ = ("handle", "use_nvjitlink") - - def __init__(self, program_obj, handle, use_nvjitlink): - self.handle = handle - self.use_nvjitlink = use_nvjitlink - weakref.finalize(program_obj, self.close) - - def close(self): - if self.handle is not None: - if self.use_nvjitlink: - _nvjitlink.destroy(self.handle) - else: - handle_return(_driver.cuLinkDestroy(self.handle)) - self.handle = None - - __slots__ = ("__weakref__", "_mnff", "_options") - - def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): - if len(object_codes) == 0: - raise ValueError("At least one ObjectCode object must be provided") - - self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") - if _nvjitlink: - handle = _nvjitlink.create(len(options.formatted_options), options.formatted_options) - use_nvjitlink = True - else: - handle = handle_return( - _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options) - ) - use_nvjitlink = False - self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink) - - for code in object_codes: - assert isinstance(code, ObjectCode) - self._add_code_object(code) - - def _add_code_object(self, object_code: ObjectCode): - data = object_code._module - assert isinstance(data, bytes) - if _nvjitlink: - _nvjitlink.add_data( - self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), - data, - len(data), - f"{object_code._handle}_{object_code._code_type}", - ) - else: - handle_return( - _driver.cuLinkAddData( - self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), - data, - len(data), - f"{object_code._handle}_{object_code._code_type}".encode(), - 0, - None, - None, - ) - ) - - def link(self, target_type) -> ObjectCode: - if target_type not in ("cubin", "ptx"): - raise ValueError(f"Unsupported target type: {target_type}") - if _nvjitlink: - _nvjitlink.complete(self._mnff.handle) - if target_type == "cubin": - get_size = _nvjitlink.get_linked_cubin_size - get_code = _nvjitlink.get_linked_cubin - else: - get_size = _nvjitlink.get_linked_ptx_size - get_code = _nvjitlink.get_linked_ptx - - size = get_size(self._mnff.handle) - code = bytearray(size) - get_code(self._mnff.handle, code) - else: - addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) - code = (ctypes.c_char * size).from_address(addr) - - return ObjectCode(bytes(code), target_type) - - def get_error_log(self) -> str: - if _nvjitlink: - log_size = _nvjitlink.get_error_log_size(self._mnff.handle) - log = bytearray(log_size) - _nvjitlink.get_error_log(self._mnff.handle, log) - else: - log = self._options.formatted_options[2] - return log.decode() - - def get_info_log(self) -> str: - if _nvjitlink: - log_size = _nvjitlink.get_info_log_size(self._mnff.handle) - log = bytearray(log_size) - _nvjitlink.get_info_log(self._mnff.handle, log) - else: - log = self._options.formatted_options[0] - return log.decode() - - def _input_type_from_code_type(self, code_type: str): - # this list is based on the supported values for code_type in the ObjectCode class definition. 
-        # nvJitLink/driver support other options for input type
-        input_type = _nvjitlink_input_types.get(code_type) if _nvjitlink else _driver_input_types.get(code_type)
-
-        if input_type is None:
-            raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}")
-        return input_type
-
-    @property
-    def handle(self) -> int:
-        return self._mnff.handle
-
-    def close(self):
-        self._mnff.close()
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import ctypes
+import weakref
+from dataclasses import dataclass
+from typing import List, Optional
+
+from cuda import cuda
+from cuda.core.experimental._module import ObjectCode
+from cuda.core.experimental._utils import check_or_create_options, handle_return
+
+# TODO: revisit this treatment for py313t builds
+_driver = None  # populated if nvJitLink cannot be used
+_driver_input_types = None  # populated if nvJitLink cannot be used
+_driver_ver = None
+_inited = False
+_nvjitlink = None  # populated if nvJitLink can be used
+_nvjitlink_input_types = None  # populated if nvJitLink can be used
+
+
+def _lazy_init():
+    global _inited
+    if _inited:
+        return
+
+    global _driver, _driver_input_types, _driver_ver, _nvjitlink, _nvjitlink_input_types
+    _driver_ver = handle_return(cuda.cuDriverGetVersion())
+    _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10)
+    try:
+        from cuda.bindings import nvjitlink
+        from cuda.bindings._internal import nvjitlink as inner_nvjitlink
+    except ImportError:
+        # binding is not available
+        nvjitlink = None
+    else:
+        if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0:
+            # binding is available, but nvJitLink is not installed
+            nvjitlink = None
+        elif _driver_ver > nvjitlink.version():
+            # TODO: nvJitLink is not new enough, warn?
+            pass
+    if nvjitlink:
+        _nvjitlink = nvjitlink
+        _nvjitlink_input_types = {
+            "ptx": _nvjitlink.InputType.PTX,
+            "cubin": _nvjitlink.InputType.CUBIN,
+            "fatbin": _nvjitlink.InputType.FATBIN,
+            "ltoir": _nvjitlink.InputType.LTOIR,
+            "object": _nvjitlink.InputType.OBJECT,
+        }
+    else:
+        from cuda import cuda as _driver
+
+        _driver_input_types = {
+            "ptx": _driver.CUjitInputType.CU_JIT_INPUT_PTX,
+            "cubin": _driver.CUjitInputType.CU_JIT_INPUT_CUBIN,
+            "fatbin": _driver.CUjitInputType.CU_JIT_INPUT_FATBINARY,
+            "object": _driver.CUjitInputType.CU_JIT_INPUT_OBJECT,
+        }
+    _inited = True
+
+
+@dataclass
+class LinkerOptions:
+    """Customizable :obj:`LinkerOptions` for nvJitLink.
+
+    Attributes
+    ----------
+    arch : str
+        Pass SM architecture value. Can use compute_ value instead if only generating PTX.
+        This is a required option.
+        Acceptable value type: str
+        Maps to: -arch=sm_
+    max_register_count : int, optional
+        Maximum register count.
+        Default: None
+        Acceptable value type: int
+        Maps to: -maxrregcount=
+    time : bool, optional
+        Print timing information to InfoLog.
+        Default: False
+        Acceptable value type: bool
+        Maps to: -time
+    verbose : bool, optional
+        Print verbose messages to InfoLog.
+        Default: False
+        Acceptable value type: bool
+        Maps to: -verbose
+    link_time_optimization : bool, optional
+        Perform link time optimization.
+        Default: False
+        Acceptable value type: bool
+        Maps to: -lto
+    ptx : bool, optional
+        Emit PTX after linking instead of CUBIN; only supported with -lto.
+        Default: False
+        Acceptable value type: bool
+        Maps to: -ptx
+    optimization_level : int, optional
+        Set optimization level. Only 0 and 3 are accepted.
+ Default: None + Acceptable value type: int + Maps to: -O + debug : bool, optional + Generate debug information. + Default: False + Acceptable value type: bool + Maps to: -g + lineinfo : bool, optional + Generate line information. + Default: False + Acceptable value type: bool + Maps to: -lineinfo + ftz : bool, optional + Flush denormal values to zero. + Default: False + Acceptable value type: bool + Maps to: -ftz= + prec_div : bool, optional + Use precise division. + Default: True + Acceptable value type: bool + Maps to: -prec-div= + prec_sqrt : bool, optional + Use precise square root. + Default: True + Acceptable value type: bool + Maps to: -prec-sqrt= + fma : bool, optional + Use fast multiply-add. + Default: True + Acceptable value type: bool + Maps to: -fma= + kernels_used : List[str], optional + Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple + times. + Default: None + Acceptable value type: list of str + Maps to: -kernels-used= + variables_used : List[str], optional + Pass list of variables that are used; any not in the list can be removed. This option can be specified multiple + times. + Default: None + Acceptable value type: list of str + Maps to: -variables-used= + optimize_unused_variables : bool, optional + Assume that if a variable is not referenced in device code, it can be removed. + Default: False + Acceptable value type: bool + Maps to: -optimize-unused-variables + xptxas : List[str], optional + Pass options to PTXAS. This option can be called multiple times. + Default: None + Acceptable value type: list of str + Maps to: -Xptxas= + split_compile : int, optional + Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split + compilation (default). + Default: 1 + Acceptable value type: int + Maps to: -split-compile= + split_compile_extended : int, optional + A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. + Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This + option can potentially impact performance of the compiled binary. + Default: 1 + Acceptable value type: int + Maps to: -split-compile-extended= + no_cache : bool, optional + Do not cache the intermediate steps of nvJitLink. 
+        Default: False
+        Acceptable value type: bool
+        Maps to: -no-cache
+    """
+
+    arch: str
+    max_register_count: Optional[int] = None
+    time: Optional[bool] = None
+    verbose: Optional[bool] = None
+    link_time_optimization: Optional[bool] = None
+    ptx: Optional[bool] = None
+    optimization_level: Optional[int] = None
+    debug: Optional[bool] = None
+    lineinfo: Optional[bool] = None
+    ftz: Optional[bool] = None
+    prec_div: Optional[bool] = None
+    prec_sqrt: Optional[bool] = None
+    fma: Optional[bool] = None
+    kernels_used: Optional[List[str]] = None
+    variables_used: Optional[List[str]] = None
+    optimize_unused_variables: Optional[bool] = None
+    xptxas: Optional[List[str]] = None
+    split_compile: Optional[int] = None
+    split_compile_extended: Optional[int] = None
+    no_cache: Optional[bool] = None
+
+    def __post_init__(self):
+        _lazy_init()
+        self.formatted_options = []
+        if _nvjitlink:
+            self._init_nvjitlink()
+        else:
+            self._init_driver()
+
+    def _init_nvjitlink(self):
+        if self.arch is not None:
+            self.formatted_options.append(f"-arch={self.arch}")
+        if self.max_register_count is not None:
+            self.formatted_options.append(f"-maxrregcount={self.max_register_count}")
+        if self.time is not None:
+            self.formatted_options.append("-time")
+        if self.verbose is not None:
+            self.formatted_options.append("-verbose")
+        if self.link_time_optimization is not None:
+            self.formatted_options.append("-lto")
+        if self.ptx is not None:
+            self.formatted_options.append("-ptx")
+        if self.optimization_level is not None:
+            self.formatted_options.append(f"-O{self.optimization_level}")
+        if self.debug is not None:
+            self.formatted_options.append("-g")
+        if self.lineinfo is not None:
+            self.formatted_options.append("-lineinfo")
+        if self.ftz is not None:
+            self.formatted_options.append(f"-ftz={'true' if self.ftz else 'false'}")
+        if self.prec_div is not None:
+            self.formatted_options.append(f"-prec-div={'true' if self.prec_div else 'false'}")
+        if self.prec_sqrt is not None:
+            self.formatted_options.append(f"-prec-sqrt={'true' if self.prec_sqrt else 'false'}")
+        if self.fma is not None:
+            self.formatted_options.append(f"-fma={'true' if self.fma else 'false'}")
+        if self.kernels_used is not None:
+            for kernel in self.kernels_used:
+                self.formatted_options.append(f"-kernels-used={kernel}")
+        if self.variables_used is not None:
+            for variable in self.variables_used:
+                self.formatted_options.append(f"-variables-used={variable}")
+        if self.optimize_unused_variables is not None:
+            self.formatted_options.append("-optimize-unused-variables")
+        if self.xptxas is not None:
+            for opt in self.xptxas:
+                self.formatted_options.append(f"-Xptxas={opt}")
+        if self.split_compile is not None:
+            self.formatted_options.append(f"-split-compile={self.split_compile}")
+        if self.split_compile_extended is not None:
+            self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}")
+        if self.no_cache is True:
+            self.formatted_options.append("-no-cache")
+
+    def _init_driver(self):
+        self.option_keys = []
+        # allocate 4 MiB each for info/error logs
+        size = 4194304
+        self.formatted_options.extend((bytearray(size), size, bytearray(size), size))
+        self.option_keys.extend(
+            (
+                _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER,
+                _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
+                _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER,
+                _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+            )
+        )
+
+        if self.arch is not None:
+            arch = self.arch.split("_")[-1].upper()
+            self.formatted_options.append(getattr(_driver.CUjit_target,
f"CU_TARGET_COMPUTE_{arch}")) + self.option_keys.append(_driver.CUjit_option.CU_JIT_TARGET) + if self.max_register_count is not None: + self.formatted_options.append(self.max_register_count) + self.option_keys.append(_driver.CUjit_option.CU_JIT_MAX_REGISTERS) + if self.time is not None: + raise ValueError("time option is not supported by the driver API") + if self.verbose is not None: + self.formatted_options.append(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_LOG_VERBOSE) + if self.link_time_optimization is not None: + self.formatted_options.append(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_LTO) + if self.ptx is not None: + raise ValueError("ptx option is not supported by the driver API") + if self.optimization_level is not None: + self.formatted_options.append(self.optimization_level) + self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZATION_LEVEL) + if self.debug is not None: + self.formatted_options.append(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_DEBUG_INFO) + if self.lineinfo is not None: + self.formatted_options.append(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO) + if self.ftz is not None: + raise ValueError("ftz option is deprecated in the driver API") + if self.prec_div is not None: + raise ValueError("prec_div option is deprecated in the driver API") + if self.prec_sqrt is not None: + raise ValueError("prec_sqrt option is deprecated in the driver API") + if self.fma is not None: + raise ValueError("fma options is deprecated in the driver API") + if self.kernels_used is not None: + raise ValueError("kernels_used is deprecated in the driver API") + if self.variables_used is not None: + raise ValueError("variables_used is deprecated in the driver API") + if self.optimize_unused_variables is not None: + raise ValueError("optimize_unused_variables is deprecated in the driver API") + if self.xptxas is not None: + raise ValueError("xptxas option is not supported by the driver API") + if self.split_compile is not None: + raise ValueError("split_compile option is not supported by the driver API") + if self.split_compile_extended is not None: + raise ValueError("split_compile_extended option is not supported by the driver API") + if self.no_cache is not None: + self.formatted_options.append(_driver.CUjit_cacheMode.CU_JIT_CACHE_OPTION_NONE) + self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) + + +class Linker: + """ + Linker class for managing the linking of object codes with specified options. + + Parameters + ---------- + object_codes : ObjectCode + One or more ObjectCode objects to be linked. + options : LinkerOptions, optional + Options for the linker. If not provided, default options will be used. 
+ """ + + class _MembersNeededForFinalize: + __slots__ = ("handle", "use_nvjitlink") + + def __init__(self, program_obj, handle, use_nvjitlink): + self.handle = handle + self.use_nvjitlink = use_nvjitlink + weakref.finalize(program_obj, self.close) + + def close(self): + if self.handle is not None: + if self.use_nvjitlink: + _nvjitlink.destroy(self.handle) + else: + handle_return(_driver.cuLinkDestroy(self.handle)) + self.handle = None + + __slots__ = ("__weakref__", "_mnff", "_options") + + def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): + if len(object_codes) == 0: + raise ValueError("At least one ObjectCode object must be provided") + + self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") + if _nvjitlink: + handle = _nvjitlink.create(len(options.formatted_options), options.formatted_options) + use_nvjitlink = True + else: + handle = handle_return( + _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options) + ) + use_nvjitlink = False + self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink) + + for code in object_codes: + assert isinstance(code, ObjectCode) + self._add_code_object(code) + + def _add_code_object(self, object_code: ObjectCode): + data = object_code._module + assert isinstance(data, bytes) + if _nvjitlink: + _nvjitlink.add_data( + self._mnff.handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}", + ) + else: + handle_return( + _driver.cuLinkAddData( + self._mnff.handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}".encode(), + 0, + None, + None, + ) + ) + + def link(self, target_type) -> ObjectCode: + if target_type not in ("cubin", "ptx"): + raise ValueError(f"Unsupported target type: {target_type}") + if _nvjitlink: + _nvjitlink.complete(self._mnff.handle) + if target_type == "cubin": + get_size = _nvjitlink.get_linked_cubin_size + get_code = _nvjitlink.get_linked_cubin + else: + get_size = _nvjitlink.get_linked_ptx_size + get_code = _nvjitlink.get_linked_ptx + + size = get_size(self._mnff.handle) + code = bytearray(size) + get_code(self._mnff.handle, code) + else: + addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) + code = (ctypes.c_char * size).from_address(addr) + + return ObjectCode(bytes(code), target_type) + + def get_error_log(self) -> str: + if _nvjitlink: + log_size = _nvjitlink.get_error_log_size(self._mnff.handle) + log = bytearray(log_size) + _nvjitlink.get_error_log(self._mnff.handle, log) + else: + log = self._options.formatted_options[2] + return log.decode() + + def get_info_log(self) -> str: + if _nvjitlink: + log_size = _nvjitlink.get_info_log_size(self._mnff.handle) + log = bytearray(log_size) + _nvjitlink.get_info_log(self._mnff.handle, log) + else: + log = self._options.formatted_options[0] + return log.decode() + + def _input_type_from_code_type(self, code_type: str): + # this list is based on the supported values for code_type in the ObjectCode class definition. 
+ # nvJitLink/driver support other options for input type + input_type = _nvjitlink_input_types.get(code_type) if _nvjitlink else _driver_input_types.get(code_type) + + if input_type is None: + raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") + return input_type + + @property + def handle(self) -> int: + return self._mnff.handle + + def close(self): + self._mnff.close() diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index db9ff657..15496b59 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -1,120 +1,120 @@ -import pytest - -from cuda.core.experimental import Linker, LinkerOptions, Program -from cuda.core.experimental._module import ObjectCode - -ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper -empty_kernel = "__device__ void A() {}" -basic_kernel = "__device__ int B() { return 0; }" -addition_kernel = "__device__ int C(int a, int b) { return a + b; }" - -try: - from cuda.bindings import nvjitlink # noqa F401 - from cuda.bindings._internal import nvjitlink as inner_nvjitlink -except ImportError: - # binding is not available - culink_backend = True -else: - if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: - # binding is available, but nvJitLink is not installed - culink_backend = True - - -@pytest.fixture(scope="function") -def compile_ptx_functions(init_cuda): - object_code_a_ptx = Program(empty_kernel, "c++").compile("ptx") - object_code_b_ptx = Program(basic_kernel, "c++").compile("ptx") - object_code_c_ptx = Program(addition_kernel, "c++").compile("ptx") - - return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx - - -@pytest.fixture(scope="function") -def compile_ltoir_functions(init_cuda): - object_code_a_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) - object_code_b_ltoir = Program(basic_kernel, "c++").compile("ltoir", options=("-dlto",)) - object_code_c_ltoir = Program(addition_kernel, "c++").compile("ltoir", options=("-dlto",)) - - return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir - - -culink_options = [ - LinkerOptions(arch=ARCH), - LinkerOptions(arch=ARCH, max_register_count=32), - LinkerOptions(arch=ARCH, verbose=True), - LinkerOptions(arch=ARCH, optimization_level=3), - LinkerOptions(arch=ARCH, debug=True), - LinkerOptions(arch=ARCH, lineinfo=True), - LinkerOptions(arch=ARCH, no_cache=True), -] - - -@pytest.mark.parametrize( - "options", - culink_options - if culink_backend - else culink_options - + [ - LinkerOptions(arch=ARCH, time=True), - LinkerOptions(arch=ARCH, ftz=True), - LinkerOptions(arch=ARCH, prec_div=True), - LinkerOptions(arch=ARCH, prec_sqrt=True), - LinkerOptions(arch=ARCH, fma=True), - LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), - LinkerOptions(arch=ARCH, kernels_used=["kernel1", "kernel2"]), - LinkerOptions(arch=ARCH, variables_used=["var1"]), - LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]), - LinkerOptions(arch=ARCH, optimize_unused_variables=True), - LinkerOptions(arch=ARCH, xptxas=["-v"]), - LinkerOptions(arch=ARCH, split_compile=0), - LinkerOptions(arch=ARCH, split_compile_extended=1), - ], -) -def test_linker_init(compile_ptx_functions, options): - linker = Linker(*compile_ptx_functions, options=options) - object_code = linker.link("cubin") - assert isinstance(object_code, ObjectCode) - - -def test_linker_init_invalid_arch(): - options = LinkerOptions(arch=None) - with pytest.raises(TypeError): - Linker(options) - - -@pytest.mark.skipif(culink_backend, 
reason="culink does not support ptx option") -def test_linker_link_ptx(compile_ltoir_functions): - options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) - linker = Linker(*compile_ltoir_functions, options=options) - linked_code = linker.link("ptx") - assert isinstance(linked_code, ObjectCode) - - -def test_linker_link_cubin(compile_ptx_functions): - options = LinkerOptions(arch=ARCH) - linker = Linker(*compile_ptx_functions, options=options) - linked_code = linker.link("cubin") - assert isinstance(linked_code, ObjectCode) - - -def test_linker_link_invalid_target_type(compile_ptx_functions): - options = LinkerOptions(arch=ARCH) - linker = Linker(*compile_ptx_functions, options=options) - with pytest.raises(ValueError): - linker.link("invalid_target") - - -def test_linker_get_error_log(compile_ptx_functions): - options = LinkerOptions(arch=ARCH) - linker = Linker(*compile_ptx_functions, options=options) - linker.link("cubin") - log = linker.get_error_log() - assert isinstance(log, str) - - -def test_linker_get_info_log(compile_ptx_functions): - options = LinkerOptions(arch=ARCH) - linker = Linker(*compile_ptx_functions, options=options) - linker.link("cubin") - log = linker.get_info_log() - assert isinstance(log, str) +import pytest + +from cuda.core.experimental import Linker, LinkerOptions, Program +from cuda.core.experimental._module import ObjectCode + +ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper +empty_kernel = "__device__ void A() {}" +basic_kernel = "__device__ int B() { return 0; }" +addition_kernel = "__device__ int C(int a, int b) { return a + b; }" + +try: + from cuda.bindings import nvjitlink # noqa F401 + from cuda.bindings._internal import nvjitlink as inner_nvjitlink +except ImportError: + # binding is not available + culink_backend = True +else: + if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: + # binding is available, but nvJitLink is not installed + culink_backend = True + + +@pytest.fixture(scope="function") +def compile_ptx_functions(init_cuda): + object_code_a_ptx = Program(empty_kernel, "c++").compile("ptx") + object_code_b_ptx = Program(basic_kernel, "c++").compile("ptx") + object_code_c_ptx = Program(addition_kernel, "c++").compile("ptx") + + return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx + + +@pytest.fixture(scope="function") +def compile_ltoir_functions(init_cuda): + object_code_a_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_b_ltoir = Program(basic_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_c_ltoir = Program(addition_kernel, "c++").compile("ltoir", options=("-dlto",)) + + return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir + + +culink_options = [ + LinkerOptions(arch=ARCH), + LinkerOptions(arch=ARCH, max_register_count=32), + LinkerOptions(arch=ARCH, verbose=True), + LinkerOptions(arch=ARCH, optimization_level=3), + LinkerOptions(arch=ARCH, debug=True), + LinkerOptions(arch=ARCH, lineinfo=True), + LinkerOptions(arch=ARCH, no_cache=True), +] + + +@pytest.mark.parametrize( + "options", + culink_options + if culink_backend + else culink_options + + [ + LinkerOptions(arch=ARCH, time=True), + LinkerOptions(arch=ARCH, ftz=True), + LinkerOptions(arch=ARCH, prec_div=True), + LinkerOptions(arch=ARCH, prec_sqrt=True), + LinkerOptions(arch=ARCH, fma=True), + LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), + LinkerOptions(arch=ARCH, kernels_used=["kernel1", "kernel2"]), + LinkerOptions(arch=ARCH, 
variables_used=["var1"]), + LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]), + LinkerOptions(arch=ARCH, optimize_unused_variables=True), + LinkerOptions(arch=ARCH, xptxas=["-v"]), + LinkerOptions(arch=ARCH, split_compile=0), + LinkerOptions(arch=ARCH, split_compile_extended=1), + ], +) +def test_linker_init(compile_ptx_functions, options): + linker = Linker(*compile_ptx_functions, options=options) + object_code = linker.link("cubin") + assert isinstance(object_code, ObjectCode) + + +def test_linker_init_invalid_arch(): + options = LinkerOptions(arch=None) + with pytest.raises(TypeError): + Linker(options) + + +@pytest.mark.skipif(culink_backend, reason="culink does not support ptx option") +def test_linker_link_ptx(compile_ltoir_functions): + options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) + linker = Linker(*compile_ltoir_functions, options=options) + linked_code = linker.link("ptx") + assert isinstance(linked_code, ObjectCode) + + +def test_linker_link_cubin(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linked_code = linker.link("cubin") + assert isinstance(linked_code, ObjectCode) + + +def test_linker_link_invalid_target_type(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + with pytest.raises(ValueError): + linker.link("invalid_target") + + +def test_linker_get_error_log(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linker.link("cubin") + log = linker.get_error_log() + assert isinstance(log, str) + + +def test_linker_get_info_log(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linker.link("cubin") + log = linker.get_info_log() + assert isinstance(log, str) From 8ed625615a6bb6cd700b818620ab923c9a72ce38 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 14:59:17 -0800 Subject: [PATCH 053/111] update the test --- cuda_core/cuda/core/experimental/_linker.py | 2 +- cuda_core/docs/source/api.rst | 3 -- cuda_core/tests/test_linker.py | 39 ++++++++++++++------- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 7d95d371..bf232cad 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -309,7 +309,7 @@ def _init_driver(self): raise ValueError("split_compile option is not supported by the driver API") if self.split_compile_extended is not None: raise ValueError("split_compile_extended option is not supported by the driver API") - if self.no_cache is not None: + if self.no_cache is True: self.formatted_options.append(_driver.CUjit_cacheMode.CU_JIT_CACHE_OPTION_NONE) self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index a6a34e40..c3e66b52 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -31,13 +31,11 @@ CUDA compilation toolchain :toctree: generated/ Program -<<<<<<< HEAD Linker :template: dataclass.rst LinkerOptions -======= .. 
module:: cuda.core.experimental.utils @@ -53,4 +51,3 @@ Utility functions :template: dataclass.rst StridedMemoryView ->>>>>>> origin/main diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 15496b59..f5dc33dd 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,10 +4,15 @@ from cuda.core.experimental._module import ObjectCode ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper -empty_kernel = "__device__ void A() {}" -basic_kernel = "__device__ int B() { return 0; }" -addition_kernel = "__device__ int C(int a, int b) { return a + b; }" - +device_function_a = """ +__device__ int B(); +__device__ int C(int a, int b); +__device__ void A() { int result = C(B(), 1);} +""" +device_function_b = "__device__ int B() { return 0; }" +device_function_c = "__device__ int C(int a, int b) { return a + b; }" + +culink_backend = False try: from cuda.bindings import nvjitlink # noqa F401 from cuda.bindings._internal import nvjitlink as inner_nvjitlink @@ -22,18 +27,18 @@ @pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): - object_code_a_ptx = Program(empty_kernel, "c++").compile("ptx") - object_code_b_ptx = Program(basic_kernel, "c++").compile("ptx") - object_code_c_ptx = Program(addition_kernel, "c++").compile("ptx") + object_code_b_ptx = Program(device_function_b, "c++").compile("ptx") + object_code_c_ptx = Program(device_function_c, "c++").compile("ptx") + object_code_a_ptx = Program(device_function_a, "c++").compile("ptx") return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx @pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): - object_code_a_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) - object_code_b_ltoir = Program(basic_kernel, "c++").compile("ltoir", options=("-dlto",)) - object_code_c_ltoir = Program(addition_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_b_ltoir = Program(device_function_b, "c++").compile("ltoir", options=("-dlto",)) + object_code_c_ltoir = Program(device_function_c, "c++").compile("ltoir", options=("-dlto",)) + object_code_a_ltoir = Program(device_function_a, "c++").compile("ltoir", options=("-dlto",)) return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir @@ -60,8 +65,8 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, prec_div=True), LinkerOptions(arch=ARCH, prec_sqrt=True), LinkerOptions(arch=ARCH, fma=True), - LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), - LinkerOptions(arch=ARCH, kernels_used=["kernel1", "kernel2"]), + LinkerOptions(arch=ARCH, kernels_used=["A"]), + LinkerOptions(arch=ARCH, kernels_used=["C", "B"]), LinkerOptions(arch=ARCH, variables_used=["var1"]), LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]), LinkerOptions(arch=ARCH, optimize_unused_variables=True), @@ -83,13 +88,21 @@ def test_linker_init_invalid_arch(): @pytest.mark.skipif(culink_backend, reason="culink does not support ptx option") -def test_linker_link_ptx(compile_ltoir_functions): +def test_linker_link_ptx_nvjitlink(compile_ltoir_functions): options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) linker = Linker(*compile_ltoir_functions, options=options) linked_code = linker.link("ptx") assert isinstance(linked_code, ObjectCode) +@pytest.mark.skipif(not culink_backend, reason="nvjitlink requires lto for ptx linking") +def test_linker_link_ptx_culink(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, 
options=options) + linked_code = linker.link("ptx") + assert isinstance(linked_code, ObjectCode) + + def test_linker_link_cubin(compile_ptx_functions): options = LinkerOptions(arch=ARCH) linker = Linker(*compile_ptx_functions, options=options) From 188ae6223fbb2a8dc567551627483b75230149f8 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 15:01:38 -0800 Subject: [PATCH 054/111] update the test --- cuda_core/tests/test_linker.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index f5dc33dd..b4008ab6 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,6 +4,8 @@ from cuda.core.experimental._module import ObjectCode ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper + + device_function_a = """ __device__ int B(); __device__ int C(int a, int b); From 0522d2b71fb682199dfd801f61501aebc030f82e Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 15:10:51 -0800 Subject: [PATCH 055/111] update the documentation to touch on LinkerOptions vs CUDA version --- cuda_core/docs/source/release/0.1.1-notes.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md index 6e491a62..29694f4a 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.md +++ b/cuda_core/docs/source/release/0.1.1-notes.md @@ -6,9 +6,13 @@ Released on Nov , 2024 - Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete implementation of DLPack & CUDA Array Interface supports. -- Addition of the Linker class which gives object oriented and pythonic access to the nvJitLink API. +- Addition of the Linker class which gives object oriented and pythonic access to the nvJitLink or cuLink API + depending on your CUDA version. ## Limitations - All APIs are currently *experimental* and subject to change without deprecation notice. Please kindly share your feedbacks with us so that we can make `cuda.core` better! +- Some LinkerOptions are only available when using a modern version of CUDA. When using CUDA <12, + the backend is the cuLink api which supports only a subset of the options that nvjitlink does. 
+  Further, some options aren't available on CUDA versions <12.6

From 06b77e1ef0743fe2e585ef186c5d555a77012132 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Thu, 5 Dec 2024 04:20:36 +0000
Subject: [PATCH 056/111] use cpu8 runners for build; remove unnecessary mac
 condition
---
 .github/workflows/gh-build-and-test.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml
index 430fbf5b..ffc6f959 100644
--- a/.github/workflows/gh-build-and-test.yml
+++ b/.github/workflows/gh-build-and-test.yml
@@ -27,9 +27,11 @@ jobs:
     with:
       client-repo: ${{ github.event.repository.name }}
       target-device: ${{ inputs.target-device }}
-      runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu16') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }}
+      runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu8') ||
+                   (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') }}
       build-type: ${{ inputs.build-type }}
-      use-container: ${{ inputs.host-platform == 'linux-x64' || inputs.host-platform == 'linux-aarch64'}}
+      use-container: ${{ inputs.host-platform == 'linux-x64' ||
+                         inputs.host-platform == 'linux-aarch64'}}
       host-platform: ${{ inputs.host-platform }}
       dependencies-file: ""
       build-mode: ${{ inputs.build-mode }}

From 61be96cfce5c64ad0ad96e53fc3bb6e474c136d9 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Thu, 5 Dec 2024 04:23:16 +0000
Subject: [PATCH 057/111] always require manual CI triggering
---
 .github/copy-pr-bot.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
index 895ba83e..2771228b 100644
--- a/.github/copy-pr-bot.yaml
+++ b/.github/copy-pr-bot.yaml
@@ -2,3 +2,6 @@
 # https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
 
 enabled: true
+# always require manual CI triggering, ignoring signed commits
+auto_sync_draft: false
+auto_sync_ready: false

From df8cbea9727a7438b9013d2c695a8d6a21560537 Mon Sep 17 00:00:00 2001
From: ksimpson
Date: Thu, 5 Dec 2024 10:38:26 -0800
Subject: [PATCH 058/111] use rdc for nvrtc compilation and improve exception
 reporting by dumping the log
---
 cuda_core/cuda/core/experimental/_linker.py | 45 +++++++++++++++++----
 cuda_core/tests/test_linker.py              | 12 +++---
 2 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
index bf232cad..b6c28ba5 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -64,7 +64,10 @@ def _lazy_init():
 
 @dataclass
 class LinkerOptions:
-    """Customizable :obj:`LinkerOptions` for nvJitLink.
+    """Customizable :obj:`LinkerOptions` for nvJitLink or driver API. Some options are only available
+    when using the cuda.bindings.nvjitlink backend. Some options are only available when using newer
+    or older versions of CUDA.
+
 
     Attributes
     ----------
@@ -350,11 +353,16 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None):
         self._options = options = check_or_create_options(LinkerOptions, options, "Linker options")
 
         if _nvjitlink:
-            handle = _nvjitlink.create(len(options.formatted_options), options.formatted_options)
+            handle = self._exception_manager(
+                lambda: _nvjitlink.create(len(options.formatted_options), options.formatted_options)
+            )
+
             use_nvjitlink = True
         else:
-            handle = handle_return(
-                _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options)
+            handle = self._exception_manager(
+                lambda: handle_return(
+                    _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options)
+                )
             )
             use_nvjitlink = False
         self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink)
@@ -363,6 +371,27 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None):
             assert isinstance(code, ObjectCode)
             self._add_code_object(code)
 
+    def _exception_manager(self, action):
+        """
+        Helper function to improve the error message of exceptions raised by the linker backend.
+
+        Parameters
+        ----------
+        action : callable
+            The action to be performed.
+
+        Returns
+        -------
+        The return value of the action.
+        """
+        try:
+            return action()
+        except Exception as e:
+            error = self.get_error_log()
+            raise RuntimeError(
+                f"Exception raised by {'nvjitlink' if _nvjitlink else 'cuLink'}: {e}.\nLinker error log: {error}"
+            ) from e
+
     def _add_code_object(self, object_code: ObjectCode):
         data = object_code._module
         assert isinstance(data, bytes)
@@ -392,7 +421,7 @@ def link(self, target_type) -> ObjectCode:
         if target_type not in ("cubin", "ptx"):
             raise ValueError(f"Unsupported target type: {target_type}")
         if _nvjitlink:
-            _nvjitlink.complete(self._mnff.handle)
+            self._exception_manager(lambda: _nvjitlink.complete(self._mnff.handle))
             if target_type == "cubin":
                 get_size = _nvjitlink.get_linked_cubin_size
                 get_code = _nvjitlink.get_linked_cubin
@@ -400,11 +429,11 @@ def link(self, target_type) -> ObjectCode:
             get_size = _nvjitlink.get_linked_ptx_size
             get_code = _nvjitlink.get_linked_ptx
 
-        size = get_size(self._mnff.handle)
+        size = self._exception_manager(lambda: get_size(self._mnff.handle))
         code = bytearray(size)
-        get_code(self._mnff.handle, code)
+        self._exception_manager(lambda: get_code(self._mnff.handle, code))
         else:
-            addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle))
+            addr, size = self._exception_manager(lambda: handle_return(_driver.cuLinkComplete(self._mnff.handle)))
             code = (ctypes.c_char * size).from_address(addr)
 
         return ObjectCode(bytes(code), target_type)
diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py
index b4008ab6..6163d9a8 100644
--- a/cuda_core/tests/test_linker.py
+++ b/cuda_core/tests/test_linker.py
@@ -9,7 +9,7 @@
 device_function_a = """
 __device__ int B();
 __device__ int C(int a, int b);
-__device__ void A() { int result = C(B(), 1);}
+__global__ void A() { int result = C(B(), 1);}
 """
 device_function_b = "__device__ int B() { return 0; }"
 device_function_c = "__device__ int C(int a, int b) { return a + b; }"
@@ -29,9 +29,11 @@
 
 @pytest.fixture(scope="function")
 def compile_ptx_functions(init_cuda):
-    object_code_b_ptx = Program(device_function_b, "c++").compile("ptx")
-    object_code_c_ptx = Program(device_function_c, "c++").compile("ptx")
-    object_code_a_ptx = Program(device_function_a, "c++").compile("ptx")
+    # Without rdc (relocatable device code) option,
the generated ptx will not include any unreferenced
+    # device functions, causing the link to fail
+    object_code_b_ptx = Program(device_function_b, "c++").compile("ptx", options=("-rdc=true",))
+    object_code_c_ptx = Program(device_function_c, "c++").compile("ptx", options=("-rdc=true",))
+    object_code_a_ptx = Program(device_function_a, "c++").compile("ptx", options=("-rdc=true",))
 
     return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx
 
@@ -46,7 +48,7 @@ def compile_ltoir_functions(init_cuda):
 
 culink_options = [
-    LinkerOptions(arch=ARCH),
+    LinkerOptions(arch=ARCH, verbose=True),
     LinkerOptions(arch=ARCH, max_register_count=32),
     LinkerOptions(arch=ARCH, verbose=True),
     LinkerOptions(arch=ARCH, optimization_level=3),

From 761bea0b83252519d986d8111e88cad5edaebfcf Mon Sep 17 00:00:00 2001
From: ksimpson
Date: Thu, 5 Dec 2024 10:52:32 -0800
Subject: [PATCH 059/111] add note to link()
---
 cuda_core/cuda/core/experimental/_linker.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
index b6c28ba5..01c4a0e9 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -418,6 +418,24 @@ def _add_code_object(self, object_code: ObjectCode):
         )
 
     def link(self, target_type) -> ObjectCode:
+        """
+        Links the provided object codes into a single output of the specified target type.
+
+        Parameters
+        ----------
+        target_type : str
+            The type of the target output. Must be either "cubin" or "ptx".
+
+        Returns
+        -------
+        ObjectCode
+            The linked object code of the specified target type.
+
+        Note
+        ------
+        See nvrtc compiler options documentation to ensure the input ObjectCodes are
+        correctly compiled for linking.
+        """
         if target_type not in ("cubin", "ptx"):
             raise ValueError(f"Unsupported target type: {target_type}")
         if _nvjitlink:

From b6d73c8ef4efb9ceb97c52f73cc4fb0a60d910c6 Mon Sep 17 00:00:00 2001
From: Keenan Simpson
Date: Fri, 6 Dec 2024 11:13:14 -0800
Subject: [PATCH 060/111] Update cuda_core/cuda/core/experimental/_memory.py

Co-authored-by: Leo Fang
---
 cuda_core/cuda/core/experimental/_memory.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py
index 5ff00ba2..12fafb39 100644
--- a/cuda_core/cuda/core/experimental/_memory.py
+++ b/cuda_core/cuda/core/experimental/_memory.py
@@ -307,6 +307,9 @@ def allocate(self, size, stream=None) -> Buffer:
         return Buffer(ptr, size, self)
 
     def deallocate(self, ptr, size, stream=None):
+        if stream is None:
+            stream = default_stream()
+        stream.sync()
         handle_return(cuda.cuMemFree(ptr))
 
     @property

From 1c86afa7bc49a33117f6a0c35c1b9fdb951f3943 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 6 Dec 2024 16:43:23 -0500
Subject: [PATCH 061/111] Full CI support for public builds + switch to use
 cibuildwheel (#267)

* switch to cibuildwheel + some cleanups
* try setting up Python manually
* comment out un-needed code + propagate python-version
* fix: need to check out first
* only build natively; add -v; ensure targeting manylinux
* single quotes
* restore env setup to get artifact dir
* fix artifact dir
* fix artifact name
* restore & fix artifact name again
* build on all platforms!
* fix CIBW_BUILD for windows
* fix typo
* no quotes for wildcard matching
* move CIBW_BUILD logic to script
* fix win runner name
* try to find where pwsh is
* try to escape
* continue hunting..
* try to overwrite shell * try to install ps * be explicit about shell (why?) * only build for win 64 bits * try to install msvc * install msvc ourselves * fix typo * skip custom cl ver check * install to standard location * try to locate Python include path * switch to public windows runner for now * windows image does not have sudo * pwd on Windows Bash does not use Windows path format * cover all Python versions! * add quotes * align the python version installed via GHA vs used at build time * fix constraint syntax * check if setup-python is causing interference * fix typo * apply a WAR on Linux * fix unbound var * detect Python path after it's installed (by CIBW) * try CIBW_BEFORE_ALL_LINUX * try to restore the pre-py-span setup... * reduce build matrix to experiment with cuda.bindings builds * fix parenthesis * use abs path * defer CUDA_PATH setting * use CIBW_ENVIRONMENT to pass env var * fetch cuda-profiler-api headers * only rely on redist * allow wheel repair to fix the triplet tags * restore full build matrix! * fix wget on Windows; pass PARALLEL_LEVEL to CIBW * switch from wget to curl * windows needs unzip not tar * mv -> rsync * git bash has no wget or rsync... * ensure win-style path on win * convert abs path * debug * another level down * check if it is a race condition on win * clean up unused (but still relevant) code * clean up unused (but still relevant) code - cont'd * consolidate with PYTHON_VERSION_FORMATTED --- .github/actions/build/action.yml | 108 ++++++++++++----------- .github/actions/setup/action.yml | 112 +++++++++++++++++++----- .github/workflows/ci-gh.yml | 13 ++- .github/workflows/gh-build-and-test.yml | 21 +++-- .github/workflows/gh-build.yml | 13 ++- 5 files changed, 178 insertions(+), 89 deletions(-) diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index 952fb9cd..e1552ae8 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -13,73 +13,79 @@ inputs: host-platform: required: true type: string - use-container: - required: true - type: boolean - docker-image: - type: string - required: true upload-enabled: required: true type: boolean - python-version: - required: true - type: string runs: using: composite steps: + - name: Build cuda.core wheel + uses: pypa/cibuildwheel@v2.22.0 + env: + CIBW_BUILD: ${{ env.CIBW_BUILD }} + CIBW_ARCHS_LINUX: "native" + CIBW_BUILD_VERBOSITY: 1 + # # ensure Python.h & co can be found + # CIBW_BEFORE_BUILD_WINDOWS: > + # python -c "import sysconfig; print(sysconfig.get_path('include'))" >> $env:INCLUDE + with: + package-dir: ./cuda_core/ + output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - - if: ${{ inputs.use-container }} - name: Build (in container) - shell: bash --noprofile --norc -xeuo pipefail {0} - run: | - - docker run \ - -e AWS_REGION \ - -e AWS_SESSION_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e GITHUB_TOKEN \ - -e BINDINGS_ARTIFACTS_DIR="$BINDINGS_ARTIFACTS_DIR" \ - -e CORE_ARTIFACTS_DIR="$CORE_ARTIFACTS_DIR" \ - -e UPLOAD_ENABLED="$UPLOAD_ENABLED" \ - -e USE_CUDA="$USE_CUDA" \ - -e REPO_DIR="$REPO_DIR" \ - -e LEGATE_CORE_BUILD_MODE="$LEGATE_CORE_BUILD_MODE" \ - -e PYTHON_VERSION="$PYTHON_VERSION" \ - -v "${{ env.REPO_DIR }}:${{ env.REPO_DIR }}" \ - -v "${{ env.BINDINGS_ARTIFACTS_DIR }}:${{ env.BINDINGS_ARTIFACTS_DIR }}" \ - -v "${{ env.CORE_ARTIFACTS_DIR }}:${{ env.CORE_ARTIFACTS_DIR }}" \ - --rm "${{ inputs.docker-image }}" \ - /bin/bash -c "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint ${{ env.REPO_DIR 
}}/continuous_integration/scripts/build ${{ inputs.build-type}} ${{ inputs.target-device }}" - - - if: ${{ !inputs.use-container }} - name: Build (without container) - shell: bash --noprofile --norc -xeuo pipefail {0} - run: | - "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint" "${{ env.REPO_DIR }}/continuous_integration/scripts/build" "${{ inputs.build-type}}" "${{ inputs.target-device }}" - - - name: Display structure of the bindings artifacts folder (post build) + - name: List the cuda.core artifacts directory shell: bash --noprofile --norc -xeuo pipefail {0} run: | - sudo chown -R $(whoami) ${{ env.BINDINGS_ARTIFACTS_DIR }} - ls -lahR ${{ env.BINDINGS_ARTIFACTS_DIR }} + if [[ "${{ inputs.host-platform }}" == win* ]]; then + export CHOWN=chown + else + export CHOWN="sudo chown" + fi + $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - - name: Upload bindings build artifacts + - name: Upload cuda.core build artifacts uses: actions/upload-artifact@v4 with: - name: ${{ env.BINDINGS_ARTIFACT_NAME }} - path: ${{ env.BINDINGS_ARTIFACTS_DIR }} + name: ${{ env.CUDA_CORE_ARTIFACT_NAME }} + path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl + if-no-files-found: error + overwrite: 'true' + + - name: Build cuda.bindings wheel + uses: pypa/cibuildwheel@v2.22.0 + env: + CIBW_BUILD: ${{ env.CIBW_BUILD }} + CIBW_ARCHS_LINUX: "native" + CIBW_BUILD_VERBOSITY: 1 + CIBW_ENVIRONMENT_LINUX: > + CUDA_PATH="$(realpath ./cuda_toolkit)" + PARALLEL_LEVEL=${{ env.PARALLEL_LEVEL }} + CIBW_ENVIRONMENT_WINDOWS: > + CUDA_HOME="$(cygpath -w $(realpath ./cuda_toolkit))" + # PARALLEL_LEVEL=${{ env.PARALLEL_LEVEL }} + # # ensure Python.h & co can be found + # CIBW_BEFORE_BUILD_WINDOWS: > + # python -c "import sysconfig; print(sysconfig.get_path('include'))" >> $env:INCLUDE + with: + package-dir: ./cuda_bindings/ + output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} - - name: Display structure of the core artifacts folder (post build) + - name: List the cuda.bindings artifacts directory shell: bash --noprofile --norc -xeuo pipefail {0} run: | - sudo chown -R $(whoami) ${{ env.CORE_ARTIFACTS_DIR }} - ls -lahR ${{ env.CORE_ARTIFACTS_DIR }} + if [[ "${{ inputs.host-platform }}" == win* ]]; then + export CHOWN=chown + else + export CHOWN="sudo chown" + fi + $CHOWN -R $(whoami) ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} + ls -lahR ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} - - name: Upload core build artifacts + - name: Upload cuda.bindings build artifacts uses: actions/upload-artifact@v4 with: - name: ${{ env.CORE_ARTIFACT_NAME }} - path: ${{ env.CORE_ARTIFACTS_DIR }} + name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }} + path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl + if-no-files-found: error + overwrite: 'true' diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index c2a8407c..e00cf27f 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -22,34 +22,106 @@ inputs: python-version: required: true type: string + cuda-version: + required: true + type: string runs: using: composite steps: - - name: Set REPO_DIR and Dump environment + # WAR: setup-python is not relocatable... 
+ # see https://github.com/actions/setup-python/issues/871 + - name: Set up Python ${{ inputs.python-version }} + if: ${{ startsWith(inputs.host-platform, 'linux') }} + id: setup-python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Set up MSVC + if: ${{ startsWith(inputs.host-platform, 'win') }} + uses: ilammy/msvc-dev-cmd@v1 + + - name: Dump environment shell: bash --noprofile --norc -xeuo pipefail {0} run: | - echo "REPO_DIR=$(pwd)" >> $GITHUB_ENV env - - name: Set environment variables + - name: Get CUDA components shell: bash --noprofile --norc -xeuo pipefail {0} run: | + CUDA_PATH="./cuda_toolkit" + mkdir $CUDA_PATH - WITH_TESTS_STR='' - if [[ ("${{ inputs.upload-enabled }}" == "false") && ("${{ inputs.build-type }}" != "ci") ]]; then - WITH_TESTS_STR='-with_tests' + # The binary archives (redist) are guaranteed to be updated as part of the release posting. + CTK_BASE_URL="https://developer.download.nvidia.com/compute/cuda/redist/" + CTK_JSON_URL="$CTK_BASE_URL/redistrib_${{ inputs.cuda-version }}.json" + if [[ "${{ inputs.host-platform }}" == linux* ]]; then + if [[ "${{ inputs.host-platform }}" == "linux-x64" ]]; then + CTK_SUBDIR="linux-x86_64" + elif [[ "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then + CTK_SUBDIR="linux-sbsa" + fi + function extract() { + tar -xvf $1 -C $CUDA_PATH --strip-components=1 + } + elif [[ "${{ inputs.host-platform }}" == "win-x64" ]]; then + CTK_SUBDIR="windows-x86_64" + function extract() { + _TEMP_DIR_=$(mktemp -d) + unzip $1 -d $_TEMP_DIR_ + cp -r $_TEMP_DIR_/*/* $CUDA_PATH + rm -rf $_TEMP_DIR_ + } fi + function populate_cuda_path() { + # take the component name as a argument + function download() { + curl -kLSs $1 -o $2 + } + CTK_COMPONENT=$1 + CTK_COMPONENT_REL_PATH="$(curl -s $CTK_JSON_URL | + python -c "import sys, json; print(json.load(sys.stdin)['${CTK_COMPONENT}']['${CTK_SUBDIR}']['relative_path'])")" + CTK_COMPONENT_URL="${CTK_BASE_URL}/${CTK_COMPONENT_REL_PATH}" + CTK_COMPONENT_COMPONENT_FILENAME="$(basename $CTK_COMPONENT_REL_PATH)" + download $CTK_COMPONENT_URL $CTK_COMPONENT_COMPONENT_FILENAME + extract $CTK_COMPONENT_COMPONENT_FILENAME + rm $CTK_COMPONENT_COMPONENT_FILENAME + } - TARGET_PLATFORM='linux-64' - if [[ "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then + # Get headers and shared libraries in place + populate_cuda_path cuda_nvcc + populate_cuda_path cuda_cudart + populate_cuda_path cuda_nvrtc + populate_cuda_path cuda_profiler_api + ls -l $CUDA_PATH + + # Note: the headers will be copied into the cibuildwheel manylinux container, + # so setting the CUDA_PATH env var here is meaningless. + + - name: Set environment variables + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + # TODO: just align host-platform names with TARGET_PLATFORM... 
+ if [[ "${{ inputs.host-platform }}" == "linux-x64" ]]; then + TARGET_PLATFORM='linux-64' + elif [[ "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then TARGET_PLATFORM='linux-aarch64' + elif [[ "${{ inputs.host-platform }}" == "win-x64" ]]; then + TARGET_PLATFORM='win-64' fi - BUILD_MODE="${{ inputs.build-mode }}" - BUILD_MODE_STR="" - [ -n "${BUILD_MODE}" ] && BUILD_MODE_STR="-${BUILD_MODE}" + PYTHON_VERSION_FORMATTED=$(echo '${{ inputs.python-version }}' | tr -d '.') + if [[ "${{ inputs.host-platform }}" == linux* ]]; then + CIBW_BUILD="cp${PYTHON_VERSION_FORMATTED}-manylinux*" + REPO_DIR=$(pwd) + elif [[ "${{ inputs.host-platform }}" == win* ]]; then + CIBW_BUILD="cp${PYTHON_VERSION_FORMATTED}-win_amd64" + PWD=$(pwd) + REPO_DIR=$(cygpath -w $PWD) + fi + BUILD_MODE="${{ inputs.build-mode }}" if [[ ("${BUILD_MODE}" == "") || ("${BUILD_MODE}" == "release") ]]; then # We upload release versions in the default folder. PKG_DIR="${TARGET_PLATFORM}" @@ -57,16 +129,14 @@ runs: PKG_DIR="${BUILD_MODE}/${TARGET_PLATFORM}" fi - PYTHON_VERSION_FORMATTED=$(echo '${{ inputs.python-version }}' | tr -d '.') - - echo "BINDINGS_ARTIFACT_NAME=${{ inputs.host-platform }}-${{ inputs.build-type }}-cuda_bindings-python${PYTHON_VERSION_FORMATTED}-${{ inputs.target-device }}${BUILD_MODE_STR}${WITH_TESTS_STR}-${{ github.sha }}" >> $GITHUB_ENV - echo "BINDINGS_ARTIFACTS_DIR=$(realpath "$(pwd)/cuda_bindings/dist")" >> $GITHUB_ENV - echo "CORE_ARTIFACT_NAME=${{ inputs.host-platform }}-${{ inputs.build-type }}-cuda_core-python${PYTHON_VERSION_FORMATTED}-${{ inputs.target-device }}${BUILD_MODE_STR}${WITH_TESTS_STR}-${{ github.sha }}" >> $GITHUB_ENV - echo "CORE_ARTIFACTS_DIR=$(realpath "$(pwd)/cuda_core/dist")" >> $GITHUB_ENV - echo "USE_CUDA=${{ (inputs.target-device == 'cpu' && 'OFF') || 'ON' }}" >> $GITHUB_ENV + echo "PARALLEL_LEVEL=$(nproc)" >> $GITHUB_ENV + echo "REPO_DIR=$REPO_DIR" >> $GITHUB_ENV + echo "PKG_DIR=${PKG_DIR}" >> $GITHUB_ENV + echo "CUDA_CORE_ARTIFACT_NAME=cuda-core-python${PYTHON_VERSION_FORMATTED}-${{ inputs.host-platform }}-${{ inputs.build-type }}-${{ github.sha }}" >> $GITHUB_ENV + echo "CUDA_CORE_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_core/dist")" >> $GITHUB_ENV + echo "CUDA_BINDINGS_ARTIFACT_NAME=cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${{ inputs.cuda-version }}-${{ inputs.host-platform }}-${{ inputs.build-type }}-${{ github.sha }}" >> $GITHUB_ENV + echo "CUDA_BINDINGS_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_bindings/dist")" >> $GITHUB_ENV echo "UPLOAD_ENABLED=${{ (inputs.upload-enabled == 'true' && 'ON') || 'OFF' }}" >> $GITHUB_ENV - echo "LEGATE_CORE_BUILD_MODE=${BUILD_MODE}" >> $GITHUB_ENV echo "BUILD_DATE=$(date +%Y%m%d)" >> $GITHUB_ENV echo "TARGET_PLATFORM=${TARGET_PLATFORM}" >> $GITHUB_ENV - echo "PKG_DIR=${PKG_DIR}" >> $GITHUB_ENV - echo "PYTHON_VERSION=${{ inputs.python-version }}" >> $GITHUB_ENV + echo "CIBW_BUILD=${CIBW_BUILD}" >> $GITHUB_ENV diff --git a/.github/workflows/ci-gh.yml b/.github/workflows/ci-gh.yml index d38cb8e3..1975c3b5 100644 --- a/.github/workflows/ci-gh.yml +++ b/.github/workflows/ci-gh.yml @@ -18,6 +18,8 @@ jobs: matrix: host-platform: - linux-x64 + - linux-aarch64 + - win-x64 target-device: - gpu build-mode: @@ -25,8 +27,14 @@ jobs: upload-enabled: - false python-version: - #TODO cover the whole python and cuda matrix - - 3.12 + - "3.12" + - "3.11" + - "3.10" + - "3.9" + cuda-version: + # Note: this is for build-time only; the test-time matrix needs to be + # defined separately. 
+ - "12.6.2" uses: ./.github/workflows/gh-build-and-test.yml with: @@ -36,4 +44,5 @@ jobs: build-type: ci upload-enabled: ${{ matrix.upload-enabled }} python-version: ${{ matrix.python-version }} + cuda-version: ${{ matrix.cuda-version }} secrets: inherit diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index ffc6f959..a9a711d4 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -1,16 +1,16 @@ on: workflow_call: inputs: - host-platform: + target-device: type: string required: true - target-device: + build-type: type: string required: true - build-mode: + host-platform: type: string required: true - build-type: + build-mode: type: string required: true upload-enabled: @@ -19,6 +19,10 @@ on: python-version: type: string required: true + cuda-version: + type: string + required: true + jobs: build: if: ${{ github.repository_owner == 'nvidia' }} @@ -28,13 +32,14 @@ jobs: client-repo: ${{ github.event.repository.name }} target-device: ${{ inputs.target-device }} runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu8') || - (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') }} + (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || + (inputs.host-platform == 'win-x64' && 'windows-2019') }} + # (inputs.host-platform == 'win-x64' && 'windows-amd64-cpu8') }} build-type: ${{ inputs.build-type }} - use-container: ${{ inputs.host-platform == 'linux-x64' || - inputs.host-platform == 'linux-aarch64'}} host-platform: ${{ inputs.host-platform }} - dependencies-file: "" build-mode: ${{ inputs.build-mode }} upload-enabled: ${{ inputs.upload-enabled }} python-version: ${{ inputs.python-version }} + cuda-version: ${{ inputs.cuda-version }} + dependencies-file: "" secrets: inherit diff --git a/.github/workflows/gh-build.yml b/.github/workflows/gh-build.yml index c60e0c2a..7a9f03ce 100644 --- a/.github/workflows/gh-build.yml +++ b/.github/workflows/gh-build.yml @@ -16,9 +16,6 @@ on: required: true type: string description: One of ci / release - use-container: - required: true - type: boolean host-platform: required: true type: string @@ -35,10 +32,13 @@ on: python-version: required: true type: string + cuda-version: + required: true + type: string jobs: build: - name: Build (${{ inputs.host-platform }}, ${{ inputs.target-device }}, ${{ inputs.build-type }}, CMake build-mode=${{ inputs.build-mode }}, Python "${{ inputs.python-version }}", Use container=${{ inputs.use-container }} ) + name: Build (${{ inputs.host-platform }}, ${{ inputs.build-type }}, ${{ inputs.build-mode }}, Python "${{ inputs.python-version }}") permissions: id-token: write # This is required for configure-aws-credentials @@ -52,7 +52,7 @@ jobs: with: fetch-depth: 0 - - name: Setup + - name: Set up build environment uses: ./.github/actions/setup with: client-repo: ${{ inputs.client-repo }} @@ -62,6 +62,7 @@ jobs: build-mode: ${{ inputs.build-mode }} upload-enabled: ${{ inputs.upload-enabled }} python-version: ${{ inputs.python-version }} + cuda-version: ${{ inputs.cuda-version }} - name: Call build action uses: ./.github/actions/build @@ -69,6 +70,4 @@ jobs: build-type: ${{ inputs.build-type }} target-device: "${{ inputs.target-device }}" host-platform: ${{ inputs.host-platform }} - use-container: ${{ inputs.use-container }} - docker-image: "condaforge/miniforge3:latest" upload-enabled: ${{ inputs.upload-enabled }} From 8118f68bb6753e8f6faeb71350244138d04685a5 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: 
Fri, 6 Dec 2024 06:15:50 +0000 Subject: [PATCH 062/111] add PY313 build pipelines --- .github/workflows/ci-gh.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci-gh.yml b/.github/workflows/ci-gh.yml index 1975c3b5..0b965a52 100644 --- a/.github/workflows/ci-gh.yml +++ b/.github/workflows/ci-gh.yml @@ -27,6 +27,7 @@ jobs: upload-enabled: - false python-version: + - "3.13" - "3.12" - "3.11" - "3.10" From 9fdbc9fe49cd9eb6a84fdf78326f5db7ea584969 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 6 Dec 2024 15:30:04 -0800 Subject: [PATCH 063/111] remove duplicate test --- cuda_core/tests/test_linker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 6163d9a8..a9b5d1c2 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -50,7 +50,6 @@ def compile_ltoir_functions(init_cuda): culink_options = [ LinkerOptions(arch=ARCH, verbose=True), LinkerOptions(arch=ARCH, max_register_count=32), - LinkerOptions(arch=ARCH, verbose=True), LinkerOptions(arch=ARCH, optimization_level=3), LinkerOptions(arch=ARCH, debug=True), LinkerOptions(arch=ARCH, lineinfo=True), From 677bd6df015834ad5fed6d2cd075623c1935229b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 02:38:20 +0000 Subject: [PATCH 064/111] reuse backend decision logic in tests + some nitpicks --- cuda_core/cuda/core/experimental/_linker.py | 36 ++++++++++++++------- cuda_core/tests/test_linker.py | 27 +++++----------- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 01c4a0e9..8cd603d1 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -20,29 +20,43 @@ _nvjitlink_input_types = None # populated if nvJitLink cannot be used -def _lazy_init(): - global _inited - if _inited: +# Note: this function is reused in the tests +def _decide_nvjitlink_or_driver(): + """Returns True if falling back to the cuLink* driver APIs.""" + global _driver_ver, _driver, _nvjitlink + if _driver or _nvjitlink: return - global _driver, _driver_input_types, _driver_ver, _nvjitlink, _nvjitlink_input_types _driver_ver = handle_return(cuda.cuDriverGetVersion()) _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10) try: - from cuda.bindings import nvjitlink + from cuda.bindings import nvjitlink as _nvjitlink from cuda.bindings._internal import nvjitlink as inner_nvjitlink except ImportError: # binding is not available - nvjitlink = None + _nvjitlink = None else: if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: # binding is available, but nvJitLink is not installed - nvjitlink = None - elif _driver_ver > nvjitlink.version(): + _nvjitlink = None + + if _nvjitlink is None: + _driver = cuda + return True + else: + return False + + +def _lazy_init(): + global _inited, _nvjitlink_input_types, _driver_input_types + if _inited: + return + + _decide_nvjitlink_or_driver() + if _nvjitlink: + if _driver_ver > _nvjitlink.version(): # TODO: nvJitLink is not new enough, warn? 
pass - if nvjitlink: - _nvjitlink = nvjitlink _nvjitlink_input_types = { "ptx": _nvjitlink.InputType.PTX, "cubin": _nvjitlink.InputType.CUBIN, @@ -51,8 +65,6 @@ def _lazy_init(): "object": _nvjitlink.InputType.OBJECT, } else: - from cuda import cuda as _driver - _driver_input_types = { "ptx": _driver.CUjitInputType.CU_JIT_INPUT_PTX, "cubin": _driver.CUjitInputType.CU_JIT_INPUT_CUBIN, diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index a9b5d1c2..1af746f8 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -1,48 +1,37 @@ import pytest -from cuda.core.experimental import Linker, LinkerOptions, Program +from cuda.core.experimental import Linker, LinkerOptions, Program, _linker from cuda.core.experimental._module import ObjectCode ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper - -device_function_a = """ -__device__ int B(); -__device__ int C(int a, int b); +kernel_a = """ +extern __device__ int B(); +extern __device__ int C(int a, int b); __global__ void A() { int result = C(B(), 1);} """ device_function_b = "__device__ int B() { return 0; }" device_function_c = "__device__ int C(int a, int b) { return a + b; }" -culink_backend = False -try: - from cuda.bindings import nvjitlink # noqa F401 - from cuda.bindings._internal import nvjitlink as inner_nvjitlink -except ImportError: - # binding is not available - culink_backend = True -else: - if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: - # binding is available, but nvJitLink is not installed - culink_backend = True +culink_backend = _linker._decide_nvjitlink_or_driver() @pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): - # Without rdc (relocatable device code) option, the generated ptx will not included any unreferenced + # Without -rdc (relocatable device code) option, the generated ptx will not included any unreferenced # device functions, causing the link to fail + object_code_a_ptx = Program(kernel_a, "c++").compile("ptx", options=("-rdc=true",)) object_code_b_ptx = Program(device_function_b, "c++").compile("ptx", options=("-rdc=true",)) object_code_c_ptx = Program(device_function_c, "c++").compile("ptx", options=("-rdc=true",)) - object_code_a_ptx = Program(device_function_a, "c++").compile("ptx", options=("-rdc=true",)) return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx @pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): + object_code_a_ltoir = Program(kernel_a, "c++").compile("ltoir", options=("-dlto",)) object_code_b_ltoir = Program(device_function_b, "c++").compile("ltoir", options=("-dlto",)) object_code_c_ltoir = Program(device_function_c, "c++").compile("ltoir", options=("-dlto",)) - object_code_a_ltoir = Program(device_function_a, "c++").compile("ltoir", options=("-dlto",)) return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir From 758ae01a7855775b88b91ab7250c34c33225eca8 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 03:24:23 +0000 Subject: [PATCH 065/111] make _exception_manager a ctx mgr --- cuda_core/cuda/core/experimental/_linker.py | 88 ++++++++++----------- 1 file changed, 42 insertions(+), 46 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 8cd603d1..c27ea94d 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -4,6 +4,7 @@ import ctypes import weakref +from contextlib import contextmanager from dataclasses 
import dataclass from typing import List, Optional @@ -329,6 +330,26 @@ def _init_driver(self): self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) +# This needs to be a free function not a method, as it's disallowed by contextmanager. +@contextmanager +def _exception_manager(self): + """ + A helper function to improve the error message of exceptions raised by the linker backend. + """ + try: + yield + except Exception as e: + error_log = "" + if hasattr(self, "_mnff"): + # our constructor could raise, in which case there's no handle available + error_log = self.get_error_log() + # Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but + # unfortunately we are still supporting Python 3.9/3.10... + # Here we rely on both CUDAError and nvJitLinkError have the error string placed in .args[0]. + e.args = (e.args[0] + (f"\nLinker error log: {error_log}" if error_log else ""), *e.args[1:]) + raise e + + class Linker: """ Linker class for managing the linking of object codes with specified options. @@ -364,46 +385,21 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): raise ValueError("At least one ObjectCode object must be provided") self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") - if _nvjitlink: - handle = self._exception_manager( - lambda: _nvjitlink.create(len(options.formatted_options), options.formatted_options) - ) - - use_nvjitlink = True - else: - handle = self._exception_manager( - lambda: handle_return( + with _exception_manager(self): + if _nvjitlink: + handle = _nvjitlink.create(len(options.formatted_options), options.formatted_options) + use_nvjitlink = True + else: + handle = handle_return( _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options) ) - ) - use_nvjitlink = False + use_nvjitlink = False self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink) for code in object_codes: assert isinstance(code, ObjectCode) self._add_code_object(code) - def _exception_manager(self, action): - """ - Helper function to improve the error message of excepotions raised by the linker backend. - - Parameters - ---------- - action : callable - The action to be performed. - - Returns - ------- - The return value of the action. 
- """ - try: - return action() - except Exception as e: - error = self.get_error_log() - raise RuntimeError( - f"Exception raised by {"nvjitlink" if _nvjitlink else "cuLink"}: {e}.\nLinker error log: {error}" - ) from e - def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) @@ -450,21 +446,21 @@ def link(self, target_type) -> ObjectCode: """ if target_type not in ("cubin", "ptx"): raise ValueError(f"Unsupported target type: {target_type}") - if _nvjitlink: - self._exception_manager(lambda: _nvjitlink.complete(self._mnff.handle)) - if target_type == "cubin": - get_size = _nvjitlink.get_linked_cubin_size - get_code = _nvjitlink.get_linked_cubin + with _exception_manager(self): + if _nvjitlink: + _nvjitlink.complete(self._mnff.handle) + if target_type == "cubin": + get_size = _nvjitlink.get_linked_cubin_size + get_code = _nvjitlink.get_linked_cubin + else: + get_size = _nvjitlink.get_linked_ptx_size + get_code = _nvjitlink.get_linked_ptx + size = get_size(self._mnff.handle) + code = bytearray(size) + get_code(self._mnff.handle, code) else: - get_size = _nvjitlink.get_linked_ptx_size - get_code = _nvjitlink.get_linked_ptx - - size = self._exception_manager(lambda: get_size(self._mnff.handle)) - code = bytearray(size) - self._exception_manager(lambda: get_code(self._mnff.handle, code)) - else: - addr, size = self._exception_manager(lambda: handle_return(_driver.cuLinkComplete(self._mnff.handle))) - code = (ctypes.c_char * size).from_address(addr) + addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) + code = (ctypes.c_char * size).from_address(addr) return ObjectCode(bytes(code), target_type) From 06ee1e28e875e4eab869ce530fd00560ec010a8f Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 03:33:48 +0000 Subject: [PATCH 066/111] also guard the add_data calls with _exception_manager + add missing docstrings --- cuda_core/cuda/core/experimental/_linker.py | 49 ++++++++++++++------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index c27ea94d..a1f93e18 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -403,27 +403,28 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) - if _nvjitlink: - _nvjitlink.add_data( - self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), - data, - len(data), - f"{object_code._handle}_{object_code._code_type}", - ) - else: - handle_return( - _driver.cuLinkAddData( + with _exception_manager(self): + if _nvjitlink: + _nvjitlink.add_data( self._mnff.handle, self._input_type_from_code_type(object_code._code_type), data, len(data), - f"{object_code._handle}_{object_code._code_type}".encode(), - 0, - None, - None, + f"{object_code._handle}_{object_code._code_type}", + ) + else: + handle_return( + _driver.cuLinkAddData( + self._mnff.handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}".encode(), + 0, + None, + None, + ) ) - ) def link(self, target_type) -> ObjectCode: """ @@ -465,6 +466,12 @@ def link(self, target_type) -> ObjectCode: return ObjectCode(bytes(code), target_type) def get_error_log(self) -> str: + """ Get the error log generated by the linker. 
+ + Returns + ------- + The error log. + """ if _nvjitlink: log_size = _nvjitlink.get_error_log_size(self._mnff.handle) log = bytearray(log_size) @@ -474,6 +481,12 @@ def get_error_log(self) -> str: return log.decode() def get_info_log(self) -> str: + """Get the info log generated by the linker. + + Returns + ------- + The info log. + """ if _nvjitlink: log_size = _nvjitlink.get_info_log_size(self._mnff.handle) log = bytearray(log_size) @@ -492,8 +505,10 @@ def _input_type_from_code_type(self, code_type: str): return input_type @property - def handle(self) -> int: + def handle(self): + """Return the linker handle object.""" return self._mnff.handle def close(self): + """Destroy this linker.""" self._mnff.close() From faf4855b46d363715ae75921364464b5117cd9e4 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 03:36:11 +0000 Subject: [PATCH 067/111] add missing license header --- cuda_core/tests/test_linker.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 1af746f8..54cd8cf4 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import pytest from cuda.core.experimental import Linker, LinkerOptions, Program, _linker From 1c9dea6bfc3cb3112ab065366d1d7832f99478a4 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 04:17:40 +0000 Subject: [PATCH 068/111] improve docs --- cuda_core/cuda/core/experimental/_linker.py | 127 ++++++++----------- cuda_core/docs/source/release/0.1.1-notes.md | 9 +- 2 files changed, 58 insertions(+), 78 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index a1f93e18..09a237a4 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -77,118 +77,92 @@ def _lazy_init(): @dataclass class LinkerOptions: - """Customizable :obj:`LinkerOptions` for nvJitLink or driver API. Some options are only available - whenusing the cuda.bindings.nvjitlink backend. Some options are only available when using newer - or older versions of cuda. + """Customizable :obj:`Linker` options. + Since the linker would choose to use nvJitLink or the driver APIs as the linking backed, + not all options are applicable. Attributes ---------- arch : str - Pass SM architecture value. Can use compute_ value instead if only generating PTX. + Pass the SM architecture value, such as ``-arch=sm_`` (for generating CUBIN) or + ``compute_`` (for generating PTX). This is a required option. - Acceptable value type: str - Maps to: -arch=sm_ max_register_count : int, optional Maximum register count. - Default: None - Acceptable value type: int - Maps to: -maxrregcount= + Maps to: ``-maxrregcount=``. time : bool, optional - Print timing information to InfoLog. - Default: False - Acceptable value type: bool - Maps to: -time + Print timing information to the info log. + Maps to ``-time``. + Default: False. verbose : bool, optional - Print verbose messages to InfoLog. - Default: False - Acceptable value type: bool - Maps to: -verbose + Print verbose messages to the info log. + Maps to ``-verbose``. + Default: False. link_time_optimization : bool, optional Perform link time optimization. - Default: False - Acceptable value type: bool - Maps to: -lto + Maps to: ``-lto``. + Default: False. 
ptx : bool, optional - Emit PTX after linking instead of CUBIN; only supported with -lto. - Default: False - Acceptable value type: bool - Maps to: -ptx + Emit PTX after linking instead of CUBIN; only supported with ``-lto``. + Maps to ``-ptx``. + Default: False. optimization_level : int, optional Set optimization level. Only 0 and 3 are accepted. - Default: None - Acceptable value type: int - Maps to: -O + Maps to ``-O``. debug : bool, optional Generate debug information. - Default: False - Acceptable value type: bool - Maps to: -g + Maps to ``-g`` + Default: False. lineinfo : bool, optional Generate line information. - Default: False - Acceptable value type: bool - Maps to: -lineinfo + Maps to ``-lineinfo``. + Default: False. ftz : bool, optional Flush denormal values to zero. - Default: False - Acceptable value type: bool - Maps to: -ftz= + Maps to ``-ftz=``. + Default: False. prec_div : bool, optional Use precise division. - Default: True - Acceptable value type: bool - Maps to: -prec-div= + Maps to ``-prec-div=``. + Default: True. prec_sqrt : bool, optional Use precise square root. - Default: True - Acceptable value type: bool - Maps to: -prec-sqrt= + Maps to ``-prec-sqrt=``. + Default: True. fma : bool, optional Use fast multiply-add. - Default: True - Acceptable value type: bool - Maps to: -fma= + Maps to ``-fma=``. + Default: True. kernels_used : List[str], optional Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple times. - Default: None - Acceptable value type: list of str - Maps to: -kernels-used= + Maps to ``-kernels-used=``. variables_used : List[str], optional - Pass list of variables that are used; any not in the list can be removed. This option can be specified multiple - times. - Default: None - Acceptable value type: list of str - Maps to: -variables-used= + Pass a list of variables that are used; any not in the list can be removed. + Maps to ``-variables-used=`` optimize_unused_variables : bool, optional Assume that if a variable is not referenced in device code, it can be removed. - Default: False - Acceptable value type: bool - Maps to: -optimize-unused-variables + Maps to: ``-optimize-unused-variables`` + Default: False. xptxas : List[str], optional - Pass options to PTXAS. This option can be called multiple times. - Default: None - Acceptable value type: list of str - Maps to: -Xptxas= + Pass options to PTXAS. + Maps to: ``-Xptxas=``. split_compile : int, optional Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split compilation (default). - Default: 1 - Acceptable value type: int - Maps to: -split-compile= + Maps to ``-split-compile=``. + Default: 1. split_compile_extended : int, optional A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This option can potentially impact performance of the compiled binary. - Default: 1 - Acceptable value type: int - Maps to: -split-compile-extended= + Maps to ``-split-compile-extended=``. + Default: 1. no_cache : bool, optional Do not cache the intermediate steps of nvJitLink. - Default: False - Acceptable value type: bool - Maps to: -no-cache + Maps to ``-no-cache``. + Default: False. """ arch: str @@ -351,8 +325,11 @@ def _exception_manager(self): class Linker: - """ - Linker class for managing the linking of object codes with specified options. 
+ """Represent a linking machinery to link one or multiple object codes into + :obj:`~cuda.core.experimental._module.ObjectCode` with the specified options. + + This object provides a unified interface to multiple underlying + linker libraries (such as nvJitLink or cuLink* from CUDA driver). Parameters ---------- @@ -442,7 +419,7 @@ def link(self, target_type) -> ObjectCode: Note ------ - See nvrtc compiler options documnetation to ensure the input ObjectCodes are + See nvrtc compiler options documnetation to ensure the input object codes are correctly compiled for linking. """ if target_type not in ("cubin", "ptx"): @@ -470,7 +447,8 @@ def get_error_log(self) -> str: Returns ------- - The error log. + str + The error log. """ if _nvjitlink: log_size = _nvjitlink.get_error_log_size(self._mnff.handle) @@ -485,7 +463,8 @@ def get_info_log(self) -> str: Returns ------- - The info log. + str + The info log. """ if _nvjitlink: log_size = _nvjitlink.get_info_log_size(self._mnff.handle) diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md index cd3530b9..34cad7d1 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.md +++ b/cuda_core/docs/source/release/0.1.1-notes.md @@ -1,13 +1,14 @@ # `cuda.core` Release notes -Released on Nov , 2024 +Released on Dec XX, 2024 ## Hightlights - Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete implementation of DLPack & CUDA Array Interface supports. -- Addition of the Linker class which gives object oriented and pythonic access to the nvJitLink or cuLink API - depending on your CUDA version. +- Add `Linker` that can link one or multiple `ObjectCode` instances generated by `Program`s. Under + the hood, it uses either the nvJitLink or cuLink APIs depending on the CUDA version detected + in the current environment. - Support TCC devices with a default synchronous memory resource to avoid the use of memory pools @@ -15,6 +16,6 @@ Released on Nov , 2024 - All APIs are currently *experimental* and subject to change without deprecation notice. Please kindly share your feedbacks with us so that we can make `cuda.core` better! -- Some LinkerOptions are only available when using a modern version of CUDA. When using CUDA <12, +- Some `LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12, the backend is the cuLink api which supports only a subset of the options that nvjitlink does. 
Further, some options aren't available on CUDA versions <12.6 From 1a3f1e64f55b2f0f4840ed547c31efa57c775c97 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 06:22:27 +0000 Subject: [PATCH 069/111] fix docs --- cuda_core/cuda/core/experimental/__init__.py | 6 +- cuda_core/cuda/core/experimental/_linker.py | 2 +- cuda_core/cuda/core/experimental/_system.py | 139 +++++++++---------- cuda_core/docs/source/api.rst | 12 +- cuda_core/docs/source/conf.py | 17 +++ cuda_core/docs/source/release.md | 1 - cuda_core/tests/test_system.py | 71 +++++----- 7 files changed, 138 insertions(+), 110 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 982226c7..15df70bb 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -9,4 +9,8 @@ from cuda.core.experimental._linker import Linker, LinkerOptions from cuda.core.experimental._program import Program from cuda.core.experimental._stream import Stream, StreamOptions -from cuda.core.experimental._system import system +from cuda.core.experimental._system import System + +system = System() +__import__("sys").modules[__spec__.name + ".system"] = system +del System diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 09a237a4..2beeb168 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -443,7 +443,7 @@ def link(self, target_type) -> ObjectCode: return ObjectCode(bytes(code), target_type) def get_error_log(self) -> str: - """ Get the error log generated by the linker. + """Get the error log generated by the linker. Returns ------- diff --git a/cuda_core/cuda/core/experimental/_system.py b/cuda_core/cuda/core/experimental/_system.py index 258f9bcd..31c7af6f 100644 --- a/cuda_core/cuda/core/experimental/_system.py +++ b/cuda_core/cuda/core/experimental/_system.py @@ -1,72 +1,67 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. -# -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -from typing import Tuple - -from cuda import cuda, cudart -from cuda.core.experimental._device import Device -from cuda.core.experimental._utils import handle_return - - -class System: - """ Provide information about the cuda system. - This class is a singleton and should not be instantiated directly. - """ - - _instance = None - - def __new__(cls): - if cls._instance is None: - cls._instance = super().__new__(cls) - return cls._instance - - def __init__(self): - if hasattr(self, '_initialized') and self._initialized: - return - self._initialized = True - - @property - def driver_version(self) -> Tuple[int, int]: - """ - Query the CUDA driver version. - - Returns - ------- - tuple of int - A 2-tuple of (major, minor) version numbers. - """ - version = handle_return(cuda.cuDriverGetVersion()) - major = version // 1000 - minor = (version % 1000) // 10 - return (major, minor) - - @property - def num_devices(self) -> int: - """ - Query the number of available GPUs. - - Returns - ------- - int - The number of available GPU devices. - """ - return handle_return(cudart.cudaGetDeviceCount()) - - @property - def devices(self) -> tuple: - """ - Query the available device instances. - - Returns - ------- - tuple of Device - A tuple containing instances of available devices. 
- """ - total = self.num_devices - return tuple(Device(device_id) for device_id in range(total)) - -system = System() -system.__doc__ = """ -Singleton instance of the :obj:`_system.System` class. -""" +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +from typing import Tuple + +from cuda import cuda, cudart +from cuda.core.experimental._device import Device +from cuda.core.experimental._utils import handle_return + + +class System: + """Provide information about the cuda system. + This class is a singleton and should not be instantiated directly. + """ + + _instance = None + + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __init__(self): + if hasattr(self, "_initialized") and self._initialized: + return + self._initialized = True + + @property + def driver_version(self) -> Tuple[int, int]: + """ + Query the CUDA driver version. + + Returns + ------- + tuple of int + A 2-tuple of (major, minor) version numbers. + """ + version = handle_return(cuda.cuDriverGetVersion()) + major = version // 1000 + minor = (version % 1000) // 10 + return (major, minor) + + @property + def num_devices(self) -> int: + """ + Query the number of available GPUs. + + Returns + ------- + int + The number of available GPU devices. + """ + return handle_return(cudart.cudaGetDeviceCount()) + + @property + def devices(self) -> tuple: + """ + Query the available device instances. + + Returns + ------- + tuple of Device + A tuple containing instances of available devices. + """ + total = self.num_devices + return tuple(Device(device_id) for device_id in range(total)) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index bd63f0f0..4b30c6ef 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -16,7 +16,6 @@ CUDA runtime Device launch - system :template: dataclass.rst @@ -39,6 +38,17 @@ CUDA compilation toolchain LinkerOptions +CUDA system information +----------------------- + +.. autodata:: cuda.core.experimental.system.driver_version + :no-value: +.. autodata:: cuda.core.experimental.system.num_devices + :no-value: +.. autodata:: cuda.core.experimental.system.devices + :no-value: + + .. 
module:: cuda.core.experimental.utils Utility functions diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index 4621e887..3a7afc09 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -91,3 +91,20 @@ napoleon_google_docstring = False napoleon_numpy_docstring = True + + +def autodoc_process_docstring(app, what, name, obj, options, lines): + if name.startswith("cuda.core.experimental.system"): + # patch the docstring (in lines) *in-place* + attr = name.split(".")[-1] + from cuda.core.experimental._system import System + + lines_new = getattr(System, attr).__doc__.split("\n") + n_pops = len(lines) + lines.extend(lines_new) + for _ in range(n_pops): + lines.pop(0) + + +def setup(app): + app.connect("autodoc-process-docstring", autodoc_process_docstring) diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 11accb59..a9e16d6e 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -7,6 +7,5 @@ maxdepth: 3 0.1.1 0.1.0 - 0.1.1 ``` diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py index 893d1206..7a39388f 100644 --- a/cuda_core/tests/test_system.py +++ b/cuda_core/tests/test_system.py @@ -1,34 +1,37 @@ -try: - from cuda.bindings import driver, runtime -except ImportError: - from cuda import cuda as driver - from cuda import cudart as runtime - -from cuda.core.experimental import Device, system -from cuda.core.experimental._utils import handle_return - - -def test_system_singleton(): - system1 = system - system2 = system - assert id(system1) == id(system2), "system is not a singleton" - -def test_driver_version(): - driver_version = system.driver_version - print(driver_version) - version = handle_return(driver.cuDriverGetVersion()) - expected_driver_version = (version // 1000, (version % 1000) // 10) - assert driver_version == expected_driver_version, "Driver version does not match expected value" - -def test_num_devices(): - num_devices = system.num_devices - expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) - assert num_devices == expected_num_devices, "Number of devices does not match expected value" - -def test_devices(): - devices = system.devices - expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) - expected_devices = tuple(Device(device_id) for device_id in range(expected_num_devices)) - assert len(devices) == len(expected_devices), "Number of devices does not match expected value" - for device, expected_device in zip(devices, expected_devices): - assert device.device_id == expected_device.device_id, "Device ID does not match expected value" +try: + from cuda.bindings import driver, runtime +except ImportError: + from cuda import cuda as driver + from cuda import cudart as runtime + +from cuda.core.experimental import Device, system +from cuda.core.experimental._utils import handle_return + + +def test_system_singleton(): + system1 = system + system2 = system + assert id(system1) == id(system2), "system is not a singleton" + + +def test_driver_version(): + driver_version = system.driver_version + print(driver_version) + version = handle_return(driver.cuDriverGetVersion()) + expected_driver_version = (version // 1000, (version % 1000) // 10) + assert driver_version == expected_driver_version, "Driver version does not match expected value" + + +def test_num_devices(): + num_devices = system.num_devices + expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) + assert num_devices == expected_num_devices, 
"Number of devices does not match expected value" + + +def test_devices(): + devices = system.devices + expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) + expected_devices = tuple(Device(device_id) for device_id in range(expected_num_devices)) + assert len(devices) == len(expected_devices), "Number of devices does not match expected value" + for device, expected_device in zip(devices, expected_devices): + assert device.device_id == expected_device.device_id, "Device ID does not match expected value" From aeebaf757808cad15ed9c241e011457a9bfa97e4 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 15:11:53 -0500 Subject: [PATCH 070/111] skip testing on win; remove mac --- .github/workflows/gh-build-and-test.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 913b17fd..2bd7ec97 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -45,7 +45,8 @@ jobs: secrets: inherit test: - if: ${{ github.repository_owner == 'nvidia' }} + if: ${{ (github.repository_owner == 'nvidia') && + !startsWith(inputs.host-platform, 'win) }} needs: - build uses: @@ -54,7 +55,8 @@ jobs: client-repo: ${{ github.event.repository.name }} target-device: ${{ inputs.target-device }} test-options: ${{ inputs.build-type }} - runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} + runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || + (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') }} runner-has-gpu: ${{ inputs.host-platform == 'linux-x64' }} build-type: ${{ inputs.build-type }} host-platform: ${{ inputs.host-platform }} From 76a8822fb5df845bb3a0e15c8407c651dc9a2e89 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 15:15:06 -0500 Subject: [PATCH 071/111] fix typo --- .github/workflows/gh-build-and-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 2bd7ec97..ab32a62a 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -46,7 +46,7 @@ jobs: test: if: ${{ (github.repository_owner == 'nvidia') && - !startsWith(inputs.host-platform, 'win) }} + !startsWith(inputs.host-platform, 'win') }} needs: - build uses: From c23467f24badb0424648b6c9ee89c5913bdd570a Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 15:52:45 -0500 Subject: [PATCH 072/111] skip setup if build stage was called --- .github/workflows/gh-test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/gh-test.yml b/.github/workflows/gh-test.yml index 74f1c520..01eae085 100644 --- a/.github/workflows/gh-test.yml +++ b/.github/workflows/gh-test.yml @@ -39,6 +39,10 @@ on: python-version: required: false type: string + has-built: + required: false + type: boolean + description: whether the built stage was launched (and passed) jobs: build: @@ -63,6 +67,7 @@ jobs: fetch-depth: 0 - name: Setup + if: ${{ !inputs.has-built }} uses: ./.github/actions/setup with: client-repo: ${{ inputs.client-repo }} From 3d892ad9546187edf8cbe23386e1cdf1ac16ac48 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 16:05:25 -0500 Subject: [PATCH 073/111] set build output --- 
.github/actions/build/action.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index e1552ae8..48c4e50a 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -16,6 +16,10 @@ inputs: upload-enabled: required: true type: boolean +outputs: + has-built: + value: true # TODO: we might need to check the job success here + description: whether the built stage was launched (and passed) runs: using: composite From ae0a994416edb6d63f58fce1c2e41ac32a6c44ea Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 16:09:35 -0500 Subject: [PATCH 074/111] pass output from build to test --- .github/workflows/gh-build-and-test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index ab32a62a..f9606ba7 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -64,4 +64,5 @@ jobs: build-mode: ${{ inputs.build-mode }} upload-enabled: ${{ inputs.upload-enabled }} python-version: ${{ inputs.python-version }} + has-built: ${{ needs.build.outputs.has-built }} secrets: inherit From c3fe6a14881d4bc1b968e1394ca6e05ca4009380 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 16:25:03 -0500 Subject: [PATCH 075/111] wrong place --- .github/actions/build/action.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index 48c4e50a..e1552ae8 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -16,10 +16,6 @@ inputs: upload-enabled: required: true type: boolean -outputs: - has-built: - value: true # TODO: we might need to check the job success here - description: whether the built stage was launched (and passed) runs: using: composite From 0bbc706c5fe61e59bda8bfd0091fd29484203b38 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 16:26:18 -0500 Subject: [PATCH 076/111] it's the build workflow, not action, that should have outputs --- .github/workflows/gh-build.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/gh-build.yml b/.github/workflows/gh-build.yml index 7a9f03ce..c83fb00a 100644 --- a/.github/workflows/gh-build.yml +++ b/.github/workflows/gh-build.yml @@ -35,6 +35,10 @@ on: cuda-version: required: true type: string + outputs: + has-built: + value: true # TODO: we might need to check the job success here + description: whether the built stage was launched (and passed) jobs: build: From ba0bbdedcdaed83b182af9f62489cd84226e1f5c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 16:28:47 -0500 Subject: [PATCH 077/111] fix indentation --- .github/workflows/gh-build.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gh-build.yml b/.github/workflows/gh-build.yml index c83fb00a..86b77516 100644 --- a/.github/workflows/gh-build.yml +++ b/.github/workflows/gh-build.yml @@ -35,10 +35,10 @@ on: cuda-version: required: true type: string - outputs: - has-built: - value: true # TODO: we might need to check the job success here - description: whether the built stage was launched (and passed) + outputs: + has-built: + value: true # TODO: we might need to check the job success here + description: whether the built stage was launched (and passed) jobs: build: From aed5bb6b1aa04c15633d6f68c841ee98b0294162 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 18:02:09 -0500 Subject: 
[PATCH 078/111] try to take output as a string --- .github/workflows/gh-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gh-test.yml b/.github/workflows/gh-test.yml index 01eae085..b0b94514 100644 --- a/.github/workflows/gh-test.yml +++ b/.github/workflows/gh-test.yml @@ -41,7 +41,7 @@ on: type: string has-built: required: false - type: boolean + type: string description: whether the built stage was launched (and passed) jobs: @@ -67,7 +67,7 @@ jobs: fetch-depth: 0 - name: Setup - if: ${{ !inputs.has-built }} + if: ${{ inputs.has-built == 'true' }} uses: ./.github/actions/setup with: client-repo: ${{ inputs.client-repo }} From 330251d90c003df945ac4725df1cedb6b3f257e2 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 18:12:52 -0500 Subject: [PATCH 079/111] fix logic --- .github/workflows/gh-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-test.yml b/.github/workflows/gh-test.yml index b0b94514..575ccada 100644 --- a/.github/workflows/gh-test.yml +++ b/.github/workflows/gh-test.yml @@ -67,7 +67,7 @@ jobs: fetch-depth: 0 - name: Setup - if: ${{ inputs.has-built == 'true' }} + if: ${{ inputs.has-built != 'true' }} uses: ./.github/actions/setup with: client-repo: ${{ inputs.client-repo }} From 076104bbb52e6c3b06c5eb3ac00cfb8ecf6235c7 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 23:36:56 +0000 Subject: [PATCH 080/111] multiple fixes - fix artifact env vars - runner must have GPUs for testing - shorten workflow names --- .github/actions/test/action.yml | 19 +++++++------------ .github/workflows/ci-gh.yml | 3 +-- .github/workflows/gh-build-and-test.yml | 9 +++++---- .github/workflows/gh-build.yml | 2 +- .github/workflows/gh-test.yml | 7 +------ 5 files changed, 15 insertions(+), 25 deletions(-) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 018db9aa..675263fb 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -6,42 +6,37 @@ inputs: test-options: required: true type: string - runner-has-gpu: - required: true - type: boolean - description: "The runner has GPU(s)." 
runs: using: composite steps: - - if: ${{ inputs.runner-has-gpu == true }} - name: Run nvidia-smi to make sure GPU is working + - name: Run nvidia-smi to make sure GPU is working shell: bash --noprofile --norc -xeuo pipefail {0} run: nvidia-smi - name: Download bindings build artifacts uses: actions/download-artifact@v4 with: - name: ${{ env.BINDINGS_ARTIFACT_NAME }} - path: ${{ env.BINDINGS_ARTIFACTS_DIR }} + name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }} + path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} - name: Display structure of downloaded bindings artifacts shell: bash --noprofile --norc -xeuo pipefail {0} run: | pwd - ls -lahR $BINDINGS_ARTIFACTS_DIR + ls -lahR $CUDA_BINDINGS_ARTIFACTS_DIR - name: Download core build artifacts uses: actions/download-artifact@v4 with: - name: ${{ env.CORE_ARTIFACT_NAME }} - path: ${{ env.CORE_ARTIFACTS_DIR }} + name: ${{ env.CUDA_CORE_ARTIFACT_NAME }} + path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - name: Display structure of downloaded core build artifacts shell: bash --noprofile --norc -xeuo pipefail {0} run: | pwd - ls -lahR $CORE_ARTIFACTS_DIR + ls -lahR $CUDA_CORE_ARTIFACTS_DIR - name: Run test / analysis shell: bash --noprofile --norc -xeuo pipefail {0} diff --git a/.github/workflows/ci-gh.yml b/.github/workflows/ci-gh.yml index 1975c3b5..7c493505 100644 --- a/.github/workflows/ci-gh.yml +++ b/.github/workflows/ci-gh.yml @@ -11,8 +11,7 @@ on: - "main" jobs: - build-and-test: - name: Build and test (${{ matrix.host-platform }}, ${{ matrix.target-device }}, ${{ matrix.build-mode }}) + ci: strategy: fail-fast: false matrix: diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index f9606ba7..0f10fcff 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -25,6 +25,7 @@ on: jobs: build: + name: Build wheels if: ${{ github.repository_owner == 'nvidia' }} uses: ./.github/workflows/gh-build.yml @@ -45,8 +46,10 @@ jobs: secrets: inherit test: + name: Test against wheels + # TODO: enable testing once linux-aarch64 & win-64 GPU runners are up if: ${{ (github.repository_owner == 'nvidia') && - !startsWith(inputs.host-platform, 'win') }} + startsWith(inputs.host-platform, 'linux-x64') }} needs: - build uses: @@ -55,9 +58,7 @@ jobs: client-repo: ${{ github.event.repository.name }} target-device: ${{ inputs.target-device }} test-options: ${{ inputs.build-type }} - runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || - (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') }} - runner-has-gpu: ${{ inputs.host-platform == 'linux-x64' }} + runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') }} build-type: ${{ inputs.build-type }} host-platform: ${{ inputs.host-platform }} dependencies-file: "" diff --git a/.github/workflows/gh-build.yml b/.github/workflows/gh-build.yml index 86b77516..46026ba9 100644 --- a/.github/workflows/gh-build.yml +++ b/.github/workflows/gh-build.yml @@ -42,7 +42,7 @@ on: jobs: build: - name: Build (${{ inputs.host-platform }}, ${{ inputs.build-type }}, ${{ inputs.build-mode }}, Python "${{ inputs.python-version }}") + name: Build (${{ inputs.host-platform }}, Python "${{ inputs.python-version }}") permissions: id-token: write # This is required for configure-aws-credentials diff --git a/.github/workflows/gh-test.yml b/.github/workflows/gh-test.yml index 575ccada..8216ce10 100644 --- a/.github/workflows/gh-test.yml +++ b/.github/workflows/gh-test.yml @@ -15,10 +15,6 @@ on: runs-on: 
required: true type: string - runner-has-gpu: - required: true - type: boolean - description: "The runner has GPU(s)." build-type: required: true type: string @@ -46,7 +42,7 @@ on: jobs: build: - name: Test (${{ inputs.host-platform }}, ${{ inputs.target-device }}, ${{ inputs.build-type }}, CMake build-mode=${{ inputs.build-mode }}, Python "${{ inputs.python-version }}", Use container=${{ inputs.use-container }} ) + name: Test (${{ inputs.host-platform }}, Python "${{ inputs.python-version }}", Use container=${{ inputs.use-container }} ) permissions: id-token: write # This is required for configure-aws-credentials @@ -82,4 +78,3 @@ jobs: uses: ./.github/actions/test with: test-options: ${{ inputs.test-options }} - runner-has-gpu: ${{ inputs.runner-has-gpu }} From 7c6fba04dc68313cca9f84cece8b588db166200a Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 00:41:48 +0000 Subject: [PATCH 081/111] merge build & test workflows to allow passing env vars; further simplify job names --- .github/workflows/ci-gh.yml | 3 +- .github/workflows/gh-build-and-test.yml | 101 ++++++++++++++++-------- .github/workflows/gh-build.yml | 77 ------------------ .github/workflows/gh-test.yml | 80 ------------------- 4 files changed, 69 insertions(+), 192 deletions(-) delete mode 100644 .github/workflows/gh-build.yml delete mode 100644 .github/workflows/gh-test.yml diff --git a/.github/workflows/ci-gh.yml b/.github/workflows/ci-gh.yml index 7c493505..31446beb 100644 --- a/.github/workflows/ci-gh.yml +++ b/.github/workflows/ci-gh.yml @@ -1,5 +1,3 @@ -name: Build and test - concurrency: group: ${{ startsWith(github.ref_name, 'main') && format('unique-{0}', github.run_id) || format('ci-build-and-test-on-{0}-from-{1}', github.event_name, github.ref_name) }} cancel-in-progress: true @@ -12,6 +10,7 @@ on: jobs: ci: + name: "CI" strategy: fail-fast: false matrix: diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 0f10fcff..f7296823 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -25,45 +25,80 @@ on: jobs: build: - name: Build wheels + name: Build (${{ inputs.host-platform }}, Python "${{ inputs.python-version }}") if: ${{ github.repository_owner == 'nvidia' }} - uses: - ./.github/workflows/gh-build.yml - with: - client-repo: ${{ github.event.repository.name }} - target-device: ${{ inputs.target-device }} - runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu8') || - (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || - (inputs.host-platform == 'win-x64' && 'windows-2019') }} - # (inputs.host-platform == 'win-x64' && 'windows-amd64-cpu8') }} - build-type: ${{ inputs.build-type }} - host-platform: ${{ inputs.host-platform }} - build-mode: ${{ inputs.build-mode }} - upload-enabled: ${{ inputs.upload-enabled }} - python-version: ${{ inputs.python-version }} - cuda-version: ${{ inputs.cuda-version }} - dependencies-file: "" + permissions: + id-token: write # This is required for configure-aws-credentials + contents: read # This is required for actions/checkout + runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu8') || + (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || + (inputs.host-platform == 'win-x64' && 'windows-2019') }} + # (inputs.host-platform == 'win-x64' && 'windows-amd64-cpu8') }} secrets: inherit + steps: + - name: Checkout ${{ github.event.repository.name }} + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up build 
environment + uses: ./.github/actions/setup + with: + client-repo: ${{ github.event.repository.name }} + build-type: ${{ inputs.build-type }} + target-device: "${{ inputs.target-device }}" + host-platform: ${{ inputs.host-platform }} + build-mode: ${{ inputs.build-mode }} + upload-enabled: ${{ inputs.upload-enabled }} + python-version: ${{ inputs.python-version }} + cuda-version: ${{ inputs.cuda-version }} + + - name: Call build action + uses: ./.github/actions/build + with: + build-type: ${{ inputs.build-type }} + target-device: "${{ inputs.target-device }}" + host-platform: ${{ inputs.host-platform }} + upload-enabled: ${{ inputs.upload-enabled }} test: - name: Test against wheels + # TODO: improve the name once a separate test matrix is defined + name: Test (CUDA ${{ inputs.cuda-version }}, Use container=${{ inputs.use-container }}) # TODO: enable testing once linux-aarch64 & win-64 GPU runners are up if: ${{ (github.repository_owner == 'nvidia') && startsWith(inputs.host-platform, 'linux-x64') }} + permissions: + id-token: write # This is required for configure-aws-credentials + contents: read # This is required for actions/checkout + runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') }} + secrets: inherit + container: + options: -u root --security-opt seccomp=unconfined --privileged --shm-size 16g + image: condaforge/miniforge3:latest + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} needs: - build - uses: - ./.github/workflows/gh-test.yml - with: - client-repo: ${{ github.event.repository.name }} - target-device: ${{ inputs.target-device }} - test-options: ${{ inputs.build-type }} - runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') }} - build-type: ${{ inputs.build-type }} - host-platform: ${{ inputs.host-platform }} - dependencies-file: "" - build-mode: ${{ inputs.build-mode }} - upload-enabled: ${{ inputs.upload-enabled }} - python-version: ${{ inputs.python-version }} - has-built: ${{ needs.build.outputs.has-built }} - secrets: inherit + steps: + - name: Checkout ${{ github.event.repository.name }} + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + # TODO: we probably don't need this? 
+ # - name: Setup + # if: ${{ inputs.has-built != 'true' }} + # uses: ./.github/actions/setup + # with: + # client-repo: ${{ github.event.repository.name }} + # build-type: ${{ inputs.build-type }} + # target-device: "${{ inputs.target-device }}" + # host-platform: ${{ inputs.host-platform }} + # build-mode: ${{ inputs.build-mode }} + # upload-enabled: ${{ inputs.upload-enabled }} + # python-version: ${{ inputs.python-version }} + + - name: Call test action + uses: ./.github/actions/test + with: + test-options: ${{ inputs.build-type }} diff --git a/.github/workflows/gh-build.yml b/.github/workflows/gh-build.yml deleted file mode 100644 index 46026ba9..00000000 --- a/.github/workflows/gh-build.yml +++ /dev/null @@ -1,77 +0,0 @@ -name: Build - -on: - workflow_call: - inputs: - client-repo: - required: true - type: string - target-device: - required: true - type: string - runs-on: - required: true - type: string - build-type: - required: true - type: string - description: One of ci / release - host-platform: - required: true - type: string - dependencies-file: - required: true - type: string - description: path to versions.json relative to the target repo dir - build-mode: - required: true - type: string - upload-enabled: - required: true - type: boolean - python-version: - required: true - type: string - cuda-version: - required: true - type: string - outputs: - has-built: - value: true # TODO: we might need to check the job success here - description: whether the built stage was launched (and passed) - -jobs: - build: - name: Build (${{ inputs.host-platform }}, Python "${{ inputs.python-version }}") - - permissions: - id-token: write # This is required for configure-aws-credentials - contents: read # This is required for actions/checkout - - runs-on: ${{ inputs.runs-on }} - - steps: - - name: Checkout ${{ inputs.client-repo }} - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Set up build environment - uses: ./.github/actions/setup - with: - client-repo: ${{ inputs.client-repo }} - build-type: ${{ inputs.build-type }} - target-device: "${{ inputs.target-device }}" - host-platform: ${{ inputs.host-platform }} - build-mode: ${{ inputs.build-mode }} - upload-enabled: ${{ inputs.upload-enabled }} - python-version: ${{ inputs.python-version }} - cuda-version: ${{ inputs.cuda-version }} - - - name: Call build action - uses: ./.github/actions/build - with: - build-type: ${{ inputs.build-type }} - target-device: "${{ inputs.target-device }}" - host-platform: ${{ inputs.host-platform }} - upload-enabled: ${{ inputs.upload-enabled }} diff --git a/.github/workflows/gh-test.yml b/.github/workflows/gh-test.yml deleted file mode 100644 index 8216ce10..00000000 --- a/.github/workflows/gh-test.yml +++ /dev/null @@ -1,80 +0,0 @@ -name: Test - -on: - workflow_call: - inputs: - client-repo: - required: true - type: string - target-device: - required: true - type: string - test-options: - required: true - type: string - runs-on: - required: true - type: string - build-type: - required: true - type: string - description: One of ci / release - host-platform: - required: true - type: string - dependencies-file: - required: true - type: string - description: path to versions.json relative to the target repo dir - build-mode: - required: true - type: string - upload-enabled: - required: true - type: boolean - python-version: - required: false - type: string - has-built: - required: false - type: string - description: whether the built stage was launched (and passed) - -jobs: - build: - name: Test 
(${{ inputs.host-platform }}, Python "${{ inputs.python-version }}", Use container=${{ inputs.use-container }} ) - - permissions: - id-token: write # This is required for configure-aws-credentials - contents: read # This is required for actions/checkout - - runs-on: ${{ inputs.runs-on }} - - container: - options: -u root --security-opt seccomp=unconfined --privileged --shm-size 16g - image: condaforge/miniforge3:latest - env: - NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} - - steps: - - name: Checkout ${{ inputs.client-repo }} - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Setup - if: ${{ inputs.has-built != 'true' }} - uses: ./.github/actions/setup - with: - client-repo: ${{ inputs.client-repo }} - build-type: ${{ inputs.build-type }} - target-device: "${{ inputs.target-device }}" - host-platform: ${{ inputs.host-platform }} - build-mode: ${{ inputs.build-mode }} - upload-enabled: ${{ inputs.upload-enabled }} - python-version: ${{ inputs.python-version }} - - - name: Call test action - uses: ./.github/actions/test - with: - test-options: ${{ inputs.test-options }} From ca7b437189395454d8bc09ca5bae79b7862dff5c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 00:49:41 +0000 Subject: [PATCH 082/111] no need to pass secrets as we don't have reusable workflows anymore --- .github/workflows/gh-build-and-test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index f7296823..c9247f4e 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -34,7 +34,6 @@ jobs: (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || (inputs.host-platform == 'win-x64' && 'windows-2019') }} # (inputs.host-platform == 'win-x64' && 'windows-amd64-cpu8') }} - secrets: inherit steps: - name: Checkout ${{ github.event.repository.name }} uses: actions/checkout@v4 @@ -71,7 +70,6 @@ jobs: id-token: write # This is required for configure-aws-credentials contents: read # This is required for actions/checkout runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') }} - secrets: inherit container: options: -u root --security-opt seccomp=unconfined --privileged --shm-size 16g image: condaforge/miniforge3:latest From 5d0b014250d5313d2dffc40a5aba0ece12e705d2 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 01:13:24 +0000 Subject: [PATCH 083/111] pass job outputs explicitly... 
--- .github/workflows/ci-gh.yml | 3 ++- .github/workflows/gh-build-and-test.yml | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-gh.yml b/.github/workflows/ci-gh.yml index 31446beb..189258eb 100644 --- a/.github/workflows/ci-gh.yml +++ b/.github/workflows/ci-gh.yml @@ -1,3 +1,5 @@ +name: "CI" + concurrency: group: ${{ startsWith(github.ref_name, 'main') && format('unique-{0}', github.run_id) || format('ci-build-and-test-on-{0}-from-{1}', github.event_name, github.ref_name) }} cancel-in-progress: true @@ -10,7 +12,6 @@ on: jobs: ci: - name: "CI" strategy: fail-fast: false matrix: diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index c9247f4e..a7d2919a 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -60,6 +60,19 @@ jobs: host-platform: ${{ inputs.host-platform }} upload-enabled: ${{ inputs.upload-enabled }} + - name: Pass environment variables + id: pass_env + run: | + echo "CUDA_CORE_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_NAME}" >> $GITHUB_OUTPUT + echo "CUDA_CORE_ARTIFACTS_DIR=${CUDA_CORE_ARTIFACTS_DIR}" >> $GITHUB_OUTPUT + echo "CUDA_BINDINGS_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_NAME}" >> $GITHUB_OUTPUT + echo "CUDA_BINDINGS_ARTIFACTS_DIR=${CUDA_CORE_ARTIFACTS_DIR}" >> $GITHUB_OUTPUT + outputs: + CUDA_CORE_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACT_NAME }} + CUDA_CORE_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACTS_DIR }} + CUDA_BINDINGS_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} + CUDA_BINDINGS_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} + test: # TODO: improve the name once a separate test matrix is defined name: Test (CUDA ${{ inputs.cuda-version }}, Use container=${{ inputs.use-container }}) @@ -100,3 +113,8 @@ jobs: uses: ./.github/actions/test with: test-options: ${{ inputs.build-type }} + env: + CUDA_CORE_ARTIFACT_NAME: ${{ needs.build.outputs.CUDA_CORE_ARTIFACT_NAME }} + CUDA_CORE_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_CORE_ARTIFACTS_DIR }} + CUDA_BINDINGS_ARTIFACT_NAME: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} + CUDA_BINDINGS_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} From 7350965130c2172d59657515c7a1e834e859198c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 01:16:35 +0000 Subject: [PATCH 084/111] try changing the order --- .github/workflows/gh-build-and-test.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index a7d2919a..5c1f1f29 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -34,6 +34,11 @@ jobs: (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || (inputs.host-platform == 'win-x64' && 'windows-2019') }} # (inputs.host-platform == 'win-x64' && 'windows-amd64-cpu8') }} + outputs: + CUDA_CORE_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACT_NAME }} + CUDA_CORE_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACTS_DIR }} + CUDA_BINDINGS_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} + CUDA_BINDINGS_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} steps: - name: Checkout ${{ github.event.repository.name }} uses: actions/checkout@v4 @@ -65,13 +70,8 @@ jobs: run: | echo "CUDA_CORE_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_NAME}" >> 
$GITHUB_OUTPUT echo "CUDA_CORE_ARTIFACTS_DIR=${CUDA_CORE_ARTIFACTS_DIR}" >> $GITHUB_OUTPUT - echo "CUDA_BINDINGS_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_NAME}" >> $GITHUB_OUTPUT - echo "CUDA_BINDINGS_ARTIFACTS_DIR=${CUDA_CORE_ARTIFACTS_DIR}" >> $GITHUB_OUTPUT - outputs: - CUDA_CORE_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACT_NAME }} - CUDA_CORE_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACTS_DIR }} - CUDA_BINDINGS_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} - CUDA_BINDINGS_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} + echo "CUDA_BINDINGS_ARTIFACT_NAME=${CUDA_BINDINGS_ARTIFACT_NAME}" >> $GITHUB_OUTPUT + echo "CUDA_BINDINGS_ARTIFACTS_DIR=${CUDA_BINDINGS_ARTIFACTS_DIR}" >> $GITHUB_OUTPUT test: # TODO: improve the name once a separate test matrix is defined From eedffd9776bfe63c3967f089803e480d3017c37d Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 01:22:14 +0000 Subject: [PATCH 085/111] fix syntax --- .github/workflows/gh-build-and-test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 5c1f1f29..e7659a6f 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -35,10 +35,10 @@ jobs: (inputs.host-platform == 'win-x64' && 'windows-2019') }} # (inputs.host-platform == 'win-x64' && 'windows-amd64-cpu8') }} outputs: - CUDA_CORE_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACT_NAME }} - CUDA_CORE_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACTS_DIR }} - CUDA_BINDINGS_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} - CUDA_BINDINGS_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} + CUDA_CORE_ARTIFACT_NAME: ${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACT_NAME }} + CUDA_CORE_ARTIFACTS_DIR: ${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACTS_DIR }} + CUDA_BINDINGS_ARTIFACT_NAME: ${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} + CUDA_BINDINGS_ARTIFACTS_DIR: ${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} steps: - name: Checkout ${{ github.event.repository.name }} uses: actions/checkout@v4 From c7d3e0322c2c9a077d8a657de2cf946345b0cd4e Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 01:37:29 +0000 Subject: [PATCH 086/111] fix workflow merge error --- .github/actions/test/action.yml | 2 +- .github/workflows/ci-gh.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 675263fb..edfcee3b 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -41,4 +41,4 @@ runs: - name: Run test / analysis shell: bash --noprofile --norc -xeuo pipefail {0} run: | - "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint" "${{ env.REPO_DIR }}/continuous_integration/scripts/test" ${{ inputs.test-options }} + "./continuous_integration/scripts/entrypoint" "./continuous_integration/scripts/test" ${{ inputs.test-options }} diff --git a/.github/workflows/ci-gh.yml b/.github/workflows/ci-gh.yml index 189258eb..cb27c879 100644 --- a/.github/workflows/ci-gh.yml +++ b/.github/workflows/ci-gh.yml @@ -34,6 +34,7 @@ jobs: # Note: this is for build-time only; the test-time matrix needs to be # defined separately. 
- "12.6.2" + name: "CI" uses: ./.github/workflows/gh-build-and-test.yml with: From 35c244f3833d9e749d414e22710e2c091330832c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 03:00:52 +0000 Subject: [PATCH 087/111] allow mini-ctk to be cached & reused in tests --- .github/actions/setup/action.yml | 40 ++++++++++++++++++++ .github/actions/test/action.yml | 49 ++++++++++++++++++++++++- .github/workflows/gh-build-and-test.yml | 2 + 3 files changed, 90 insertions(+), 1 deletion(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index e00cf27f..bed73c57 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -47,7 +47,21 @@ runs: run: | env + - name: Set up CTK cache variable + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.xz" >> $GITHUB_ENV + + - name: Download CTK cache + id: ctk-get-cache + uses: actions/download-artifact@v4 + continue-on-error: true + with: + name: ${{ env.CTK_CACHE_FILENAME }} + path: . + - name: Get CUDA components + if: ${{ steps.ctk-get-cache.outcome == 'failure' }} shell: bash --noprofile --norc -xeuo pipefail {0} run: | CUDA_PATH="./cuda_toolkit" @@ -90,15 +104,41 @@ runs: } # Get headers and shared libraries in place + # Note: the existing artifact would need to be manually deleted (ex: through web UI) + # if this list is changed, as the artifact actions do not offer any option for us to + # invalidate the artifact. populate_cuda_path cuda_nvcc populate_cuda_path cuda_cudart populate_cuda_path cuda_nvrtc populate_cuda_path cuda_profiler_api ls -l $CUDA_PATH + # Prepare the cache + tar cf - $CUDA_PATH | xz -z -T0 - > $CTK_CACHE_FILENAME + # Note: the headers will be copied into the cibuildwheel manylinux container, # so setting the CUDA_PATH env var here is meaningless. + - name: Upload CTK cache + if: ${{ steps.ctk-get-cache.outcome == 'failure' }} + uses: actions/upload-artifact@v4 + with: + pattern: ${{ env.CTK_CACHE_FILENAME }} + path: . + if-no-files-found: error + + - name: Restore CTK cache + if: ${{ steps.ctk-get-cache.outcome == 'success' }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + CUDA_PATH="./cuda_toolkit" + mkdir $CUDA_PATH + tar -xvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 + ls -l $CUDA_PATH + if [ ! -d "$CUDA_PATH/include" ]; then + exit 1 + fi + - name: Set environment variables shell: bash --noprofile --norc -xeuo pipefail {0} run: | diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index edfcee3b..0881b645 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -38,7 +38,54 @@ runs: pwd ls -lahR $CUDA_CORE_ARTIFACTS_DIR + - name: Set up Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Set up CTK cache variable + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.xz" >> $GITHUB_ENV + + - name: Download CTK cache + id: ctk-get-cache + uses: actions/download-artifact@v4 + continue-on-error: true + with: + name: ${{ env.CTK_CACHE_FILENAME }} + path: . + + - name: Restore CTK cache + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + CUDA_PATH="./cuda_toolkit" + mkdir $CUDA_PATH + tar -xvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 + ls -l $CUDA_PATH + if [ ! 
-d "$CUDA_PATH/include" ]; then + exit 1 + fi + + # TODO: check if we really need these for tests? + echo "CUDA_PATH=$CUDA_PATH" >> $GITHUB_ENV + echo "PATH=$PATH:$CUDA_PATH/bin" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_PATH/lib" >> $GITHUB_ENV + - name: Run test / analysis shell: bash --noprofile --norc -xeuo pipefail {0} run: | - "./continuous_integration/scripts/entrypoint" "./continuous_integration/scripts/test" ${{ inputs.test-options }} + REPO_DIR=$(pwd) + + cd "${CUDA_BINDINGS_ARTIFACTS_DIR}" + pip install *.whl + + cd "${CUDA_CORE_ARTIFACTS_DIR}" + pip install *.whl + + cd "${REPO_DIR}/cuda_bindings" + pytest tests/ + #pytest tests/cython + + cd "${REPO_DIR}/cuda_core" + pytest tests/ diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index e7659a6f..23663aad 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -83,6 +83,7 @@ jobs: id-token: write # This is required for configure-aws-credentials contents: read # This is required for actions/checkout runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') }} + # TODO: use a different (nvidia?) container, or just run on bare image container: options: -u root --security-opt seccomp=unconfined --privileged --shm-size 16g image: condaforge/miniforge3:latest @@ -118,3 +119,4 @@ jobs: CUDA_CORE_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_CORE_ARTIFACTS_DIR }} CUDA_BINDINGS_ARTIFACT_NAME: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} CUDA_BINDINGS_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} + PYTHON_VERSION: ${{ inputs-python-version }} From 5615d5429ddc0f1ba752260c432d12dddd8babb6 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 03:02:39 +0000 Subject: [PATCH 088/111] fix typo --- .github/workflows/gh-build-and-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 23663aad..d8dce525 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -119,4 +119,4 @@ jobs: CUDA_CORE_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_CORE_ARTIFACTS_DIR }} CUDA_BINDINGS_ARTIFACT_NAME: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} CUDA_BINDINGS_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} - PYTHON_VERSION: ${{ inputs-python-version }} + PYTHON_VERSION: ${{ inputs.python-version }} From bb5fed32339d2e1679c97728ecc774af181bb6ff Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 03:17:22 +0000 Subject: [PATCH 089/111] try to escape | and > ... --- .github/actions/setup/action.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index bed73c57..567b2d25 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -114,7 +114,8 @@ runs: ls -l $CUDA_PATH # Prepare the cache - tar cf - $CUDA_PATH | xz -z -T0 - > $CTK_CACHE_FILENAME + # Note: try to escape | and > ... + echo "$(tar cf - ${CUDA_PATH} | xz -z -T0 - > ${CTK_CACHE_FILENAME})" # Note: the headers will be copied into the cibuildwheel manylinux container, # so setting the CUDA_PATH env var here is meaningless. 
From 9d6e69bae192998abe56b52f43511c87a42fdce0 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 03:36:25 +0000 Subject: [PATCH 090/111] switch to gz for simplicity --- .github/actions/setup/action.yml | 6 +++--- .github/actions/test/action.yml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index 567b2d25..2908399f 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -50,7 +50,7 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | - echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.xz" >> $GITHUB_ENV + echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache id: ctk-get-cache @@ -115,7 +115,7 @@ runs: # Prepare the cache # Note: try to escape | and > ... - echo "$(tar cf - ${CUDA_PATH} | xz -z -T0 - > ${CTK_CACHE_FILENAME})" + tar -czvf ${CTK_CACHE_FILENAME} ${CUDA_PATH} # Note: the headers will be copied into the cibuildwheel manylinux container, # so setting the CUDA_PATH env var here is meaningless. @@ -134,7 +134,7 @@ runs: run: | CUDA_PATH="./cuda_toolkit" mkdir $CUDA_PATH - tar -xvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 + tar -xzvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 ls -l $CUDA_PATH if [ ! -d "$CUDA_PATH/include" ]; then exit 1 diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 0881b645..559d05fe 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -46,7 +46,7 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | - echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.xz" >> $GITHUB_ENV + echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache id: ctk-get-cache @@ -61,7 +61,7 @@ runs: run: | CUDA_PATH="./cuda_toolkit" mkdir $CUDA_PATH - tar -xvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 + tar -xzvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 ls -l $CUDA_PATH if [ ! -d "$CUDA_PATH/include" ]; then exit 1 From 8ec609063e8c69966c9f06410f1b06e4692091d5 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 03:47:36 +0000 Subject: [PATCH 091/111] fix artifact parallel upload & lack of cache key --- .github/actions/setup/action.yml | 10 ++++++---- .github/actions/test/action.yml | 5 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index 2908399f..6c84d4c7 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -50,6 +50,7 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | + echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}" >> $GITHUB_ENV echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache @@ -57,8 +58,8 @@ runs: uses: actions/download-artifact@v4 continue-on-error: true with: - name: ${{ env.CTK_CACHE_FILENAME }} - path: . 
+ name: ${{ env.CTK_CACHE_KEY }} + path: ./${{ env.CTK_CACHE_FILENAME }} - name: Get CUDA components if: ${{ steps.ctk-get-cache.outcome == 'failure' }} @@ -123,9 +124,10 @@ runs: - name: Upload CTK cache if: ${{ steps.ctk-get-cache.outcome == 'failure' }} uses: actions/upload-artifact@v4 + continue-on-error: true with: - pattern: ${{ env.CTK_CACHE_FILENAME }} - path: . + name: ${{ env.CTK_CACHE_KEY }} + path: ./${{ env.CTK_CACHE_FILENAME }} if-no-files-found: error - name: Restore CTK cache diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 559d05fe..4fab178f 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -46,6 +46,7 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | + echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}" >> $GITHUB_ENV echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache @@ -53,8 +54,8 @@ runs: uses: actions/download-artifact@v4 continue-on-error: true with: - name: ${{ env.CTK_CACHE_FILENAME }} - path: . + name: ${{ env.CTK_CACHE_KEY }} + path: ./${{ env.CTK_CACHE_FILENAME }} - name: Restore CTK cache shell: bash --noprofile --norc -xeuo pipefail {0} From 94561772a8ab10df6f8daac424eff8b085ae6b08 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 03:51:50 +0000 Subject: [PATCH 092/111] fix download path --- .github/actions/setup/action.yml | 3 ++- .github/actions/test/action.yml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index 6c84d4c7..2c27ce94 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -59,7 +59,7 @@ runs: continue-on-error: true with: name: ${{ env.CTK_CACHE_KEY }} - path: ./${{ env.CTK_CACHE_FILENAME }} + path: ./ - name: Get CUDA components if: ${{ steps.ctk-get-cache.outcome == 'failure' }} @@ -136,6 +136,7 @@ runs: run: | CUDA_PATH="./cuda_toolkit" mkdir $CUDA_PATH + ls -l tar -xzvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 ls -l $CUDA_PATH if [ ! -d "$CUDA_PATH/include" ]; then diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 4fab178f..a1480810 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -55,13 +55,14 @@ runs: continue-on-error: true with: name: ${{ env.CTK_CACHE_KEY }} - path: ./${{ env.CTK_CACHE_FILENAME }} + path: ./ - name: Restore CTK cache shell: bash --noprofile --norc -xeuo pipefail {0} run: | CUDA_PATH="./cuda_toolkit" mkdir $CUDA_PATH + ls -l tar -xzvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 ls -l $CUDA_PATH if [ ! 
-d "$CUDA_PATH/include" ]; then From 2f2046d1e5f7f149fa756e7dbcc6468c81cfe98f Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 04:02:19 +0000 Subject: [PATCH 093/111] fix extract --- .github/actions/setup/action.yml | 5 ++--- .github/actions/test/action.yml | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index 2c27ce94..22cd8121 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -134,10 +134,9 @@ runs: if: ${{ steps.ctk-get-cache.outcome == 'success' }} shell: bash --noprofile --norc -xeuo pipefail {0} run: | - CUDA_PATH="./cuda_toolkit" - mkdir $CUDA_PATH ls -l - tar -xzvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 + CUDA_PATH="./cuda_toolkit" + tar -xzvf $CTK_CACHE_FILENAME ls -l $CUDA_PATH if [ ! -d "$CUDA_PATH/include" ]; then exit 1 diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index a1480810..e013b1d2 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -60,10 +60,9 @@ runs: - name: Restore CTK cache shell: bash --noprofile --norc -xeuo pipefail {0} run: | - CUDA_PATH="./cuda_toolkit" - mkdir $CUDA_PATH ls -l - tar -xzvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 + CUDA_PATH="./cuda_toolkit" + tar -xzvf $CTK_CACHE_FILENAME ls -l $CUDA_PATH if [ ! -d "$CUDA_PATH/include" ]; then exit 1 From c361ad8f752af8a6e0a30b3299937b7a86c3244a Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 04:13:46 +0000 Subject: [PATCH 094/111] propagate cuda-version --- .github/actions/test/action.yml | 4 ++-- .github/workflows/gh-build-and-test.yml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index e013b1d2..2703af16 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -46,8 +46,8 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | - echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}" >> $GITHUB_ENV - echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.gz" >> $GITHUB_ENV + echo "CTK_CACHE_KEY=mini-ctk-${CTK_BUILD_VER}" >> $GITHUB_ENV + echo "CTK_CACHE_FILENAME=mini-ctk-${CTK_BUILD_VER}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache id: ctk-get-cache diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index d8dce525..556c47f2 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -120,3 +120,4 @@ jobs: CUDA_BINDINGS_ARTIFACT_NAME: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} CUDA_BINDINGS_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} PYTHON_VERSION: ${{ inputs.python-version }} + CTK_BUILD_VER: ${{ inputs.cuda-version }} From 37a3bb15f92adc6365545a72e429a4388caaba79 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 23:26:36 -0500 Subject: [PATCH 095/111] install binding test deps --- .github/actions/test/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 2703af16..1328cbf2 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -85,6 +85,7 @@ runs: pip install *.whl cd "${REPO_DIR}/cuda_bindings" + pip install -r requirements.txt pytest tests/ #pytest tests/cython From 54707dcadbd55d263f2f9300f48d642989470434 Mon Sep 17 00:00:00 2001 From: 
Leo Fang Date: Sat, 7 Dec 2024 23:41:12 -0500 Subject: [PATCH 096/111] fix paths --- .github/actions/test/action.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 1328cbf2..a04e9ccf 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -61,14 +61,13 @@ runs: shell: bash --noprofile --norc -xeuo pipefail {0} run: | ls -l - CUDA_PATH="./cuda_toolkit" + CUDA_PATH="$(pwd)/cuda_toolkit" tar -xzvf $CTK_CACHE_FILENAME ls -l $CUDA_PATH if [ ! -d "$CUDA_PATH/include" ]; then exit 1 fi - # TODO: check if we really need these for tests? echo "CUDA_PATH=$CUDA_PATH" >> $GITHUB_ENV echo "PATH=$PATH:$CUDA_PATH/bin" >> $GITHUB_ENV echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_PATH/lib" >> $GITHUB_ENV From 3faf8a3da359d8af3483226b5dee33eb330f0afd Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 23:56:05 -0500 Subject: [PATCH 097/111] include nvjitlink to mini CTK for testing --- .github/actions/setup/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index 22cd8121..bed9a63d 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -112,6 +112,7 @@ runs: populate_cuda_path cuda_cudart populate_cuda_path cuda_nvrtc populate_cuda_path cuda_profiler_api + populate_cuda_path libnvjitlink ls -l $CUDA_PATH # Prepare the cache From e35706aa13c699c91e2b6076310ba75e4dc2228b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 05:19:14 +0000 Subject: [PATCH 098/111] ensure cupy is an optional test dependency --- cuda_core/tests/example_tests/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cuda_core/tests/example_tests/utils.py b/cuda_core/tests/example_tests/utils.py index 3d218a91..81479903 100644 --- a/cuda_core/tests/example_tests/utils.py +++ b/cuda_core/tests/example_tests/utils.py @@ -10,7 +10,6 @@ import os import sys -import cupy as cp import pytest @@ -53,4 +52,3 @@ def run_example(samples_path, filename, env=None): sys.argv = old_argv # further reduce the memory watermark gc.collect() - cp.get_default_memory_pool().free_all_blocks() From e0c610e08ec20fa94f3d6c2fa39964c5597fa15a Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 05:31:07 +0000 Subject: [PATCH 099/111] per arch ctk --- .github/actions/setup/action.yml | 4 ++-- .github/actions/test/action.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index bed9a63d..2416fcf4 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -50,8 +50,8 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | - echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}" >> $GITHUB_ENV - echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.gz" >> $GITHUB_ENV + echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}-${{ inputs.host-platform }}" >> $GITHUB_ENV + echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}-${{ inputs.host-platform }}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache id: ctk-get-cache diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index a04e9ccf..0a1e621a 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -46,8 +46,8 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | - echo 
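PATCH 098 removes the hard cupy import so the example tests can run on machines without it. If an example still wants to release CuPy's memory pool when CuPy happens to be installed, one hedged way to keep the dependency optional is a guarded import, sketched below (the removed lines used exactly this memory-pool call):

    # Sketch: treat cupy as an optional test dependency. Assumption: only the
    # memory-pool cleanup needs it; everything else in the utils is cupy-free.
    try:
        import cupy as cp
    except ImportError:
        cp = None

    def free_gpu_pool():
        # No-op when cupy is absent; otherwise return cached GPU blocks.
        if cp is not None:
            cp.get_default_memory_pool().free_all_blocks()
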
"CTK_CACHE_KEY=mini-ctk-${CTK_BUILD_VER}" >> $GITHUB_ENV - echo "CTK_CACHE_FILENAME=mini-ctk-${CTK_BUILD_VER}.tar.gz" >> $GITHUB_ENV + echo "CTK_CACHE_KEY=mini-ctk-${CTK_BUILD_VER}-${{ inputs.host-platform }}" >> $GITHUB_ENV + echo "CTK_CACHE_FILENAME=mini-ctk-${CTK_BUILD_VER}-${{ inputs.host-platform }}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache id: ctk-get-cache From fb487d0fedb9b91ef2d73024d407a9384aa657d3 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 05:41:57 +0000 Subject: [PATCH 100/111] fix arg passing --- .github/actions/test/action.yml | 4 ++-- .github/workflows/gh-build-and-test.yml | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 0a1e621a..c82a8450 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -46,8 +46,8 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | - echo "CTK_CACHE_KEY=mini-ctk-${CTK_BUILD_VER}-${{ inputs.host-platform }}" >> $GITHUB_ENV - echo "CTK_CACHE_FILENAME=mini-ctk-${CTK_BUILD_VER}-${{ inputs.host-platform }}.tar.gz" >> $GITHUB_ENV + echo "CTK_CACHE_KEY=mini-ctk-${CTK_BUILD_VER}-${HOST_PLATFORM}" >> $GITHUB_ENV + echo "CTK_CACHE_FILENAME=mini-ctk-${CTK_BUILD_VER}-${HOST_PLATFORM}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache id: ctk-get-cache diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 556c47f2..06f6a168 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -75,7 +75,7 @@ jobs: test: # TODO: improve the name once a separate test matrix is defined - name: Test (CUDA ${{ inputs.cuda-version }}, Use container=${{ inputs.use-container }}) + name: Test (CUDA ${{ inputs.cuda-version }}) # TODO: enable testing once linux-aarch64 & win-64 GPU runners are up if: ${{ (github.repository_owner == 'nvidia') && startsWith(inputs.host-platform, 'linux-x64') }} @@ -121,3 +121,4 @@ jobs: CUDA_BINDINGS_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} PYTHON_VERSION: ${{ inputs.python-version }} CTK_BUILD_VER: ${{ inputs.cuda-version }} + HOST_PLATFORM: ${{ inputs.host-platform }} From c01e015c26d8d7f0e775865559d69e8aa3d7e823 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 05:55:48 +0000 Subject: [PATCH 101/111] fix invalid context during test teardown --- cuda_core/tests/conftest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 59e5883f..b67eeec2 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -30,6 +30,10 @@ def init_cuda(): def _device_unset_current(): + ctx = handle_return(driver.cuCtxGetCurrent()) + if int(ctx) == 0: + # no active context, do nothing + return handle_return(driver.cuCtxPopCurrent()) with _device._tls_lock: del _device._tls.devices From f1d0e4027231adfbd44a6e70070d939e5e129d97 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 06:09:32 +0000 Subject: [PATCH 102/111] WAR: mark PTX test xfail due to CI condition --- cuda_core/tests/test_program.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 95c4d377..562f89de 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -8,10 +8,20 @@ import pytest +from cuda import cuda, nvrtc from cuda.core.experimental import Program from 
cuda.core.experimental._module import Kernel, ObjectCode +@pytest.fixture +def can_load_generated_ptx(): + _, driver_ver = cuda.cuDriverGetVersion() + _, nvrtc_major, nvrtc_minor = nvrtc.nvrtcVersion() + if nvrtc_major * 1000 + nvrtc_minor * 10 > driver_ver: + return False + return True + + def test_program_init_valid_code_type(): code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") @@ -31,6 +41,8 @@ def test_program_init_invalid_code_format(): Program(code, "c++") +# TODO: incorporate this check in Program +@pytest.mark.xfail(not can_load_generated_ptx, reason="PTX version too new") def test_program_compile_valid_target_type(): code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") From a08fbc94bb811298a3d9a190bd7d8cc6c72c1fbc Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 01:34:16 -0500 Subject: [PATCH 103/111] debug --- .github/actions/test/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index c82a8450..78572917 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -89,4 +89,4 @@ runs: #pytest tests/cython cd "${REPO_DIR}/cuda_core" - pytest tests/ + pytest -rxXs tests/ From f36393e65c80d667f9eeed311d52038f0026da7c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 15:15:13 +0000 Subject: [PATCH 104/111] also detect if CUDA is ever initialized --- cuda_core/tests/conftest.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index b67eeec2..9c8ed52b 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -18,7 +18,7 @@ import pytest from cuda.core.experimental import Device, _device -from cuda.core.experimental._utils import handle_return +from cuda.core.experimental._utils import CUDAError, handle_return @pytest.fixture(scope="function") @@ -30,10 +30,15 @@ def init_cuda(): def _device_unset_current(): - ctx = handle_return(driver.cuCtxGetCurrent()) - if int(ctx) == 0: - # no active context, do nothing - return + try: + ctx = handle_return(driver.cuCtxGetCurrent()) + except CUDAError as e: + if "CUDA_ERROR_NOT_INITIALIZED" in str(e): + return + else: + if int(ctx) == 0: + # no active context, do nothing + return handle_return(driver.cuCtxPopCurrent()) with _device._tls_lock: del _device._tls.devices From f3cc6bde2b575bb9f1a4bfe785e345cf57a575a1 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 16:26:34 +0000 Subject: [PATCH 105/111] ensure CUDA is init'd at test start time --- cuda_core/tests/conftest.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 9c8ed52b..58cc7cef 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -21,8 +21,14 @@ from cuda.core.experimental._utils import CUDAError, handle_return +@pytest.fixture(scope="session", autouse=True) +def always_init_cuda(): + handle_return(driver.cuInit(0)) + + @pytest.fixture(scope="function") def init_cuda(): + # TODO: rename this to e.g. 
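The xfail in PATCH 102 encodes a real constraint: a driver can only load PTX whose ISA version it knows about, so PTX emitted by an NVRTC newer than the installed driver fails to load (typically surfacing as CUDA_ERROR_UNSUPPORTED_PTX_VERSION). The comparison works because both sides use the same integer encoding, major * 1000 + minor * 10:

    from cuda import cuda, nvrtc

    _, driver_ver = cuda.cuDriverGetVersion()   # e.g. 12040 for a CUDA 12.4 driver
    _, major, minor = nvrtc.nvrtcVersion()      # e.g. (12, 6)
    nvrtc_ver = major * 1000 + minor * 10       # 12060, same encoding as driver_ver
    print("PTX loadable by this driver:", nvrtc_ver <= driver_ver)
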
init_context device = Device() device.set_current() yield @@ -30,15 +36,10 @@ def init_cuda(): def _device_unset_current(): - try: - ctx = handle_return(driver.cuCtxGetCurrent()) - except CUDAError as e: - if "CUDA_ERROR_NOT_INITIALIZED" in str(e): - return - else: - if int(ctx) == 0: - # no active context, do nothing - return + ctx = handle_return(driver.cuCtxGetCurrent()) + if int(ctx) == 0: + # no active context, do nothing + return handle_return(driver.cuCtxPopCurrent()) with _device._tls_lock: del _device._tls.devices @@ -46,6 +47,7 @@ def _device_unset_current(): @pytest.fixture(scope="function") def deinit_cuda(): + # TODO: rename this to e.g. deinit_context yield _device_unset_current() From b1f07a38f60c5aa70cd71048b79b7efe6b934094 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 16:41:15 +0000 Subject: [PATCH 106/111] enforce the right CC is passed to NVRTC --- cuda_core/tests/conftest.py | 2 +- cuda_core/tests/test_program.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 58cc7cef..fe755738 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -18,7 +18,7 @@ import pytest from cuda.core.experimental import Device, _device -from cuda.core.experimental._utils import CUDAError, handle_return +from cuda.core.experimental._utils import handle_return @pytest.fixture(scope="session", autouse=True) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 562f89de..10789856 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -9,7 +9,7 @@ import pytest from cuda import cuda, nvrtc -from cuda.core.experimental import Program +from cuda.core.experimental import Device, Program from cuda.core.experimental._module import Kernel, ObjectCode @@ -46,7 +46,9 @@ def test_program_init_invalid_code_format(): def test_program_compile_valid_target_type(): code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") - object_code = program.compile("ptx") + arch = "".join(str(i) for i in Device().compute_capability) + object_code = program.compile("ptx", options=(f"-arch=compute_{arch}",)) + print(object_code._module.decode()) kernel = object_code.get_kernel("my_kernel") assert isinstance(object_code, ObjectCode) assert isinstance(kernel, Kernel) From 8a6738b3aa3d265933a68f58b3b891ee8b7c196a Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 12:10:57 -0500 Subject: [PATCH 107/111] fix xfail mark --- cuda_core/tests/test_program.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 10789856..f1c24b3e 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -13,7 +13,6 @@ from cuda.core.experimental._module import Kernel, ObjectCode -@pytest.fixture def can_load_generated_ptx(): _, driver_ver = cuda.cuDriverGetVersion() _, nvrtc_major, nvrtc_minor = nvrtc.nvrtcVersion() @@ -42,7 +41,7 @@ def test_program_init_invalid_code_format(): # TODO: incorporate this check in Program -@pytest.mark.xfail(not can_load_generated_ptx, reason="PTX version too new") +@pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new") def test_program_compile_valid_target_type(): code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") From ed0386a33d9273e766063813c85828b0b5ffda54 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 22:42:25 +0000 
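PATCH 106 pins the NVRTC target to the GPU actually present rather than letting NVRTC fall back to its default architecture, which is what made the generated PTX unloadable on some CI runners. The same pattern outside the test suite (assumes a device is current; compute_capability yields the (major, minor) pair that forms the compute_XY name):

    from cuda.core.experimental import Device, Program

    dev = Device()
    dev.set_current()
    arch = "".join(str(i) for i in dev.compute_capability)  # e.g. "86"
    prog = Program('extern "C" __global__ void k() {}', "c++")
    mod = prog.compile("ptx", options=(f"-arch=compute_{arch}",))
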
Subject: [PATCH 108/111] switch to use github cache to improve reuse --- .github/actions/setup/action.yml | 19 +++++++++---------- .github/actions/test/action.yml | 7 ++++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index 2416fcf4..084f4c2f 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -55,14 +55,14 @@ runs: - name: Download CTK cache id: ctk-get-cache - uses: actions/download-artifact@v4 + uses: actions/cache/restore@v4 continue-on-error: true with: - name: ${{ env.CTK_CACHE_KEY }} - path: ./ + key: ${{ env.CTK_CACHE_KEY }} + path: ./${{ env.CTK_CACHE_FILENAME }} - name: Get CUDA components - if: ${{ steps.ctk-get-cache.outcome == 'failure' }} + if: ${{ steps.ctk-get-cache.outputs.cache-hit != 'true' }} shell: bash --noprofile --norc -xeuo pipefail {0} run: | CUDA_PATH="./cuda_toolkit" @@ -123,16 +123,15 @@ runs: # so setting the CUDA_PATH env var here is meaningless. - name: Upload CTK cache - if: ${{ steps.ctk-get-cache.outcome == 'failure' }} - uses: actions/upload-artifact@v4 - continue-on-error: true + if: ${{ always() && + steps.ctk-get-cache.outputs.cache-hit != 'true' }} + uses: actions/cache/save@v4 with: - name: ${{ env.CTK_CACHE_KEY }} + key: ${{ env.CTK_CACHE_KEY }} path: ./${{ env.CTK_CACHE_FILENAME }} - if-no-files-found: error - name: Restore CTK cache - if: ${{ steps.ctk-get-cache.outcome == 'success' }} + if: ${{ steps.ctk-get-cache.outputs.cache-hit == 'true' }} shell: bash --noprofile --norc -xeuo pipefail {0} run: | ls -l diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 78572917..66468bd1 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -51,11 +51,12 @@ runs: - name: Download CTK cache id: ctk-get-cache - uses: actions/download-artifact@v4 + uses: actions/cache/restore@v4 continue-on-error: true with: - name: ${{ env.CTK_CACHE_KEY }} - path: ./ + key: ${{ env.CTK_CACHE_KEY }} + path: ./${{ env.CTK_CACHE_FILENAME }} + fail-on-cache-miss: true - name: Restore CTK cache shell: bash --noprofile --norc -xeuo pipefail {0} From 7b074f03794bf3cb2b6cabdf2d77fd7860cbcb86 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 22:54:45 +0000 Subject: [PATCH 109/111] clean up legacy CI scripts --- continuous_integration/environment.yml | 24 --- continuous_integration/scripts/build | 32 ---- continuous_integration/scripts/conda-utils | 16 -- continuous_integration/scripts/entrypoint | 20 -- .../scripts/generate-environment | 36 ---- continuous_integration/scripts/make-conda-env | 27 --- continuous_integration/scripts/setup-utils | 179 ------------------ continuous_integration/scripts/test | 38 ---- 8 files changed, 372 deletions(-) delete mode 100644 continuous_integration/environment.yml delete mode 100755 continuous_integration/scripts/build delete mode 100755 continuous_integration/scripts/conda-utils delete mode 100755 continuous_integration/scripts/entrypoint delete mode 100755 continuous_integration/scripts/generate-environment delete mode 100755 continuous_integration/scripts/make-conda-env delete mode 100755 continuous_integration/scripts/setup-utils delete mode 100755 continuous_integration/scripts/test diff --git a/continuous_integration/environment.yml b/continuous_integration/environment.yml deleted file mode 100644 index 6d922d43..00000000 --- a/continuous_integration/environment.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: cuda_python -channels: - - defaults 
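The switch in PATCH 108 from build artifacts to actions/cache changes the reuse model: cache entries are keyed, immutable once saved, and shared across workflow runs, which is why the save step is guarded by cache-hit != 'true' (never overwrite an existing key) and wrapped in always() (save even when a later step failed). That contract, sketched as a hypothetical local helper rather than the hosted service:

    import shutil
    from pathlib import Path

    STORE = Path("/tmp/ctk-cache")  # stand-in for the hosted cache service

    def restore(key: str, dest: Path) -> bool:
        entry = STORE / key
        if entry.exists():
            shutil.copy(entry, dest)
            return True          # maps to steps.<id>.outputs.cache-hit == 'true'
        return False

    def save(key: str, src: Path) -> None:
        entry = STORE / key
        if entry.exists():
            return               # keys are immutable; a second save is a no-op
        STORE.mkdir(parents=True, exist_ok=True)
        shutil.copy(src, entry)
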
-dependencies: - - python>=3.10 - - cython>=3.0.0 - - pytest>=6.2.4 - - numpy>=1.21.1 - - setuptools - - wheel - - pip - - cuda-version=12.6 - - cuda-cudart-static - - cuda-driver-dev - - cuda-cudart-dev - - cuda-profiler-api - - cuda-nvrtc-dev - - cuda-nvcc - - pip: - - pytest-benchmark>=3.4.1 - - pyclibrary>=0.1.7 - - versioneer==0.29 - - tomli; python_version < "3.11" - - pywin32; sys_platform == 'win32' diff --git a/continuous_integration/scripts/build b/continuous_integration/scripts/build deleted file mode 100755 index 618edd5f..00000000 --- a/continuous_integration/scripts/build +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -build_ci() { - set -xeou pipefail - - export CUDA_HOME="${CONDA_PREFIX}/targets/x86_64-linux" - export PARALLEL_LEVEL=$(nproc --ignore 1) - - cd "${REPO_DIR}/cuda_bindings" - python setup.py bdist_wheel - - cd "${REPO_DIR}/cuda_core" - python setup.py bdist_wheel -} - -build_project() { - set -xeou pipefail - - export PYTHONUNBUFFERED=1 - - . setup-utils; - init_build_env "$@"; - - git config --global --add safe.directory "$REPO_DIR/.git" - - case "${BUILD_TYPE}" in - ci) build_ci;; - *) return 1;; - esac -} - -(build_project "$@"); diff --git a/continuous_integration/scripts/conda-utils b/continuous_integration/scripts/conda-utils deleted file mode 100755 index e0dd32ca..00000000 --- a/continuous_integration/scripts/conda-utils +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -activate_conda_env() { - set +xu - eval "$(conda shell.bash hook)" - conda activate "${CONDA_ENV}"; - set -xu - : ${PYTHON_VERSION:=$(python -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")} - export PYTHON_VERSION -} - -conda_info() { - set +x - conda info - set -x -} diff --git a/continuous_integration/scripts/entrypoint b/continuous_integration/scripts/entrypoint deleted file mode 100755 index fe4f5cea..00000000 --- a/continuous_integration/scripts/entrypoint +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash - -set_initial_env() { - set -xeuo pipefail - - export PATH="${PATH}:${REPO_DIR}/continuous_integration/scripts" -} - -entrypoint() { - set -xeuo pipefail - set_initial_env; - - git config --global --add safe.directory "$REPO_DIR/.git" - - cd "${REPO_DIR}" - - exec "$@"; -} - -entrypoint "$@"; diff --git a/continuous_integration/scripts/generate-environment b/continuous_integration/scripts/generate-environment deleted file mode 100755 index 8bf2c38d..00000000 --- a/continuous_integration/scripts/generate-environment +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -# Function to generate environment.yml -generate_environment_yml() { - local python_version=$1 - local cuda_version=$2 - local output_path=$3 - - cat < "${output_path}/environment.yml" -name: cuda_python -channels: - - defaults - - conda-forge -dependencies: - - python=${python_version} - - cython - - pytest - - numpy - - setuptools - - wheel - - pip - - cuda-version=${cuda_version} - - cuda-cudart-static - - cuda-driver-dev - - cuda-cudart-dev - - cuda-profiler-api - - cuda-nvrtc-dev - - cuda-nvcc - - pip: - - pytest-benchmark - - pyclibrary - - versioneer==0.29 - - tomli; python_version < "3.11" - - pywin32; sys_platform == 'win32' -EOF -} \ No newline at end of file diff --git a/continuous_integration/scripts/make-conda-env b/continuous_integration/scripts/make-conda-env deleted file mode 100755 index 37539b37..00000000 --- a/continuous_integration/scripts/make-conda-env +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash - -set -x - -make_ci_env() { - #TODO wire cuda 
version as a top level matrix argument - generate_environment_yml "${PYTHON_VERSION}" 12.6 . - mamba env create -n "${CONDA_ENV}" -f ./environment.yml -} - -make_conda_env() { - set -xeuo pipefail - - . setup-utils; - . generate-environment - set_base_defs; - - case "$1" in - ci) make_ci_env;; - test) make_test_env;; - *) return 1;; - esac - - return 0; -} - -(make_conda_env "$@"); diff --git a/continuous_integration/scripts/setup-utils b/continuous_integration/scripts/setup-utils deleted file mode 100755 index f8faefa4..00000000 --- a/continuous_integration/scripts/setup-utils +++ /dev/null @@ -1,179 +0,0 @@ -#!/usr/bin/env bash - -install_from_apt() { - set -xeuo pipefail - - export DEBIAN_FRONTEND=non-interactive - - # Run package updates and install packages - apt-get -q update - apt-get -q install -y wget curl jq sudo ninja-build vim numactl rsync -} - -install_cmake() { - set -xeuo pipefail - - wget -q https://github.com/Kitware/CMake/releases/download/v3.26.5/cmake-3.26.5-linux-x86_64.tar.gz - - tar -xzf cmake-3.26.5-linux-x86_64.tar.gz -} - -setup_linux_build_env() { - set -xeuo pipefail - export OS_SHORT_NAME=linux - export PATH="${PATH}:${PREBUILD_DIR}/cmake-3.26.5-linux-x86_64/bin" - - mkdir -p /tmp/out /tmp/env_yaml -} - -install_linux_tools() { - set -xeuo pipefail - - export SED=sed - export READLINK=readlink - - install_from_apt; - install_cmake; - - mkdir -p /tmp/out /tmp/env_yaml -} - -install_linux_test_tools() { - set -xeuo pipefail - - export SED=sed - export READLINK=readlink - - # Run package updates and install packages - apt-get -q update - apt-get -q install -y numactl -} - -set_base_defs() { - set -xeuo pipefail - - export CONDA_ENV=cuda_python - - CONDA_PLATFORM=$(conda info | grep 'platform' | awk -F ' : ' '{print $2}') - export CONDA_PLATFORM - - export PREBUILD_DIR=/tmp/prebuild - mkdir -p "$PREBUILD_DIR" - - export BUILD_DIR="$REPO_DIR/build" - - # Get the machine architecture - ARCH=$(uname -m) - - if [ "$ARCH" == "aarch64" ]; then - # Use the gcc march value used by aarch64 Ubuntu. - BUILD_MARCH=armv8-a - else - # Use uname -m otherwise - BUILD_MARCH=$(uname -m | tr '_' '-') - fi - - export BUILD_MARCH - - export CUDA_VERSION=12.2.2 - - export MAX_LIBSANITIZER_VERSION=11.4 - - export USE_OPENMP=ON -} - -# ----------------------------------------------------------------------------- - -prep_git() { - # Temporarily disable exit on error - set +e - git config --global user.email > /dev/null - local email_exit_status=$? - git config --global user.name > /dev/null - local name_exit_status=$? - # Re-enable exit on error - set -e - - if [ $email_exit_status -ne 0 ]; then - git config --global --add user.email "users.noreply.github.com" - echo "git user.email was not set. It's now set to users.noreply.github.com" - else - echo "Note: git user.email is already set." - fi - - if [ $name_exit_status -ne 0 ]; then - git config --global --add user.name "anon" - echo "git user.name was not set. It's now set to anon" - else - echo "Note: git user.name is already set." - fi - - # Fix "fatal: detected dubious ownership in repository at '/tmp/legate.core'" - # during local builds. - git config --global --add safe.directory "$REPO_DIR" -} - - -setup_build_env() { - set -xeuo pipefail - - install_linux_tools; - - setup_linux_build_env; - - rm -rf "$PREBUILD_DIR" - mkdir -p "$PREBUILD_DIR" - cd $PREBUILD_DIR - - prep_git; -} - -init_build_env() { - set -x; - - . 
conda-utils; - - export BUILD_TYPE=$1 - - set -xeuo pipefail; - - set_base_defs; - - cd "$PREBUILD_DIR" - - setup_build_env; - - cd "$REPO_DIR"; - - if [[ -d "${BUILD_DIR}" ]]; then - rm -rf "${BUILD_DIR}" - fi - - make-conda-env "$BUILD_TYPE"; - - activate_conda_env; - conda_info; -} - -init_test_env() { - set -x; - - . conda-utils; - - export TEST_TYPE=$1 - - set -xeuo pipefail; - - set_base_defs; - - cd "$PREBUILD_DIR" - - # setup_test_env; - - cd "$REPO_DIR"; - - make-conda-env "$TEST_TYPE"; - - activate_conda_env; - conda_info; -} \ No newline at end of file diff --git a/continuous_integration/scripts/test b/continuous_integration/scripts/test deleted file mode 100755 index 3a705c3c..00000000 --- a/continuous_integration/scripts/test +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env bash - -test_ci() { - set -xeou pipefail - - activate_conda_env; - - cd "${BINDINGS_ARTIFACTS_DIR}" - pip install *.whl - - cd "${CORE_ARTIFACTS_DIR}" - pip install *.whl - - cd "${REPO_DIR}/cuda_core" - python -m pytest tests/ - - cd "${REPO_DIR}/cuda_bindings" - python -m pytest tests/ - -} - -test_project() { - set -xeou pipefail - - export PYTHONUNBUFFERED=1 - - . setup-utils; - init_test_env "$@"; - - git config --global --add safe.directory "$REPO_DIR/.git" - - case "${TEST_TYPE}" in - ci) test_ci;; - *) return 1;; - esac -} - -(test_project "$@"); From 6a595945a4a25f35cc4cfd312ac797fd30bc435c Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 9 Dec 2024 12:06:56 -0800 Subject: [PATCH 110/111] fix build warning and output format of docs --- cuda_core/docs/source/conf.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index 3a7afc09..4b3e17ae 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -93,15 +93,26 @@ napoleon_numpy_docstring = True +section_titles = ["Returns"] def autodoc_process_docstring(app, what, name, obj, options, lines): if name.startswith("cuda.core.experimental.system"): - # patch the docstring (in lines) *in-place* + # patch the docstring (in lines) *in-place*. Should docstrings include section titles other than "Returns", + # this will need to be modified to handle them. attr = name.split(".")[-1] from cuda.core.experimental._system import System lines_new = getattr(System, attr).__doc__.split("\n") + formatted_lines = [] + for line in lines_new: + title = line.strip() + if title in section_titles: + formatted_lines.append(line.replace(title, f".. rubric:: {title}")) + elif line.strip() == "-" * len(title): + formatted_lines.append(" " * len(title)) + else: + formatted_lines.append(line) n_pops = len(lines) - lines.extend(lines_new) + lines.extend(formatted_lines) for _ in range(n_pops): lines.pop(0) From 769ac6679e06b10eb609d8df1cfb19fb58cdf9c4 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Mon, 9 Dec 2024 12:10:13 -0800 Subject: [PATCH 111/111] Update cuda_core/docs/source/api_private.rst Co-authored-by: Leo Fang --- cuda_core/docs/source/api_private.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index a833d69c..f100eb7c 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -16,7 +16,6 @@ CUDA runtime _memory.Buffer _stream.Stream _event.Event - _system.System CUDA compilation toolchain
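
The autodoc_process_docstring hook in PATCH 110 only has an effect because it mutates lines in place: Sphinx hands every handler the same list object, so rebinding the name would be invisible to the build, which is why the patch extends the list and then pops the old entries. Slice assignment is the more common idiom for the same thing, and the handler still has to be registered in conf.py; the setup() wiring below is an assumption, as the patch only shows the handler body:

    def setup(app):
        # Assumed registration; not shown in this patch.
        app.connect("autodoc-process-docstring", autodoc_process_docstring)

    def autodoc_process_docstring_sketch(app, what, name, obj, options, lines):
        # Equivalent in-place update via slice assignment: the list object that
        # Sphinx passed in is the one that must end up holding the new text.
        # This hedged variant drops the dashed underline rows instead of
        # blanking them, which is equally valid for rubric conversion.
        lines[:] = [
            f".. rubric:: {ln.strip()}" if ln.strip() == "Returns" else ln
            for ln in lines
            if set(ln.strip()) != {"-"}
        ]
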