From 9d8ecbc19d3cef81ab877890b00e912a548ddd6b Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:06:26 -0800 Subject: [PATCH 001/111] integrate ruff changes --- cuda_core/cuda/core/experimental/__init__.py | 1 + cuda_core/cuda/core/experimental/_linker.py | 285 +++++++++++++++++++ cuda_core/docs/source/api.rst | 5 + cuda_core/docs/source/release.md | 1 + cuda_core/docs/source/release/0.1.0-notes.md | 4 +- cuda_core/docs/source/release/0.2.0-notes.md | 11 + cuda_core/tests/test_linker.py | 101 +++++++ 7 files changed, 406 insertions(+), 2 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_linker.py create mode 100644 cuda_core/docs/source/release/0.2.0-notes.md create mode 100644 cuda_core/tests/test_linker.py diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 9b978398..12fed225 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -5,5 +5,6 @@ from cuda.core.experimental._device import Device from cuda.core.experimental._event import EventOptions from cuda.core.experimental._launcher import LaunchConfig, launch +from cuda.core.experimental._linker import Linker, LinkerOptions from cuda.core.experimental._program import Program from cuda.core.experimental._stream import Stream, StreamOptions diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py new file mode 100644 index 00000000..e80bfe61 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -0,0 +1,285 @@ +from dataclasses import dataclass +from typing import List, Optional + +from cuda.bindings import nvjitlink +from cuda.core.experimental._module import ObjectCode +from cuda.core.experimental._utils import check_or_create_options + + +@dataclass +class LinkerOptions: + """Customizable :obj:`LinkerOptions` for nvJitLink. + + Attributes + ---------- + arch : str + Pass SM architecture value. Can use compute_ value instead if only generating PTX. + This is a required option. + Acceptable value type: str + Maps to: -arch=sm_ + max_register_count : int, optional + Maximum register count. + Default: None + Acceptable value type: int + Maps to: -maxrregcount= + time : bool, optional + Print timing information to InfoLog. + Default: False + Acceptable value type: bool + Maps to: -time + verbose : bool, optional + Print verbose messages to InfoLog. + Default: False + Acceptable value type: bool + Maps to: -verbose + link_time_optimization : bool, optional + Perform link time optimization. + Default: False + Acceptable value type: bool + Maps to: -lto + ptx : bool, optional + Emit PTX after linking instead of CUBIN; only supported with -lto. + Default: False + Acceptable value type: bool + Maps to: -ptx + optimization_level : int, optional + Set optimization level. Only 0 and 3 are accepted. + Default: None + Acceptable value type: int + Maps to: -O + debug : bool, optional + Generate debug information. + Default: False + Acceptable value type: bool + Maps to: -g + lineinfo : bool, optional + Generate line information. + Default: False + Acceptable value type: bool + Maps to: -lineinfo + ftz : bool, optional + Flush denormal values to zero. + Default: False + Acceptable value type: bool + Maps to: -ftz= + prec_div : bool, optional + Use precise division. + Default: True + Acceptable value type: bool + Maps to: -prec-div= + prec_sqrt : bool, optional + Use precise square root. 
+ Default: True + Acceptable value type: bool + Maps to: -prec-sqrt= + fma : bool, optional + Use fast multiply-add. + Default: True + Acceptable value type: bool + Maps to: -fma= + kernels_used : List[str], optional + Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple + times. + Default: None + Acceptable value type: list of str + Maps to: -kernels-used= + variables_used : List[str], optional + Pass list of variables that are used; any not in the list can be removed. This option can be specified multiple + times. + Default: None + Acceptable value type: list of str + Maps to: -variables-used= + optimize_unused_variables : bool, optional + Assume that if a variable is not referenced in device code, it can be removed. + Default: False + Acceptable value type: bool + Maps to: -optimize-unused-variables + xptxas : List[str], optional + Pass options to PTXAS. This option can be called multiple times. + Default: None + Acceptable value type: list of str + Maps to: -Xptxas= + split_compile : int, optional + Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split + compilation (default). + Default: 1 + Acceptable value type: int + Maps to: -split-compile= + split_compile_extended : int, optional + A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. + Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This + option can potentially impact performance of the compiled binary. + Default: 1 + Acceptable value type: int + Maps to: -split-compile-extended= + jump_table_density : int, optional + When doing LTO, specify the case density percentage in switch statements, and use it as a minimal threshold to + determine whether jump table (brx.idx instruction) will be used to implement a switch statement. Default value + is 101. The percentage ranges from 0 to 101 inclusively. + Default: 101 + Acceptable value type: int + Maps to: -jump-table-density= + no_cache : bool, optional + Do not cache the intermediate steps of nvJitLink. + Default: False + Acceptable value type: bool + Maps to: -no-cache + device_stack_protector : bool, optional + Enable stack canaries in device code. Stack canaries make it more difficult to exploit certain types of memory + safety bugs involving stack-local variables. The compiler uses heuristics to assess the risk of such a bug in + each function. Only those functions which are deemed high-risk make use of a stack canary. 
+ Default: False + Acceptable value type: bool + Maps to: -device-stack-protector + """ + + arch: str + max_register_count: Optional[int] = None + time: Optional[bool] = None + verbose: Optional[bool] = None + link_time_optimization: Optional[bool] = None + ptx: Optional[bool] = None + optimization_level: Optional[int] = None + debug: Optional[bool] = None + lineinfo: Optional[bool] = None + ftz: Optional[bool] = None + prec_div: Optional[bool] = None + prec_sqrt: Optional[bool] = None + fma: Optional[bool] = None + kernels_used: Optional[List[str]] = None + variables_used: Optional[List[str]] = None + optimize_unused_variables: Optional[bool] = None + xptxas: Optional[List[str]] = None + split_compile: Optional[int] = None + split_compile_extended: Optional[int] = None + jump_table_density: Optional[int] = None + no_cache: Optional[bool] = None + device_stack_protector: Optional[bool] = None + + def __post_init__(self): + self.formatted_options = [] + if self.arch is not None: + self.formatted_options.append(f"-arch={self.arch}") + if self.max_register_count is not None: + self.formatted_options.append(f"-maxrregcount={self.max_register_count}") + if self.time is not None: + self.formatted_options.append("-time") + if self.verbose is not None: + self.formatted_options.append("-verbose") + if self.link_time_optimization is not None: + self.formatted_options.append("-lto") + if self.ptx is not None: + self.formatted_options.append("-ptx") + if self.optimization_level is not None: + self.formatted_options.append(f"-O{self.optimization_level}") + if self.debug is not None: + self.formatted_options.append("-g") + if self.lineinfo is not None: + self.formatted_options.append("-lineinfo") + if self.ftz is not None: + self.formatted_options.append(f"-ftz={'true' if self.ftz else 'false'}") + if self.prec_div is not None: + self.formatted_options.append(f"-prec-div={'true' if self.prec_div else 'false'}") + if self.prec_sqrt is not None: + self.formatted_options.append(f"-prec-sqrt={'true' if self.prec_sqrt else 'false'}") + if self.fma is not None: + self.formatted_options.append(f"-fma={'true' if self.fma else 'false'}") + if self.kernels_used is not None: + for kernel in self.kernels_used: + self.formatted_options.append(f"-kernels-used={kernel}") + if self.variables_used is not None: + for variable in self.variables_used: + self.formatted_options.append(f"-variables-used={variable}") + if self.optimize_unused_variables is not None: + self.formatted_options.append("-optimize-unused-variables") + if self.xptxas is not None: + for opt in self.xptxas: + self.formatted_options.append(f"-Xptxas={opt}") + if self.split_compile is not None: + self.formatted_options.append(f"-split-compile={self.split_compile}") + if self.split_compile_extended is not None: + self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") + if self.jump_table_density is not None: + self.formatted_options.append(f"-jump-table-density={self.jump_table_density}") + if self.no_cache is not None: + self.formatted_options.append("-no-cache") + if self.device_stack_protector is not None: + self.formatted_options.append("-device-stack-protector") + + +class Linker: + __slots__ = "_handle" + + def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): + self._handle = None + options = check_or_create_options(LinkerOptions, options, "Linker options") + self._handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) + + if object_codes is not None: + for code 
in object_codes: + assert isinstance(code, ObjectCode) + self._add_code_object(code) + + def _add_code_object(self, object_code: ObjectCode): + data = object_code._module + assert isinstance(data, bytes) + nvjitlink.add_data( + self._handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}", + ) + + def link(self, target_type) -> ObjectCode: + nvjitlink.complete(self._handle) + if target_type not in ["cubin", "ptx"]: + raise ValueError(f"Unsupported target type: {target_type}") + code = None + if target_type == "cubin": + cubin_size = nvjitlink.get_linked_cubin_size(self._handle) + code = bytearray(cubin_size) + nvjitlink.get_linked_cubin(self._handle, code) + else: + ptx_size = nvjitlink.get_linked_ptx_size(self._handle) + code = bytearray(ptx_size) + nvjitlink.get_linked_ptx(self._handle, code) + + return ObjectCode(bytes(code), target_type) + + def get_error_log(self) -> str: + log_size = nvjitlink.get_error_log_size(self._handle) + log = bytearray(log_size) + nvjitlink.get_error_log(self._handle, log) + return log.decode() + + def get_info_log(self) -> str: + log_size = nvjitlink.get_info_log_size(self._handle) + log = bytearray(log_size) + nvjitlink.get_info_log(self._handle, log) + return log.decode() + + def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: + # this list is based on the supported values for code_type in the ObjectCode class definition. + # nvjitlink supports other options for input type + if code_type == "ptx": + return nvjitlink.InputType.PTX + elif code_type == "cubin": + return nvjitlink.InputType.CUBIN + elif code_type == "fatbin": + return nvjitlink.InputType.FATBIN + elif code_type == "ltoir": + return nvjitlink.InputType.LTOIR + elif code_type == "object": + return nvjitlink.InputType.OBJECT + else: + raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") + + @property + def handle(self) -> int: + return self._handle + + def __del__(self): + if self._handle is not None: + nvjitlink.destroy(self._handle) + self._handle = None diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 1cb9811b..e10b36a8 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -31,3 +31,8 @@ CUDA compilation toolchain :toctree: generated/ Program + Linker + + :template: dataclass.rst + + LinkerOptions \ No newline at end of file diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 48e24786..4c615eb3 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -6,4 +6,5 @@ maxdepth: 3 --- 0.1.0 + 0.2.0 ``` diff --git a/cuda_core/docs/source/release/0.1.0-notes.md b/cuda_core/docs/source/release/0.1.0-notes.md index 2131ed90..1ebb41f9 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.md +++ b/cuda_core/docs/source/release/0.1.0-notes.md @@ -1,9 +1,9 @@ # `cuda.core` Release notes -Released on Nov 8, 2024 +Released on Nov XX, 2024 ## Hightlights -- Initial beta release +- Initial EA1 (early access) release - Supports all platforms that CUDA is supported - Supports all CUDA 11.x/12.x drivers - Supports all CUDA 11.x/12.x Toolkits diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.md new file mode 100644 index 00000000..1a047511 --- /dev/null +++ b/cuda_core/docs/source/release/0.2.0-notes.md @@ -0,0 +1,11 @@ +# `cuda.core` Release notes + +Released on Nov , 2024 + +## Hightlights +- Addition of 
the Linker class which gives object oriented and pythonic access to the nvJitLink API. + +## Limitations + +-The Linker class only supports cuda >=12. For cuda <12, use low level cuLink API. + diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py new file mode 100644 index 00000000..6011bf4f --- /dev/null +++ b/cuda_core/tests/test_linker.py @@ -0,0 +1,101 @@ +import pytest + +from cuda.core.experimental._linker import Linker, LinkerOptions +from cuda.core.experimental._module import ObjectCode +from cuda.core.experimental._program import Program + +ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper +empty_entrypoint_kernel = "__global__ void A() {}" +empty_kernel = "__device__ void B() {}" +addition_kernel = "__device__ int C(int a, int b) { return a + b; }" + + +@pytest.fixture(scope="module") +def compile_ptx_functions(init_cuda): + object_code_a_ptx = Program(empty_entrypoint_kernel, "c++").compile("ptx") + object_code_b_ptx = Program(empty_kernel, "c++").compile("ptx") + object_code_c_ptx = Program(addition_kernel, "c++").compile("ptx") + + return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx + + +@pytest.fixture(scope="module") +def compile_ltoir_functions(init_cuda): + object_code_a_ltoir = Program(empty_entrypoint_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_b_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_c_ltoir = Program(addition_kernel, "c++").compile("ltoir", options=("-dlto",)) + + return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir + + +@pytest.mark.parametrize( + "options", + [ + LinkerOptions(arch=ARCH), + LinkerOptions(arch=ARCH, max_register_count=32), + LinkerOptions(arch=ARCH, time=True), + LinkerOptions(arch=ARCH, verbose=True), + LinkerOptions(arch=ARCH, optimization_level=3), + LinkerOptions(arch=ARCH, debug=True), + LinkerOptions(arch=ARCH, lineinfo=True), + LinkerOptions(arch=ARCH, ftz=True), + LinkerOptions(arch=ARCH, prec_div=True), + LinkerOptions(arch=ARCH, prec_sqrt=True), + LinkerOptions(arch=ARCH, fma=True), + LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), + LinkerOptions(arch=ARCH, variables_used=["var1"]), + LinkerOptions(arch=ARCH, optimize_unused_variables=True), + LinkerOptions(arch=ARCH, xptxas=["-v"]), + LinkerOptions(arch=ARCH, split_compile=0), + LinkerOptions(arch=ARCH, split_compile_extended=1), + LinkerOptions(arch=ARCH, jump_table_density=100), + LinkerOptions(arch=ARCH, no_cache=True), + ], +) +def test_linker_init(compile_ptx_functions, options): + linker = Linker(*compile_ptx_functions, options=options) + object_code = linker.link("cubin") + assert isinstance(object_code, ObjectCode) + + +def test_linker_init_invalid_arch(): + options = LinkerOptions(arch=None) + with pytest.raises(TypeError): + Linker(options) + + +def test_linker_link_ptx(compile_ltoir_functions): + options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) + linker = Linker(*compile_ltoir_functions, options=options) + linked_code = linker.link("ptx") + assert isinstance(linked_code, ObjectCode) + + +def test_linker_link_cubin(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linked_code = linker.link("cubin") + assert isinstance(linked_code, ObjectCode) + + +def test_linker_link_invalid_target_type(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + with pytest.raises(ValueError): + 
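+        # Linker.link accepts only "cubin" or "ptx"; any other target type raises ValueError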
linker.link("invalid_target") + + +def test_linker_get_error_log(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linker.link("cubin") + log = linker.get_error_log() + assert isinstance(log, str) + + +def test_linker_get_info_log(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linker.link("cubin") + log = linker.get_info_log() + assert isinstance(log, str) From 1b5f01974d92e2fef030ecc9e1da701ae221cd30 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:09:37 -0800 Subject: [PATCH 002/111] fix commit --- cuda_core/cuda/core/experimental/_linker.py | 26 ++++++++++----------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index e80bfe61..3a47b439 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -1,3 +1,4 @@ +import weakref from dataclasses import dataclass from typing import List, Optional @@ -152,9 +153,7 @@ class LinkerOptions: xptxas: Optional[List[str]] = None split_compile: Optional[int] = None split_compile_extended: Optional[int] = None - jump_table_density: Optional[int] = None no_cache: Optional[bool] = None - device_stack_protector: Optional[bool] = None def __post_init__(self): self.formatted_options = [] @@ -199,26 +198,25 @@ def __post_init__(self): self.formatted_options.append(f"-split-compile={self.split_compile}") if self.split_compile_extended is not None: self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") - if self.jump_table_density is not None: - self.formatted_options.append(f"-jump-table-density={self.jump_table_density}") if self.no_cache is not None: self.formatted_options.append("-no-cache") - if self.device_stack_protector is not None: - self.formatted_options.append("-device-stack-protector") class Linker: - __slots__ = "_handle" + __slots__ = ("__weakref__", "_handle", "_options") def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): - self._handle = None options = check_or_create_options(LinkerOptions, options, "Linker options") self._handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) - if object_codes is not None: - for code in object_codes: - assert isinstance(code, ObjectCode) - self._add_code_object(code) + if len(object_codes) == 0: + raise ValueError("At least one ObjectCode object must be provided") + + for code in object_codes: + assert isinstance(code, ObjectCode) + self._add_code_object(code) + + weakref.finalize(self, self.close) def _add_code_object(self, object_code: ObjectCode): data = object_code._module @@ -233,7 +231,7 @@ def _add_code_object(self, object_code: ObjectCode): def link(self, target_type) -> ObjectCode: nvjitlink.complete(self._handle) - if target_type not in ["cubin", "ptx"]: + if target_type not in ("cubin", "ptx"): raise ValueError(f"Unsupported target type: {target_type}") code = None if target_type == "cubin": @@ -279,7 +277,7 @@ def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: def handle(self) -> int: return self._handle - def __del__(self): + def close(self): if self._handle is not None: nvjitlink.destroy(self._handle) self._handle = None From 58ce68f06841ebaae4bb6c4789c68fb8a16ec1e6 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:10:42 -0800 Subject: [PATCH 003/111] 
fix commit --- cuda_core/cuda/core/experimental/_linker.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 3a47b439..518c48d3 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -113,25 +113,11 @@ class LinkerOptions: Default: 1 Acceptable value type: int Maps to: -split-compile-extended= - jump_table_density : int, optional - When doing LTO, specify the case density percentage in switch statements, and use it as a minimal threshold to - determine whether jump table (brx.idx instruction) will be used to implement a switch statement. Default value - is 101. The percentage ranges from 0 to 101 inclusively. - Default: 101 - Acceptable value type: int - Maps to: -jump-table-density= no_cache : bool, optional Do not cache the intermediate steps of nvJitLink. Default: False Acceptable value type: bool Maps to: -no-cache - device_stack_protector : bool, optional - Enable stack canaries in device code. Stack canaries make it more difficult to exploit certain types of memory - safety bugs involving stack-local variables. The compiler uses heuristics to assess the risk of such a bug in - each function. Only those functions which are deemed high-risk make use of a stack canary. - Default: False - Acceptable value type: bool - Maps to: -device-stack-protector """ arch: str From ce8a47233786466d2e4d7335e518e0070dcf86ea Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:12:14 -0800 Subject: [PATCH 004/111] keep self._options for debugging --- cuda_core/cuda/core/experimental/_linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 518c48d3..cf4c6ccd 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -192,7 +192,7 @@ class Linker: __slots__ = ("__weakref__", "_handle", "_options") def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): - options = check_or_create_options(LinkerOptions, options, "Linker options") + self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") self._handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) if len(object_codes) == 0: From ab35b373ddda7b4177853d1c348a3b6027fb391f Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:13:41 -0800 Subject: [PATCH 005/111] revert release notes change --- cuda_core/docs/source/release/0.1.0-notes.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/docs/source/release/0.1.0-notes.md b/cuda_core/docs/source/release/0.1.0-notes.md index 1ebb41f9..2131ed90 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.md +++ b/cuda_core/docs/source/release/0.1.0-notes.md @@ -1,9 +1,9 @@ # `cuda.core` Release notes -Released on Nov XX, 2024 +Released on Nov 8, 2024 ## Hightlights -- Initial EA1 (early access) release +- Initial beta release - Supports all platforms that CUDA is supported - Supports all CUDA 11.x/12.x drivers - Supports all CUDA 11.x/12.x Toolkits From b82591fc70adb26023ddaf1ddc0fb2e5c4881b4c Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:14:31 -0800 Subject: [PATCH 006/111] update linker test --- cuda_core/tests/test_linker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cuda_core/tests/test_linker.py 
b/cuda_core/tests/test_linker.py index 6011bf4f..2dfac375 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -1,8 +1,7 @@ import pytest -from cuda.core.experimental._linker import Linker, LinkerOptions +from cuda.core.experimental import Linker, LinkerOptions, Program from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._program import Program ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper empty_entrypoint_kernel = "__global__ void A() {}" From 265ba01c7ef586177afb877e0f2bbea42c80528d Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 13:26:20 -0800 Subject: [PATCH 007/111] update the test --- cuda_core/tests/test_linker.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 2dfac375..7db6ed9f 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -9,7 +9,7 @@ addition_kernel = "__device__ int C(int a, int b) { return a + b; }" -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): object_code_a_ptx = Program(empty_entrypoint_kernel, "c++").compile("ptx") object_code_b_ptx = Program(empty_kernel, "c++").compile("ptx") @@ -18,7 +18,7 @@ def compile_ptx_functions(init_cuda): return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): object_code_a_ltoir = Program(empty_entrypoint_kernel, "c++").compile("ltoir", options=("-dlto",)) object_code_b_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) @@ -47,7 +47,6 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, xptxas=["-v"]), LinkerOptions(arch=ARCH, split_compile=0), LinkerOptions(arch=ARCH, split_compile_extended=1), - LinkerOptions(arch=ARCH, jump_table_density=100), LinkerOptions(arch=ARCH, no_cache=True), ], ) From 7d5c742d1d9a2ac1676811002b1d511ff655dc57 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 27 Nov 2024 16:39:48 -0800 Subject: [PATCH 008/111] add the system class --- cuda_core/cuda/core/experimental/__init__.py | 1 + cuda_core/cuda/core/experimental/_system.py | 66 ++++++++++++++++++++ cuda_core/docs/source/api.rst | 1 + cuda_core/docs/source/api_private.rst | 1 + cuda_core/docs/source/release.md | 2 + cuda_core/docs/source/release/0.2.0-notes.md | 10 +++ cuda_core/tests/test_system.py | 37 +++++++++++ 7 files changed, 118 insertions(+) create mode 100644 cuda_core/cuda/core/experimental/_system.py create mode 100644 cuda_core/docs/source/release/0.2.0-notes.md create mode 100644 cuda_core/tests/test_system.py diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 9b978398..25f5f82c 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -7,3 +7,4 @@ from cuda.core.experimental._launcher import LaunchConfig, launch from cuda.core.experimental._program import Program from cuda.core.experimental._stream import Stream, StreamOptions +from cuda.core.experimental._system import system diff --git a/cuda_core/cuda/core/experimental/_system.py b/cuda_core/cuda/core/experimental/_system.py new file mode 100644 index 00000000..58fbd6ae --- /dev/null +++ b/cuda_core/cuda/core/experimental/_system.py @@ -0,0 +1,66 @@ +from typing import Tuple +from cuda import cuda, cudart +from cuda.core.experimental._device import 
Device +from cuda.core.experimental._utils import handle_return + +class System: + """ Provide information about the cuda system. + This class is a singleton and should not be instantiated directly. + """ + + _instance = None + + def __new__(cls): + if cls._instance is None: + cls._instance = super(System, cls).__new__(cls) + return cls._instance + + def __init__(self): + if hasattr(self, '_initialized') and self._initialized: + return + self._initialized = True + + @property + def driver_version(self) -> Tuple[int, int]: + """ + Query the CUDA driver version. + + Returns + ------- + tuple of int + A 2-tuple of (major, minor) version numbers. + """ + version = handle_return(cuda.cuDriverGetVersion()) + major = version // 1000 + minor = (version % 1000) // 10 + return (major, minor) + + @property + def num_devices(self) -> int: + """ + Query the number of available GPUs. + + Returns + ------- + int + The number of available GPU devices. + """ + return handle_return(cudart.cudaGetDeviceCount()) + + @property + def devices(self) -> tuple: + """ + Query the available device instances. + + Returns + ------- + tuple of Device + A tuple containing instances of available devices. + """ + total = self.num_devices + return tuple(Device(device_id) for device_id in range(total)) + +system = System() +system.__doc__ = """ +Singleton instance of the :obj:`~cuda.core.experimental._system.System` class. +""" diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 1cb9811b..3d2a8481 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -16,6 +16,7 @@ CUDA runtime Device launch + system :template: dataclass.rst diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index f100eb7c..a833d69c 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -16,6 +16,7 @@ CUDA runtime _memory.Buffer _stream.Stream _event.Event + _system.System CUDA compilation toolchain diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 48e24786..5cbaa7f2 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -6,4 +6,6 @@ maxdepth: 3 --- 0.1.0 + 0.2.0 + ``` diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.md new file mode 100644 index 00000000..e1a3c4ec --- /dev/null +++ b/cuda_core/docs/source/release/0.2.0-notes.md @@ -0,0 +1,10 @@ +# `cuda.core` Release notes + +Released on , 2024 + +## Hightlights +- Addition of the system singleton + +## Limitations + + diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py new file mode 100644 index 00000000..548e8685 --- /dev/null +++ b/cuda_core/tests/test_system.py @@ -0,0 +1,37 @@ +# test_System.py + +try: + from cuda.bindings import driver, runtime +except ImportError: + from cuda import cuda as driver + from cuda import cudart as runtime + +from cuda.core.experimental import Device, System + +from cuda.core.experimental import Device +from cuda.core.experimental._utils import handle_return + +def test_System_singleton(): + System1 = System + System2 = System + assert System1 is System2, "System is not a singleton" + +def test_driver_version(): + driver_version = System.driver_version + print(driver_version) + version = handle_return(driver.cuDriverGetVersion()) + expected_driver_version = (version // 1000, (version % 1000) // 10) + assert driver_version == expected_driver_version, "Driver version does not match expected 
value" + +def test_num_devices(): + num_devices = System.num_devices + expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) + assert num_devices == expected_num_devices, "Number of devices does not match expected value" + +def test_devices(): + devices = System.devices + expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) + expected_devices = tuple(Device(device_id) for device_id in range(expected_num_devices)) + assert len(devices) == len(expected_devices), "Number of devices does not match expected value" + for device, expected_device in zip(devices, expected_devices): + assert device.device_id == expected_device.device_id, "Device ID does not match expected value" From 4c4acef6f840ebce13dcf41317f447d448420ae6 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Thu, 28 Nov 2024 16:48:59 -0800 Subject: [PATCH 009/111] fix old test change --- cuda_core/tests/test_system.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py index 548e8685..400d5e22 100644 --- a/cuda_core/tests/test_system.py +++ b/cuda_core/tests/test_system.py @@ -1,35 +1,33 @@ -# test_System.py - try: from cuda.bindings import driver, runtime except ImportError: from cuda import cuda as driver from cuda import cudart as runtime -from cuda.core.experimental import Device, System +from cuda.core.experimental import Device, system from cuda.core.experimental import Device from cuda.core.experimental._utils import handle_return -def test_System_singleton(): - System1 = System - System2 = System - assert System1 is System2, "System is not a singleton" +def test_system_singleton(): + system1 = system + system2 = system + assert system1 is system2, "system is not a singleton" def test_driver_version(): - driver_version = System.driver_version + driver_version = system.driver_version print(driver_version) version = handle_return(driver.cuDriverGetVersion()) expected_driver_version = (version // 1000, (version % 1000) // 10) assert driver_version == expected_driver_version, "Driver version does not match expected value" def test_num_devices(): - num_devices = System.num_devices + num_devices = system.num_devices expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) assert num_devices == expected_num_devices, "Number of devices does not match expected value" def test_devices(): - devices = System.devices + devices = system.devices expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) expected_devices = tuple(Device(device_id) for device_id in range(expected_num_devices)) assert len(devices) == len(expected_devices), "Number of devices does not match expected value" From 36f045c6a1a834fc28f8652d348ac281ca827a15 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Thu, 28 Nov 2024 16:50:18 -0800 Subject: [PATCH 010/111] run ruff manually --- cuda_core/cuda/core/experimental/_system.py | 4 +++- cuda_core/tests/test_system.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_system.py b/cuda_core/cuda/core/experimental/_system.py index 58fbd6ae..c1ce9402 100644 --- a/cuda_core/cuda/core/experimental/_system.py +++ b/cuda_core/cuda/core/experimental/_system.py @@ -1,8 +1,10 @@ from typing import Tuple + from cuda import cuda, cudart from cuda.core.experimental._device import Device from cuda.core.experimental._utils import handle_return + class System: """ Provide information about the cuda system. 
This class is a singleton and should not be instantiated directly. @@ -12,7 +14,7 @@ class System: def __new__(cls): if cls._instance is None: - cls._instance = super(System, cls).__new__(cls) + cls._instance = super().__new__(cls) return cls._instance def __init__(self): diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py index 400d5e22..a093dc94 100644 --- a/cuda_core/tests/test_system.py +++ b/cuda_core/tests/test_system.py @@ -5,10 +5,9 @@ from cuda import cudart as runtime from cuda.core.experimental import Device, system - -from cuda.core.experimental import Device from cuda.core.experimental._utils import handle_return + def test_system_singleton(): system1 = system system2 = system From 319a372b75b0530e7f4600bbdc34197db2bf420c Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 29 Nov 2024 11:17:00 -0800 Subject: [PATCH 011/111] merge with main for ruff --- cuda_core/cuda/core/experimental/_device.py | 12 +++++-- cuda_core/cuda/core/experimental/_memory.py | 37 ++++++++++++++++++++- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 0c03c789..a5cd4bc7 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -7,7 +7,7 @@ from cuda import cuda, cudart from cuda.core.experimental._context import Context, ContextOptions -from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool +from cuda.core.experimental._memory import Buffer, MemoryResource, _AsyncMemoryResource, _DefaultAsyncMempool from cuda.core.experimental._stream import Stream, StreamOptions, default_stream from cuda.core.experimental._utils import ComputeCapability, CUDAError, handle_return, precondition @@ -62,7 +62,13 @@ def __new__(cls, device_id=None): for dev_id in range(total): dev = super().__new__(cls) dev._id = dev_id - dev._mr = _DefaultAsyncMempool(dev_id) + # If the device is in TCC mode, or does not support memory pools for some other reason, + # use the AsyncMemoryResource which does not use memory pools. + if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 0: + dev._mr = _AsyncMemoryResource(dev_id) + else: + dev._mr = _DefaultAsyncMempool(dev_id) + dev._has_inited = False _tls.devices.append(dev) @@ -70,7 +76,7 @@ def __new__(cls, device_id=None): def _check_context_initialized(self, *args, **kwargs): if not self._has_inited: - raise CUDAError("the device is not yet initialized, perhaps you forgot to call .set_current() first?") + raise CUDAError("the device is not yet initialized, " "perhaps you forgot to call .set_current() first?") @property def device_id(self) -> int: diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 415b5151..50f8a260 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -42,7 +42,11 @@ class Buffer: """ # TODO: handle ownership? 
(_mr could be None) - __slots__ = ("_ptr", "_size", "_mr") + __slots__ = ( + "_ptr", + "_size", + "_mr", + ) def __init__(self, ptr, size, mr: MemoryResource = None): self._ptr = ptr @@ -286,3 +290,34 @@ def is_host_accessible(self) -> bool: @property def device_id(self) -> int: raise RuntimeError("the pinned memory resource is not bound to any GPU") + + +class _AsyncMemoryResource(MemoryResource): + __slots__ = ("_dev_id",) + + def __init__(self, dev_id): + self._handle = None + self._dev_id = dev_id + + def allocate(self, size, stream=None) -> Buffer: + if stream is None: + stream = default_stream() + ptr = handle_return(cuda.cuMemAllocAsync(size, stream._handle)) + return Buffer(ptr, size, self) + + def deallocate(self, ptr, size, stream=None): + if stream is None: + stream = default_stream() + handle_return(cuda.cuMemFreeAsync(ptr, stream._handle)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return False + + @property + def device_id(self) -> int: + return self._dev_id From 19e3a4f4b54a4b9562742d8575ad1f8ca7e6e0a7 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 29 Nov 2024 11:18:04 -0800 Subject: [PATCH 012/111] fix tuple reformat --- cuda_core/cuda/core/experimental/_memory.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 50f8a260..26a9dd82 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -42,11 +42,7 @@ class Buffer: """ # TODO: handle ownership? (_mr could be None) - __slots__ = ( - "_ptr", - "_size", - "_mr", - ) + __slots__ = ("_ptr", "_size", "_mr") def __init__(self, ptr, size, mr: MemoryResource = None): self._ptr = ptr From 5e84da7cf888214ba940176a28089467f2afb055 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 29 Nov 2024 11:18:45 -0800 Subject: [PATCH 013/111] fix tuple reformat --- cuda_core/cuda/core/experimental/_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index a5cd4bc7..a15eef36 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -76,7 +76,7 @@ def __new__(cls, device_id=None): def _check_context_initialized(self, *args, **kwargs): if not self._has_inited: - raise CUDAError("the device is not yet initialized, " "perhaps you forgot to call .set_current() first?") + raise CUDAError("the device is not yet initialized, perhaps you forgot to call .set_current() first?") @property def device_id(self) -> int: From 122d25c01f4b8bbc02239bb1c2e58005c4bdb506 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 09:25:39 -0800 Subject: [PATCH 014/111] switch to sync alloc and free --- cuda_core/cuda/core/experimental/_device.py | 6 +++--- cuda_core/cuda/core/experimental/_memory.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index a15eef36..889c20a0 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -7,7 +7,7 @@ from cuda import cuda, cudart from cuda.core.experimental._context import Context, ContextOptions -from cuda.core.experimental._memory import Buffer, MemoryResource, _AsyncMemoryResource, _DefaultAsyncMempool +from cuda.core.experimental._memory 
import Buffer, MemoryResource, _AsyncMemoryResource, _DefaultAsyncMempool
+from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource
 from cuda.core.experimental._stream import Stream, StreamOptions, default_stream
 from cuda.core.experimental._utils import ComputeCapability, CUDAError, handle_return, precondition
@@ -63,9 +63,9 @@ def __new__(cls, device_id=None):
             dev = super().__new__(cls)
             dev._id = dev_id
             # If the device is in TCC mode, or does not support memory pools for some other reason,
-            # use the AsyncMemoryResource which does not use memory pools.
+            # use the SynchronousMemoryResource which does not use memory pools.
             if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 0:
-                dev._mr = _AsyncMemoryResource(dev_id)
+                dev._mr = _SynchronousMemoryResource(dev_id)
             else:
                 dev._mr = _DefaultAsyncMempool(dev_id)
diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py
index 26a9dd82..16dd97d7 100644
--- a/cuda_core/cuda/core/experimental/_memory.py
+++ b/cuda_core/cuda/core/experimental/_memory.py
@@ -288,7 +288,7 @@ def device_id(self) -> int:
         raise RuntimeError("the pinned memory resource is not bound to any GPU")
 
 
-class _AsyncMemoryResource(MemoryResource):
+class _SynchronousMemoryResource(MemoryResource):
     __slots__ = ("_dev_id",)
 
     def __init__(self, dev_id):
@@ -298,13 +298,13 @@ def __init__(self, dev_id):
     def allocate(self, size, stream=None) -> Buffer:
         if stream is None:
             stream = default_stream()
-        ptr = handle_return(cuda.cuMemAllocAsync(size, stream._handle))
+        ptr = handle_return(cuda.cuMemAlloc(size, stream._handle))
         return Buffer(ptr, size, self)
 
     def deallocate(self, ptr, size, stream=None):
         if stream is None:
             stream = default_stream()
-        handle_return(cuda.cuMemFreeAsync(ptr, stream._handle))
+        handle_return(cuda.cuMemFree(ptr, stream._handle))
 
     @property
     def is_device_accessible(self) -> bool:

From 5f8ff802ee9efba50492870410d14d8633471cde Mon Sep 17 00:00:00 2001
From: Keenan Simpson
Date: Mon, 2 Dec 2024 09:28:56 -0800
Subject: [PATCH 015/111] Update cuda_core/docs/source/release/0.2.0-notes.md

Co-authored-by: Leo Fang
---
 cuda_core/docs/source/release/0.2.0-notes.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.md
index e1a3c4ec..57a3254d 100644
--- a/cuda_core/docs/source/release/0.2.0-notes.md
+++ b/cuda_core/docs/source/release/0.2.0-notes.md
@@ -3,7 +3,7 @@
 Released on , 2024
 
 ## Hightlights
-- Addition of the system singleton
+- Add a `cuda.core.experimental.system` module for querying system- or process- wide information.
## Limitations From d1d6928d6be107087f534a7dc37bf9c8dbdc9463 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Mon, 2 Dec 2024 09:44:28 -0800 Subject: [PATCH 016/111] Update cuda_core/docs/source/release.md Co-authored-by: Leo Fang --- cuda_core/docs/source/release.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 5cbaa7f2..8c810273 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -6,6 +6,6 @@ maxdepth: 3 --- 0.1.0 - 0.2.0 + 0.1.1 ``` From cfa9d167bcd39504fa8a6963f1fafd836cda2623 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 09:56:23 -0800 Subject: [PATCH 017/111] address comments --- cuda_core/docs/source/release/0.2.0-notes.md | 10 ---------- cuda_core/tests/test_system.py | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) delete mode 100644 cuda_core/docs/source/release/0.2.0-notes.md diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.2.0-notes.md deleted file mode 100644 index e1a3c4ec..00000000 --- a/cuda_core/docs/source/release/0.2.0-notes.md +++ /dev/null @@ -1,10 +0,0 @@ -# `cuda.core` Release notes - -Released on , 2024 - -## Hightlights -- Addition of the system singleton - -## Limitations - - diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py index a093dc94..893d1206 100644 --- a/cuda_core/tests/test_system.py +++ b/cuda_core/tests/test_system.py @@ -11,7 +11,7 @@ def test_system_singleton(): system1 = system system2 = system - assert system1 is system2, "system is not a singleton" + assert id(system1) == id(system2), "system is not a singleton" def test_driver_version(): driver_version = system.driver_version From 8e43cd26b30d0b34526260c5cd60bdadeecb3e4d Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 09:57:16 -0800 Subject: [PATCH 018/111] rename release file --- cuda_core/docs/source/release/0.1.1-notes.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 cuda_core/docs/source/release/0.1.1-notes.md diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md new file mode 100644 index 00000000..404ecb85 --- /dev/null +++ b/cuda_core/docs/source/release/0.1.1-notes.md @@ -0,0 +1,7 @@ +# `cuda.core` Release notes + +Released on , 2024 + +## Hightlights +- Add a `cuda.core.experimental.system` module for querying system- or process- wide information. + From bff2627fa70a446c337fc987d8165f78987feae9 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 10:33:30 -0800 Subject: [PATCH 019/111] update link style to match other PR --- cuda_core/cuda/core/experimental/_system.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_system.py b/cuda_core/cuda/core/experimental/_system.py index c1ce9402..2cecbd98 100644 --- a/cuda_core/cuda/core/experimental/_system.py +++ b/cuda_core/cuda/core/experimental/_system.py @@ -64,5 +64,5 @@ def devices(self) -> tuple: system = System() system.__doc__ = """ -Singleton instance of the :obj:`~cuda.core.experimental._system.System` class. +Singleton instance of the :obj:`_system.System` class. 
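+It exposes the CUDA driver version, the device count, and the available :obj:`Device` instances.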
""" From c8a8dcb0a682ab754e1d036c68dc312a0b97608d Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 13:24:50 -0800 Subject: [PATCH 020/111] save --- cuda_core/cuda/core/experimental/_linker.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index cf4c6ccd..d7dd273c 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import weakref from dataclasses import dataclass from typing import List, Optional From 5e3bdcd97e6ccc13513f15849f106addab9fb72a Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 13:26:15 -0800 Subject: [PATCH 021/111] add copyright header --- cuda_core/cuda/core/experimental/_system.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_system.py b/cuda_core/cuda/core/experimental/_system.py index 2cecbd98..258f9bcd 100644 --- a/cuda_core/cuda/core/experimental/_system.py +++ b/cuda_core/cuda/core/experimental/_system.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from typing import Tuple from cuda import cuda, cudart From e9661895fff2a5d928be73b521aee30e4960935e Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 13:33:31 -0800 Subject: [PATCH 022/111] add docstring, copyright header, and switch finalizer pattern --- cuda_core/cuda/core/experimental/_linker.py | 68 ++++++++++++++++----- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index d7dd273c..1a99f355 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -193,11 +193,49 @@ def __post_init__(self): class Linker: - __slots__ = ("__weakref__", "_handle", "_options") + """ + Linker class for managing the linking of object codes with specified options. + + Parameters + ---------- + object_codes : ObjectCode + One or more ObjectCode objects to be linked. + options : LinkerOptions, optional + Options for the linker. If not provided, default options will be used. + + Attributes + ---------- + _options : LinkerOptions + The options used for the linker. + _handle : handle + The handle to the linker created by nvjitlink. + + Methods + ------- + _add_code_object(object_code) + Adds an object code to the linker. + close() + Closes the linker and releases resources. 
+ """ + + class _MembersNeededForFinalize: + __slots__ = ("handle",) + + def __init__(self, program_obj, handle): + self.handle = handle + weakref.finalize(program_obj, self.close) + + def close(self): + if self.handle is not None: + nvjitlink.destroy(self.handle) + self.handle = None + + __slots__ = ("__weakref__", "_mnff", "_options") def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") - self._handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) + self._mnff.handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) + self._mnff = Linker._MembersNeededForFinalize(self, None) if len(object_codes) == 0: raise ValueError("At least one ObjectCode object must be provided") @@ -212,7 +250,7 @@ def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) nvjitlink.add_data( - self._handle, + self._mnff.handle, self._input_type_from_code_type(object_code._code_type), data, len(data), @@ -220,31 +258,31 @@ def _add_code_object(self, object_code: ObjectCode): ) def link(self, target_type) -> ObjectCode: - nvjitlink.complete(self._handle) + nvjitlink.complete(self._mnff.handle) if target_type not in ("cubin", "ptx"): raise ValueError(f"Unsupported target type: {target_type}") code = None if target_type == "cubin": - cubin_size = nvjitlink.get_linked_cubin_size(self._handle) + cubin_size = nvjitlink.get_linked_cubin_size(self._mnff.handle) code = bytearray(cubin_size) - nvjitlink.get_linked_cubin(self._handle, code) + nvjitlink.get_linked_cubin(self._mnff.handle, code) else: - ptx_size = nvjitlink.get_linked_ptx_size(self._handle) + ptx_size = nvjitlink.get_linked_ptx_size(self._mnff.handle) code = bytearray(ptx_size) - nvjitlink.get_linked_ptx(self._handle, code) + nvjitlink.get_linked_ptx(self._mnff.handle, code) return ObjectCode(bytes(code), target_type) def get_error_log(self) -> str: - log_size = nvjitlink.get_error_log_size(self._handle) + log_size = nvjitlink.get_error_log_size(self._mnff.handle) log = bytearray(log_size) - nvjitlink.get_error_log(self._handle, log) + nvjitlink.get_error_log(self._mnff.handle, log) return log.decode() def get_info_log(self) -> str: - log_size = nvjitlink.get_info_log_size(self._handle) + log_size = nvjitlink.get_info_log_size(self._mnff.handle) log = bytearray(log_size) - nvjitlink.get_info_log(self._handle, log) + nvjitlink.get_info_log(self._mnff.handle, log) return log.decode() def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: @@ -265,9 +303,7 @@ def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: @property def handle(self) -> int: - return self._handle + return self._mnff.handle def close(self): - if self._handle is not None: - nvjitlink.destroy(self._handle) - self._handle = None + self._mnff.close() From c626b956bc7ca1cc963b89bceafc3dfc3b0f84aa Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Mon, 23 Sep 2024 18:03:30 +0530 Subject: [PATCH 023/111] Adding support for CI testing. 
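
Roughly, the test action added here funnels into the same entrypoint wrapper
as the build step; a CI run amounts to (paths as laid out in this series):

    ./continuous_integration/scripts/entrypoint \
        ./continuous_integration/scripts/test ci

where "ci" is the test-options value forwarded from the workflow's build-type.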
--- .github/actions/build/action.yml | 2 +- .github/actions/test/action.yml | 37 ++++++++++ .github/workflows/gh-build-and-test.yml | 23 ++++++- .github/workflows/gh-test.yml | 80 ++++++++++++++++++++++ continuous_integration/scripts/setup-utils | 23 +++++++ continuous_integration/scripts/test | 33 +++++++++ 6 files changed, 195 insertions(+), 3 deletions(-) create mode 100644 .github/actions/test/action.yml create mode 100644 .github/workflows/gh-test.yml create mode 100755 continuous_integration/scripts/test diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index 952fb9cd..b6741343 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -54,7 +54,7 @@ runs: --rm "${{ inputs.docker-image }}" \ /bin/bash -c "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint ${{ env.REPO_DIR }}/continuous_integration/scripts/build ${{ inputs.build-type}} ${{ inputs.target-device }}" - - if: ${{ !inputs.use-container }} + - if: ${{ !inputs.use-container && steps.cache-build.outputs.cache-hit != 'true'}} name: Build (without container) shell: bash --noprofile --norc -xeuo pipefail {0} run: | diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml new file mode 100644 index 00000000..a11a9938 --- /dev/null +++ b/.github/actions/test/action.yml @@ -0,0 +1,37 @@ +name: test + +description: Run tests in specified project + +inputs: + test-options: + required: true + type: string + runner-has-gpu: + required: true + type: boolean + description: "The runner has GPU(s)." + +runs: + using: composite + steps: + - if: ${{ inputs.runner-has-gpu == true }} + name: Run nvidia-smi to make sure GPU is working + shell: bash --noprofile --norc -xeuo pipefail {0} + run: nvidia-smi + + - name: Download build artifacts + uses: actions/download-artifact@v4 + with: + name: ${{ env.ARTIFACT_NAME }} + path: ${{ env.ARTIFACTS_DIR }} + + - name: Display structure of downloaded artifacts + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + pwd + ls -lahR $ARTIFACTS_DIR + + - name: Run test / analysis + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint" "${{ env.REPO_DIR }}/continuous_integration/scripts/test" ${{ inputs.test-options }} diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 430fbf5b..1df308ab 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -27,9 +27,28 @@ jobs: with: client-repo: ${{ github.event.repository.name }} target-device: ${{ inputs.target-device }} - runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu16') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} + runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu8') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} + build-type: ${{ inputs.build-type }} + use-container: false + host-platform: ${{ inputs.host-platform }} + dependencies-file: "" + build-mode: ${{ inputs.build-mode }} + upload-enabled: ${{ inputs.upload-enabled }} + secrets: inherit + + test: + if: ${{ github.repository_owner == 'nvidia' }} + needs: + - build + uses: + ./.github/workflows/gh-test.yml + with: + client-repo: ${{ github.event.repository.name }} + target-device: ${{ inputs.target-device }} + test-options: ${{ inputs.build-type }} + 
runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} + runner-has-gpu: ${{ inputs.host-platform == 'linux-x64' }} build-type: ${{ inputs.build-type }} - use-container: ${{ inputs.host-platform == 'linux-x64' || inputs.host-platform == 'linux-aarch64'}} host-platform: ${{ inputs.host-platform }} dependencies-file: "" build-mode: ${{ inputs.build-mode }} diff --git a/.github/workflows/gh-test.yml b/.github/workflows/gh-test.yml new file mode 100644 index 00000000..74f1c520 --- /dev/null +++ b/.github/workflows/gh-test.yml @@ -0,0 +1,80 @@ +name: Test + +on: + workflow_call: + inputs: + client-repo: + required: true + type: string + target-device: + required: true + type: string + test-options: + required: true + type: string + runs-on: + required: true + type: string + runner-has-gpu: + required: true + type: boolean + description: "The runner has GPU(s)." + build-type: + required: true + type: string + description: One of ci / release + host-platform: + required: true + type: string + dependencies-file: + required: true + type: string + description: path to versions.json relative to the target repo dir + build-mode: + required: true + type: string + upload-enabled: + required: true + type: boolean + python-version: + required: false + type: string + +jobs: + build: + name: Test (${{ inputs.host-platform }}, ${{ inputs.target-device }}, ${{ inputs.build-type }}, CMake build-mode=${{ inputs.build-mode }}, Python "${{ inputs.python-version }}", Use container=${{ inputs.use-container }} ) + + permissions: + id-token: write # This is required for configure-aws-credentials + contents: read # This is required for actions/checkout + + runs-on: ${{ inputs.runs-on }} + + container: + options: -u root --security-opt seccomp=unconfined --privileged --shm-size 16g + image: condaforge/miniforge3:latest + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + + steps: + - name: Checkout ${{ inputs.client-repo }} + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup + uses: ./.github/actions/setup + with: + client-repo: ${{ inputs.client-repo }} + build-type: ${{ inputs.build-type }} + target-device: "${{ inputs.target-device }}" + host-platform: ${{ inputs.host-platform }} + build-mode: ${{ inputs.build-mode }} + upload-enabled: ${{ inputs.upload-enabled }} + python-version: ${{ inputs.python-version }} + + - name: Call test action + uses: ./.github/actions/test + with: + test-options: ${{ inputs.test-options }} + runner-has-gpu: ${{ inputs.runner-has-gpu }} diff --git a/continuous_integration/scripts/setup-utils b/continuous_integration/scripts/setup-utils index 62579e63..f8faefa4 100755 --- a/continuous_integration/scripts/setup-utils +++ b/continuous_integration/scripts/setup-utils @@ -151,6 +151,29 @@ init_build_env() { make-conda-env "$BUILD_TYPE"; + activate_conda_env; + conda_info; +} + +init_test_env() { + set -x; + + . 
conda-utils; + + export TEST_TYPE=$1 + + set -xeuo pipefail; + + set_base_defs; + + cd "$PREBUILD_DIR" + + # setup_test_env; + + cd "$REPO_DIR"; + + make-conda-env "$TEST_TYPE"; + activate_conda_env; conda_info; } \ No newline at end of file diff --git a/continuous_integration/scripts/test b/continuous_integration/scripts/test new file mode 100755 index 00000000..e8c56c52 --- /dev/null +++ b/continuous_integration/scripts/test @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +test_ci() { + set -xeou pipefail + + cd "${ARTIFACTS_DIR}" + + activate_conda_env; + + pip install *.whl + + cd "${REPO_DIR}" + + python -m pytest +} + +test_project() { + set -xeou pipefail + + export PYTHONUNBUFFERED=1 + + . setup-utils; + init_test_env "$@"; + + git config --global --add safe.directory "$REPO_DIR/.git" + + case "${TEST_TYPE}" in + ci) test_ci;; + *) return 1;; + esac +} + +(test_project "$@"); From 5467b5284c3467f8f3a41570a7df108049389f42 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Wed, 27 Nov 2024 04:18:20 +0530 Subject: [PATCH 024/111] Supply python-version. --- .github/workflows/gh-build-and-test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 1df308ab..adf8477a 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -34,6 +34,7 @@ jobs: dependencies-file: "" build-mode: ${{ inputs.build-mode }} upload-enabled: ${{ inputs.upload-enabled }} + python-version: ${{ inputs.python-version }} secrets: inherit test: From c78ebfdcaa92ea40ade3014a3da952e5a11dc8e6 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Wed, 27 Nov 2024 04:36:17 +0530 Subject: [PATCH 025/111] Update test driver to test bindings and core separately. --- .github/actions/test/action.yml | 22 +++++++++++++++++----- continuous_integration/scripts/test | 13 +++++++++---- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index a11a9938..018db9aa 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -19,17 +19,29 @@ runs: shell: bash --noprofile --norc -xeuo pipefail {0} run: nvidia-smi - - name: Download build artifacts + - name: Download bindings build artifacts uses: actions/download-artifact@v4 with: - name: ${{ env.ARTIFACT_NAME }} - path: ${{ env.ARTIFACTS_DIR }} + name: ${{ env.BINDINGS_ARTIFACT_NAME }} + path: ${{ env.BINDINGS_ARTIFACTS_DIR }} - - name: Display structure of downloaded artifacts + - name: Display structure of downloaded bindings artifacts shell: bash --noprofile --norc -xeuo pipefail {0} run: | pwd - ls -lahR $ARTIFACTS_DIR + ls -lahR $BINDINGS_ARTIFACTS_DIR + + - name: Download core build artifacts + uses: actions/download-artifact@v4 + with: + name: ${{ env.CORE_ARTIFACT_NAME }} + path: ${{ env.CORE_ARTIFACTS_DIR }} + + - name: Display structure of downloaded core build artifacts + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + pwd + ls -lahR $CORE_ARTIFACTS_DIR - name: Run test / analysis shell: bash --noprofile --norc -xeuo pipefail {0} diff --git a/continuous_integration/scripts/test b/continuous_integration/scripts/test index e8c56c52..96bdf8d5 100755 --- a/continuous_integration/scripts/test +++ b/continuous_integration/scripts/test @@ -3,15 +3,20 @@ test_ci() { set -xeou pipefail - cd "${ARTIFACTS_DIR}" - activate_conda_env; + cd "${BINDINGS_ARTIFACTS_DIR}" + pip install *.whl + + cd "${CORE_ARTIFACTS_DIR}" pip install *.whl - cd "${REPO_DIR}" + cd 
"${REPO_DIR}/cuda_python/cuda_bindings" + python -m pytest tests/ + + cd "${REPO_DIR}/cuda_python/cuda_core" + python -m pytest tests/ - python -m pytest } test_project() { From e5bf104ddf6ec94ec36ed74b68d148003fe8b6da Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Mon, 23 Sep 2024 18:03:30 +0530 Subject: [PATCH 026/111] Adding support for CI testing. --- .github/workflows/gh-build-and-test.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index adf8477a..acf7e509 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -56,3 +56,21 @@ jobs: upload-enabled: ${{ inputs.upload-enabled }} python-version: ${{ inputs.python-version }} secrets: inherit + + test: + if: ${{ github.repository_owner == 'nvidia' }} + uses: + ./.github/workflows/gh-build.yml + with: + client-repo: ${{ github.event.repository.name }} + target-device: ${{ inputs.target-device }} + test-options: ${{ inputs.build-type }} + runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} + runner-has-gpu: ${{ inputs.host-platform == 'linux-x64' }} + build-type: ${{ inputs.build-type }} + use-container: false + host-platform: ${{ inputs.host-platform }} + dependencies-file: "" + build-mode: ${{ inputs.build-mode }} + upload-enabled: ${{ inputs.upload-enabled }} + secrets: inherit From 360e1b2d23f064eec19e3cd0c87d5bd823a41901 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Mon, 23 Sep 2024 18:05:07 +0530 Subject: [PATCH 027/111] Adding support for CI testing. Attempt 2. --- .github/workflows/gh-build-and-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index acf7e509..1d9bb4ea 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -60,7 +60,7 @@ jobs: test: if: ${{ github.repository_owner == 'nvidia' }} uses: - ./.github/workflows/gh-build.yml + ./.github/workflows/gh-test.yml with: client-repo: ${{ github.event.repository.name }} target-device: ${{ inputs.target-device }} From 67b7aed7ad1efbbbf017c221a5ef8223bed0c032 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Mon, 23 Sep 2024 18:06:10 +0530 Subject: [PATCH 028/111] Adding support for CI testing. Attempt 3. --- .github/workflows/gh-build-and-test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 1d9bb4ea..65a4a72a 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -59,6 +59,8 @@ jobs: test: if: ${{ github.repository_owner == 'nvidia' }} + needs: + - build uses: ./.github/workflows/gh-test.yml with: From 6fab977584c4f4a5a5cf2f1f1cef3719fe8ed4d5 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Mon, 23 Sep 2024 18:15:07 +0530 Subject: [PATCH 029/111] Use container for tests on the GPU runner. 
--- .github/workflows/gh-build-and-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 65a4a72a..167a5546 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -70,7 +70,7 @@ jobs: runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} runner-has-gpu: ${{ inputs.host-platform == 'linux-x64' }} build-type: ${{ inputs.build-type }} - use-container: false + use-container: ${{ inputs.host-platform == 'linux-x64' }} host-platform: ${{ inputs.host-platform }} dependencies-file: "" build-mode: ${{ inputs.build-mode }} From f2a0939aadf87d92c59e84584ded24f6a77b077a Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Mon, 23 Sep 2024 18:27:47 +0530 Subject: [PATCH 030/111] Use container for tests on the GPU runner. Attempt 2. --- .github/workflows/gh-build-and-test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 167a5546..185ede7a 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -70,7 +70,6 @@ jobs: runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} runner-has-gpu: ${{ inputs.host-platform == 'linux-x64' }} build-type: ${{ inputs.build-type }} - use-container: ${{ inputs.host-platform == 'linux-x64' }} host-platform: ${{ inputs.host-platform }} dependencies-file: "" build-mode: ${{ inputs.build-mode }} From 508a83c072b2ec46750d174926d32684a3207092 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Fri, 15 Nov 2024 20:23:05 +0530 Subject: [PATCH 031/111] Remove build caching. --- .github/actions/build/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index b6741343..952fb9cd 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -54,7 +54,7 @@ runs: --rm "${{ inputs.docker-image }}" \ /bin/bash -c "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint ${{ env.REPO_DIR }}/continuous_integration/scripts/build ${{ inputs.build-type}} ${{ inputs.target-device }}" - - if: ${{ !inputs.use-container && steps.cache-build.outputs.cache-hit != 'true'}} + - if: ${{ !inputs.use-container }} name: Build (without container) shell: bash --noprofile --norc -xeuo pipefail {0} run: | From 72062aa1ef77f6c76e6f6be8ac1bfa480d9abe4b Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Fri, 15 Nov 2024 20:32:58 +0530 Subject: [PATCH 032/111] Hard select Build (without container). 
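Composite-action inputs are always strings in GitHub Actions, so boolean-looking
conditions on `use-container` are easy to get wrong; the step below is temporarily
hard-selected and the input echoed to see what the expression actually receives.
A rough Python analogue of the underlying trap (a sketch, nothing beyond the
standard library):

    def is_enabled(value: str) -> bool:
        # bool("false") is True because any non-empty string is truthy;
        # compare against the literal instead of relying on truthiness.
        return value.strip().lower() == "true"

    assert is_enabled("true")
    assert not is_enabled("false")
    assert bool("false")  # the trap: non-empty strings are truthy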
--- .github/actions/build/action.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index 952fb9cd..583f5775 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -54,10 +54,11 @@ runs: --rm "${{ inputs.docker-image }}" \ /bin/bash -c "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint ${{ env.REPO_DIR }}/continuous_integration/scripts/build ${{ inputs.build-type}} ${{ inputs.target-device }}" - - if: ${{ !inputs.use-container }} - name: Build (without container) + #- if: ${{ inputs.use-container == false }} + - name: Build (without container) shell: bash --noprofile --norc -xeuo pipefail {0} run: | + echo "inputs.use-container=${{ inputs.use-container }}" "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint" "${{ env.REPO_DIR }}/continuous_integration/scripts/build" "${{ inputs.build-type}}" "${{ inputs.target-device }}" - name: Display structure of the bindings artifacts folder (post build) From 32ca908e36e261008ad8cec93b094e0db3b5a8cd Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Fri, 15 Nov 2024 20:41:03 +0530 Subject: [PATCH 033/111] Use container with preinstalled conda for build. --- .github/workflows/gh-build-and-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 185ede7a..e6e40624 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -29,7 +29,7 @@ jobs: target-device: ${{ inputs.target-device }} runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu8') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} build-type: ${{ inputs.build-type }} - use-container: false + use-container: ${{ inputs.host-platform == 'linux-x64' || inputs.host-platform == 'linux-aarch64'}} host-platform: ${{ inputs.host-platform }} dependencies-file: "" build-mode: ${{ inputs.build-mode }} From 970a8e5c4b43be8c6331904993674487b039c3eb Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Fri, 15 Nov 2024 20:42:04 +0530 Subject: [PATCH 034/111] Use container with preinstalled conda for build. Attempt 2. --- .github/actions/build/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index 583f5775..7a09ed14 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -54,8 +54,8 @@ runs: --rm "${{ inputs.docker-image }}" \ /bin/bash -c "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint ${{ env.REPO_DIR }}/continuous_integration/scripts/build ${{ inputs.build-type}} ${{ inputs.target-device }}" - #- if: ${{ inputs.use-container == false }} - - name: Build (without container) + - if: ${{ inputs.use-container == false }} + name: Build (without container) shell: bash --noprofile --norc -xeuo pipefail {0} run: | echo "inputs.use-container=${{ inputs.use-container }}" From be969e595ad6b66d16f9d32e96f6df550890bd70 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Fri, 15 Nov 2024 20:44:07 +0530 Subject: [PATCH 035/111] Use container with preinstalled conda for build. Attempt 3. 
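This attempt returns to `${{ !inputs.use-container }}` for the no-container path.
For reference, the per-platform routing used throughout these workflows reduces to
a small lookup; a Python sketch (runner labels copied from the expressions in this
series, function name hypothetical):

    def pick_runner(host_platform: str, for_tests: bool) -> str:
        # Only linux-x64 tests need the V100 GPU runner; builds and the
        # other platforms stay on CPU-only runners.
        runners = {
            "linux-x64": "linux-amd64-gpu-v100-latest-1" if for_tests else "linux-amd64-cpu8",
            "linux-aarch64": "linux-arm64-cpu16",
            "mac": "macos-latest",
        }
        return runners[host_platform]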
--- .github/actions/build/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index 7a09ed14..e5f67202 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -54,7 +54,7 @@ runs: --rm "${{ inputs.docker-image }}" \ /bin/bash -c "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint ${{ env.REPO_DIR }}/continuous_integration/scripts/build ${{ inputs.build-type}} ${{ inputs.target-device }}" - - if: ${{ inputs.use-container == false }} + - if: ${{ !inputs.use-container }} name: Build (without container) shell: bash --noprofile --norc -xeuo pipefail {0} run: | From 3382d68b42b17b39535dcbe3b5e58e54b4695f11 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Fri, 29 Nov 2024 09:12:32 +0530 Subject: [PATCH 036/111] Updated paths. --- continuous_integration/scripts/test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/continuous_integration/scripts/test b/continuous_integration/scripts/test index 96bdf8d5..cbee6998 100755 --- a/continuous_integration/scripts/test +++ b/continuous_integration/scripts/test @@ -11,10 +11,10 @@ test_ci() { cd "${CORE_ARTIFACTS_DIR}" pip install *.whl - cd "${REPO_DIR}/cuda_python/cuda_bindings" + cd "${REPO_DIR}/cuda_bindings" python -m pytest tests/ - cd "${REPO_DIR}/cuda_python/cuda_core" + cd "${REPO_DIR}/cuda_core" python -m pytest tests/ } From a9ed0c6a038bbcb605f5016df584aff081d378cf Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Fri, 29 Nov 2024 09:16:00 +0530 Subject: [PATCH 037/111] Removed duplicate tests section. --- .github/workflows/gh-build-and-test.yml | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index e6e40624..9b414a22 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -56,22 +56,3 @@ jobs: upload-enabled: ${{ inputs.upload-enabled }} python-version: ${{ inputs.python-version }} secrets: inherit - - test: - if: ${{ github.repository_owner == 'nvidia' }} - needs: - - build - uses: - ./.github/workflows/gh-test.yml - with: - client-repo: ${{ github.event.repository.name }} - target-device: ${{ inputs.target-device }} - test-options: ${{ inputs.build-type }} - runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} - runner-has-gpu: ${{ inputs.host-platform == 'linux-x64' }} - build-type: ${{ inputs.build-type }} - host-platform: ${{ inputs.host-platform }} - dependencies-file: "" - build-mode: ${{ inputs.build-mode }} - upload-enabled: ${{ inputs.upload-enabled }} - secrets: inherit From 17c3e106ef82d8a5dcc8bae0c2c8ea484ccc2dda Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 09:15:46 -0800 Subject: [PATCH 038/111] address comments --- cuda_core/cuda/core/experimental/_linker.py | 52 +++++++++++---------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 1a99f355..bb66adde 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -234,8 +234,9 @@ def close(self): def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): self._options = options = check_or_create_options(LinkerOptions, options, "Linker 
options") - self._mnff.handle = nvjitlink.create(len(options.formatted_options), options.formatted_options) - self._mnff = Linker._MembersNeededForFinalize(self, None) + self._mnff = Linker._MembersNeededForFinalize( + self, nvjitlink.create(len(options.formatted_options), options.formatted_options) + ) if len(object_codes) == 0: raise ValueError("At least one ObjectCode object must be provided") @@ -244,8 +245,6 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): assert isinstance(code, ObjectCode) self._add_code_object(code) - weakref.finalize(self, self.close) - def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) @@ -257,19 +256,21 @@ def _add_code_object(self, object_code: ObjectCode): f"{object_code._handle}_{object_code._code_type}", ) + _get_linked_methods = { + "cubin": (nvjitlink.get_linked_cubin_size, nvjitlink.get_linked_cubin), + "ptx": (nvjitlink.get_linked_ptx_size, nvjitlink.get_linked_ptx), + } + def link(self, target_type) -> ObjectCode: nvjitlink.complete(self._mnff.handle) - if target_type not in ("cubin", "ptx"): + get_linked = self._get_linked_methods.get(target_type) + if get_linked is None: raise ValueError(f"Unsupported target type: {target_type}") - code = None - if target_type == "cubin": - cubin_size = nvjitlink.get_linked_cubin_size(self._mnff.handle) - code = bytearray(cubin_size) - nvjitlink.get_linked_cubin(self._mnff.handle, code) - else: - ptx_size = nvjitlink.get_linked_ptx_size(self._mnff.handle) - code = bytearray(ptx_size) - nvjitlink.get_linked_ptx(self._mnff.handle, code) + + get_size, get_code = get_linked + size = get_size(self._mnff.handle) + code = bytearray(size) + get_code(self._mnff.handle, code) return ObjectCode(bytes(code), target_type) @@ -285,21 +286,22 @@ def get_info_log(self) -> str: nvjitlink.get_info_log(self._mnff.handle, log) return log.decode() + _input_types = { + "ptx": nvjitlink.InputType.PTX, + "cubin": nvjitlink.InputType.CUBIN, + "fatbin": nvjitlink.InputType.FATBIN, + "ltoir": nvjitlink.InputType.LTOIR, + "object": nvjitlink.InputType.OBJECT, + } + def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: # this list is based on the supported values for code_type in the ObjectCode class definition. 
# nvjitlink supports other options for input type - if code_type == "ptx": - return nvjitlink.InputType.PTX - elif code_type == "cubin": - return nvjitlink.InputType.CUBIN - elif code_type == "fatbin": - return nvjitlink.InputType.FATBIN - elif code_type == "ltoir": - return nvjitlink.InputType.LTOIR - elif code_type == "object": - return nvjitlink.InputType.OBJECT - else: + input_type = self._input_types.get(code_type) + + if input_type is None: raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") + return input_type @property def handle(self) -> int: From 7f846263d9feffe601948eb0b82b3668b6855713 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 09:18:44 -0800 Subject: [PATCH 039/111] rename release notes --- cuda_core/docs/source/release/{0.2.0-notes.md => 0.1.1-notes.md} | 1 - 1 file changed, 1 deletion(-) rename cuda_core/docs/source/release/{0.2.0-notes.md => 0.1.1-notes.md} (93%) diff --git a/cuda_core/docs/source/release/0.2.0-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md similarity index 93% rename from cuda_core/docs/source/release/0.2.0-notes.md rename to cuda_core/docs/source/release/0.1.1-notes.md index 1a047511..0dbd49ce 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.md +++ b/cuda_core/docs/source/release/0.1.1-notes.md @@ -8,4 +8,3 @@ Released on Nov , 2024 ## Limitations -The Linker class only supports cuda >=12. For cuda <12, use low level cuLink API. - From 5207558076d366abf483e72daedc7fd6dce378e6 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 09:42:12 -0800 Subject: [PATCH 040/111] rename release notes --- cuda_core/docs/source/release.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 4c615eb3..55090b0b 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -5,6 +5,6 @@ maxdepth: 3 --- + 0.1.1 0.1.0 - 0.2.0 ``` From 14b9c6766160bcb23227bc303117d9137f8569e0 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 10:53:20 -0800 Subject: [PATCH 041/111] fix the test to not use a global function, which was causing swallowed link errors --- cuda_core/tests/test_linker.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 7db6ed9f..1cb444fb 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,15 +4,15 @@ from cuda.core.experimental._module import ObjectCode ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper -empty_entrypoint_kernel = "__global__ void A() {}" -empty_kernel = "__device__ void B() {}" +empty_kernel = "__device__ void A() {}" +basic_kernel = "__device__ int B() { return 0; }" addition_kernel = "__device__ int C(int a, int b) { return a + b; }" @pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): - object_code_a_ptx = Program(empty_entrypoint_kernel, "c++").compile("ptx") - object_code_b_ptx = Program(empty_kernel, "c++").compile("ptx") + object_code_a_ptx = Program(empty_kernel, "c++").compile("ptx") + object_code_b_ptx = Program(basic_kernel, "c++").compile("ptx") object_code_c_ptx = Program(addition_kernel, "c++").compile("ptx") return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx @@ -20,8 +20,8 @@ def compile_ptx_functions(init_cuda): @pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): - object_code_a_ltoir = Program(empty_entrypoint_kernel, "c++").compile("ltoir", 
options=("-dlto",)) - object_code_b_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_a_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_b_ltoir = Program(basic_kernel, "c++").compile("ltoir", options=("-dlto",)) object_code_c_ltoir = Program(addition_kernel, "c++").compile("ltoir", options=("-dlto",)) return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir From 27ec6d3dabd76238da0974c945c65b7c81ae7c22 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 13:26:01 -0800 Subject: [PATCH 042/111] add release notes --- cuda_core/docs/source/release.md | 1 + cuda_core/docs/source/release/0.1.1-notes.md | 0 2 files changed, 1 insertion(+) create mode 100644 cuda_core/docs/source/release/0.1.1-notes.md diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 48e24786..55090b0b 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -5,5 +5,6 @@ maxdepth: 3 --- + 0.1.1 0.1.0 ``` diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md new file mode 100644 index 00000000..e69de29b From 42c4b45241f2c5f08ca96c7560788fec769ef1c0 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 13:28:58 -0800 Subject: [PATCH 043/111] make true the default path --- cuda_core/cuda/core/experimental/_device.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 889c20a0..88676cf6 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -64,10 +64,10 @@ def __new__(cls, device_id=None): dev._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. 
-        if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 0:
-            dev._mr = _SynchronousMemoryResource(dev_id)
-        else:
+        if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 1:
             dev._mr = _DefaultAsyncMempool(dev_id)
+        else:
+            dev._mr = _SynchronousMemoryResource(dev_id)
         dev._has_inited = False
         _tls.devices.append(dev)

From 64b1f22e9fae282739c6cf9aaf4005a0f289914b Mon Sep 17 00:00:00 2001
From: ksimpson
Date: Tue, 3 Dec 2024 15:38:41 -0800
Subject: [PATCH 044/111] minor rewording

---
 cuda_core/docs/source/release/0.1.1-notes.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md
index e69de29b..d80e6ef4 100644
--- a/cuda_core/docs/source/release/0.1.1-notes.md
+++ b/cuda_core/docs/source/release/0.1.1-notes.md
@@ -0,0 +1,7 @@
+# `cuda.core` Release notes
+
+Released on Dec X, 2024
+
+## Highlights
+- Support TCC devices with a default synchronous memory resource to avoid the use of memory pools
+

From a7f8c309ad84245b26333062c473baf5326ae191 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Wed, 4 Dec 2024 01:14:28 +0000
Subject: [PATCH 045/111] WIP: enable cuLink APIs from driver

---
 cuda_core/cuda/core/experimental/_linker.py | 253 +++++++++++++++-----
 cuda_core/tests/test_linker.py              |  32 +--
 2 files changed, 209 insertions(+), 76 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
index bb66adde..57a10866 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -2,13 +2,64 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

+import ctypes
 import weakref
 from dataclasses import dataclass
 from typing import List, Optional

-from cuda.bindings import nvjitlink
+from cuda import cuda
 from cuda.core.experimental._module import ObjectCode
-from cuda.core.experimental._utils import check_or_create_options
+from cuda.core.experimental._utils import check_or_create_options, handle_return
+
+# TODO: revisit this treatment for py313t builds
+_driver = None  # populated if nvJitLink cannot be used
+_driver_input_types = None  # populated if nvJitLink cannot be used
+_driver_ver = None
+_inited = False
+_nvjitlink = None  # populated if nvJitLink can be used
+_nvjitlink_input_types = None  # populated if nvJitLink cannot be used
+
+
+def _lazy_init():
+    global _inited
+    if _inited:
+        return
+
+    global _driver, _driver_input_types, _driver_ver, _nvjitlink, _nvjitlink_input_types
+    _driver_ver = handle_return(cuda.cuDriverGetVersion())
+    _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10)
+    try:
+        from cuda.bindings import nvjitlink
+        from cuda.bindings._internal import nvjitlink as inner_nvjitlink
+    except ImportError:
+        # binding is not available
+        nvjitlink = None
+    else:
+        if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0:
+            # binding is available, but nvJitLink is not installed
+            nvjitlink = None
+        elif _driver_ver > nvjitlink.version():
+            # TODO: nvJitLink is not new enough, warn?
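+            # (the tuple comparison above works because _driver_ver was
+            # normalized to a (major, minor) pair, matching what
+            # nvjitlink.version() returns, as the comparison implies)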
+ pass + if nvjitlink: + _nvjitlink = nvjitlink + _nvjitlink_input_types = { + "ptx": _nvjitlink.InputType.PTX, + "cubin": _nvjitlink.InputType.CUBIN, + "fatbin": _nvjitlink.InputType.FATBIN, + "ltoir": _nvjitlink.InputType.LTOIR, + "object": _nvjitlink.InputType.OBJECT, + } + else: + from cuda import cuda as _driver + + _driver_input_types = { + "ptx": _driver.CUjitInputType.CU_JIT_INPUT_PTX, + "cubin": _driver.CUjitInputType.CU_JIT_INPUT_CUBIN, + "fatbin": _driver.CUjitInputType.CU_JIT_INPUT_FATBINARY, + "object": _driver.CUjitInputType.CU_JIT_INPUT_OBJECT, + } + _inited = True @dataclass @@ -146,7 +197,14 @@ class LinkerOptions: no_cache: Optional[bool] = None def __post_init__(self): + _lazy_init() self.formatted_options = [] + if _nvjitlink: + self._init_nvjitlink() + else: + self._init_driver() + + def _init_nvjitlink(self): if self.arch is not None: self.formatted_options.append(f"-arch={self.arch}") if self.max_register_count is not None: @@ -191,6 +249,67 @@ def __post_init__(self): if self.no_cache is not None: self.formatted_options.append("-no-cache") + def _init_driver(self): + self.option_keys = [] + # allocate 4 KiB each for info/error logs + size = 4194304 + self.formatted_options.extend((bytearray(size), size, bytearray(size), size)) + self.option_keys.extend( + ( + _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER, + _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER, + _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + ) + ) + + if self.arch is not None: + arch = self.arch.split("_")[-1].upper() + self.formatted_options.append(getattr(_driver.CUjit_target, f"CU_TARGET_COMPUTE_{arch}")) + self.option_keys.append(_driver.CUjit_option.CU_JIT_TARGET) + # if self.max_register_count is not None: + # self.formatted_options.append(f"-maxrregcount={self.max_register_count}") + # if self.time is not None: + # self.formatted_options.append("-time") + if self.verbose is not None: + self.formatted_options.append(1) # ctypes.c_int32(1)) + self.option_keys.append(_driver.CUjit_option.CU_JIT_LOG_VERBOSE) + # if self.link_time_optimization is not None: + # self.formatted_options.append("-lto") + # if self.ptx is not None: + # self.formatted_options.append("-ptx") + # if self.optimization_level is not None: + # self.formatted_options.append(f"-O{self.optimization_level}") + # if self.debug is not None: + # self.formatted_options.append("-g") + # if self.lineinfo is not None: + # self.formatted_options.append("-lineinfo") + # if self.ftz is not None: + # self.formatted_options.append(f"-ftz={'true' if self.ftz else 'false'}") + # if self.prec_div is not None: + # self.formatted_options.append(f"-prec-div={'true' if self.prec_div else 'false'}") + # if self.prec_sqrt is not None: + # self.formatted_options.append(f"-prec-sqrt={'true' if self.prec_sqrt else 'false'}") + # if self.fma is not None: + # self.formatted_options.append(f"-fma={'true' if self.fma else 'false'}") + # if self.kernels_used is not None: + # for kernel in self.kernels_used: + # self.formatted_options.append(f"-kernels-used={kernel}") + # if self.variables_used is not None: + # for variable in self.variables_used: + # self.formatted_options.append(f"-variables-used={variable}") + # if self.optimize_unused_variables is not None: + # self.formatted_options.append("-optimize-unused-variables") + # if self.xptxas is not None: + # for opt in self.xptxas: + # self.formatted_options.append(f"-Xptxas={opt}") + # if self.split_compile is not None: + # 
self.formatted_options.append(f"-split-compile={self.split_compile}") + # if self.split_compile_extended is not None: + # self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") + # if self.no_cache is not None: + # self.formatted_options.append("-no-cache") + class Linker: """ @@ -202,45 +321,41 @@ class Linker: One or more ObjectCode objects to be linked. options : LinkerOptions, optional Options for the linker. If not provided, default options will be used. - - Attributes - ---------- - _options : LinkerOptions - The options used for the linker. - _handle : handle - The handle to the linker created by nvjitlink. - - Methods - ------- - _add_code_object(object_code) - Adds an object code to the linker. - close() - Closes the linker and releases resources. """ class _MembersNeededForFinalize: - __slots__ = ("handle",) + __slots__ = ("handle", "use_nvjitlink") - def __init__(self, program_obj, handle): + def __init__(self, program_obj, handle, use_nvjitlink): self.handle = handle + self.use_nvjitlink = use_nvjitlink weakref.finalize(program_obj, self.close) def close(self): if self.handle is not None: - nvjitlink.destroy(self.handle) + if self.use_nvjitlink: + _nvjitlink.destroy(self.handle) + else: + handle_return(_driver.cuLinkDestroy(self.handle)) self.handle = None __slots__ = ("__weakref__", "_mnff", "_options") def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): - self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") - self._mnff = Linker._MembersNeededForFinalize( - self, nvjitlink.create(len(options.formatted_options), options.formatted_options) - ) - if len(object_codes) == 0: raise ValueError("At least one ObjectCode object must be provided") + self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") + if _nvjitlink: + handle = _nvjitlink.create(len(options.formatted_options), options.formatted_options) + use_nvjitlink = True + else: + handle = handle_return( + _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options) + ) + use_nvjitlink = False + self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink) + for code in object_codes: assert isinstance(code, ObjectCode) self._add_code_object(code) @@ -248,56 +363,74 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) - nvjitlink.add_data( - self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), - data, - len(data), - f"{object_code._handle}_{object_code._code_type}", - ) - - _get_linked_methods = { - "cubin": (nvjitlink.get_linked_cubin_size, nvjitlink.get_linked_cubin), - "ptx": (nvjitlink.get_linked_ptx_size, nvjitlink.get_linked_ptx), - } + if _nvjitlink: + _nvjitlink.add_data( + self._mnff.handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}", + ) + else: + handle_return( + _driver.cuLinkAddData( + self._mnff.handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}".encode(), + 0, + None, + None, + ) + ) def link(self, target_type) -> ObjectCode: - nvjitlink.complete(self._mnff.handle) - get_linked = self._get_linked_methods.get(target_type) - if get_linked is None: + if target_type not in ("cubin", "ptx"): raise 
ValueError(f"Unsupported target type: {target_type}") + if _nvjitlink: + _nvjitlink.complete(self._mnff.handle) + if target_type == "cubin": + get_size = _nvjitlink.get_linked_cubin_size + get_code = _nvjitlink.get_linked_cubin + else: + get_size = _nvjitlink.get_linked_ptx_size + get_code = _nvjitlink.get_linked_ptx - get_size, get_code = get_linked - size = get_size(self._mnff.handle) - code = bytearray(size) - get_code(self._mnff.handle, code) + size = get_size(self._mnff.handle) + code = bytearray(size) + get_code(self._mnff.handle, code) + else: + addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) + code = (ctypes.c_char * size).from_address(addr) return ObjectCode(bytes(code), target_type) def get_error_log(self) -> str: - log_size = nvjitlink.get_error_log_size(self._mnff.handle) - log = bytearray(log_size) - nvjitlink.get_error_log(self._mnff.handle, log) + if _nvjitlink: + log_size = _nvjitlink.get_error_log_size(self._mnff.handle) + log = bytearray(log_size) + _nvjitlink.get_error_log(self._mnff.handle, log) + else: + log = self._options.formatted_options[2] return log.decode() def get_info_log(self) -> str: - log_size = nvjitlink.get_info_log_size(self._mnff.handle) - log = bytearray(log_size) - nvjitlink.get_info_log(self._mnff.handle, log) + if _nvjitlink: + log_size = _nvjitlink.get_info_log_size(self._mnff.handle) + log = bytearray(log_size) + _nvjitlink.get_info_log(self._mnff.handle, log) + else: + log = self._options.formatted_options[0] return log.decode() - _input_types = { - "ptx": nvjitlink.InputType.PTX, - "cubin": nvjitlink.InputType.CUBIN, - "fatbin": nvjitlink.InputType.FATBIN, - "ltoir": nvjitlink.InputType.LTOIR, - "object": nvjitlink.InputType.OBJECT, - } - - def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType: + def _input_type_from_code_type(self, code_type: str): # this list is based on the supported values for code_type in the ObjectCode class definition. 
- # nvjitlink supports other options for input type - input_type = self._input_types.get(code_type) + # nvJitLink/driver support other options for input type + if _nvjitlink: + input_type = _nvjitlink_input_types.get(code_type) + else: + input_type = _driver_input_types.get(code_type) if input_type is None: raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 7db6ed9f..4d10f423 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -31,23 +31,23 @@ def compile_ltoir_functions(init_cuda): "options", [ LinkerOptions(arch=ARCH), - LinkerOptions(arch=ARCH, max_register_count=32), - LinkerOptions(arch=ARCH, time=True), + # LinkerOptions(arch=ARCH, max_register_count=32), + # LinkerOptions(arch=ARCH, time=True), LinkerOptions(arch=ARCH, verbose=True), - LinkerOptions(arch=ARCH, optimization_level=3), - LinkerOptions(arch=ARCH, debug=True), - LinkerOptions(arch=ARCH, lineinfo=True), - LinkerOptions(arch=ARCH, ftz=True), - LinkerOptions(arch=ARCH, prec_div=True), - LinkerOptions(arch=ARCH, prec_sqrt=True), - LinkerOptions(arch=ARCH, fma=True), - LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), - LinkerOptions(arch=ARCH, variables_used=["var1"]), - LinkerOptions(arch=ARCH, optimize_unused_variables=True), - LinkerOptions(arch=ARCH, xptxas=["-v"]), - LinkerOptions(arch=ARCH, split_compile=0), - LinkerOptions(arch=ARCH, split_compile_extended=1), - LinkerOptions(arch=ARCH, no_cache=True), + # LinkerOptions(arch=ARCH, optimization_level=3), + # LinkerOptions(arch=ARCH, debug=True), + # LinkerOptions(arch=ARCH, lineinfo=True), + # LinkerOptions(arch=ARCH, ftz=True), + # LinkerOptions(arch=ARCH, prec_div=True), + # LinkerOptions(arch=ARCH, prec_sqrt=True), + # LinkerOptions(arch=ARCH, fma=True), + # LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), + # LinkerOptions(arch=ARCH, variables_used=["var1"]), + # LinkerOptions(arch=ARCH, optimize_unused_variables=True), + # LinkerOptions(arch=ARCH, xptxas=["-v"]), + # LinkerOptions(arch=ARCH, split_compile=0), + # LinkerOptions(arch=ARCH, split_compile_extended=1), + # LinkerOptions(arch=ARCH, no_cache=True), ], ) def test_linker_init(compile_ptx_functions, options): From 028a5c234b4a40e6298ea0e0a4d950013e20ebf5 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 18:23:31 -0800 Subject: [PATCH 046/111] save progress to remote --- cuda_core/cuda/core/experimental/_linker.py | 104 +++++++++++--------- cuda_core/tests/test_linker.py | 32 +++--- 2 files changed, 76 insertions(+), 60 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 57a10866..304b3771 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -29,6 +29,7 @@ def _lazy_init(): _driver_ver = handle_return(cuda.cuDriverGetVersion()) _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10) try: + raise ImportError from cuda.bindings import nvjitlink from cuda.bindings._internal import nvjitlink as inner_nvjitlink except ImportError: @@ -267,48 +268,66 @@ def _init_driver(self): arch = self.arch.split("_")[-1].upper() self.formatted_options.append(getattr(_driver.CUjit_target, f"CU_TARGET_COMPUTE_{arch}")) self.option_keys.append(_driver.CUjit_option.CU_JIT_TARGET) - # if self.max_register_count is not None: - # self.formatted_options.append(f"-maxrregcount={self.max_register_count}") - # if self.time is not None: - # 
self.formatted_options.append("-time") + if self.max_register_count is not None: + self.formatted_options.append(self.max_register_count) + self.option_keys.append(_driver.CUjit_option.CU_JIT_MAX_REGISTERS) + if self.time is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_WALL_TIME) if self.verbose is not None: - self.formatted_options.append(1) # ctypes.c_int32(1)) + self.formatted_options.append(1) # ctypes.c_int32(1) self.option_keys.append(_driver.CUjit_option.CU_JIT_LOG_VERBOSE) - # if self.link_time_optimization is not None: - # self.formatted_options.append("-lto") - # if self.ptx is not None: - # self.formatted_options.append("-ptx") - # if self.optimization_level is not None: - # self.formatted_options.append(f"-O{self.optimization_level}") - # if self.debug is not None: - # self.formatted_options.append("-g") - # if self.lineinfo is not None: - # self.formatted_options.append("-lineinfo") - # if self.ftz is not None: - # self.formatted_options.append(f"-ftz={'true' if self.ftz else 'false'}") - # if self.prec_div is not None: - # self.formatted_options.append(f"-prec-div={'true' if self.prec_div else 'false'}") - # if self.prec_sqrt is not None: - # self.formatted_options.append(f"-prec-sqrt={'true' if self.prec_sqrt else 'false'}") - # if self.fma is not None: - # self.formatted_options.append(f"-fma={'true' if self.fma else 'false'}") - # if self.kernels_used is not None: - # for kernel in self.kernels_used: - # self.formatted_options.append(f"-kernels-used={kernel}") - # if self.variables_used is not None: - # for variable in self.variables_used: - # self.formatted_options.append(f"-variables-used={variable}") - # if self.optimize_unused_variables is not None: - # self.formatted_options.append("-optimize-unused-variables") - # if self.xptxas is not None: - # for opt in self.xptxas: - # self.formatted_options.append(f"-Xptxas={opt}") - # if self.split_compile is not None: - # self.formatted_options.append(f"-split-compile={self.split_compile}") - # if self.split_compile_extended is not None: - # self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") - # if self.no_cache is not None: - # self.formatted_options.append("-no-cache") + if self.link_time_optimization is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_LTO) + if self.ptx is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO) + if self.optimization_level is not None: + self.formatted_options.append(self.optimization_level) + self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZATION_LEVEL) + if self.debug is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_DEBUG_INFO) + if self.lineinfo is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO) + if self.ftz is not None: + self.formatted_options.append(1 if self.ftz else 0) + self.option_keys.append(_driver.CUjit_option.CU_JIT_FTZ) + if self.prec_div is not None: + self.formatted_options.append(1 if self.prec_div else 0) + self.option_keys.append(_driver.CUjit_option.CU_JIT_PREC_DIV) + if self.prec_sqrt is not None: + self.formatted_options.append(1 if self.prec_sqrt else 0) + self.option_keys.append(_driver.CUjit_option.CU_JIT_PREC_SQRT) + if 
self.fma is not None: + self.formatted_options.append(1 if self.fma else 0) + self.option_keys.append(_driver.CUjit_option.CU_JIT_FMA) + if self.kernels_used is not None: + for kernel in self.kernels_used: + self.formatted_options.append(kernel) + self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_KERNEL_NAMES) + if self.variables_used is not None: + for variable in self.variables_used: + self.formatted_options.append(variable) + self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_VARIABLE_NAMES) + if self.optimize_unused_variables is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES) + if self.xptxas is not None: + for opt in self.xptxas: + self.formatted_options.append(opt) + self.option_keys.append(_driver.CUjit_option.CU_JIT_FAST_COMPILE) + if self.split_compile is not None: + self.formatted_options.append(self.split_compile) + self.option_keys.append(_driver.CUjit_option.CU_JIT_THREADS_PER_BLOCK) + if self.split_compile_extended is not None: + self.formatted_options.append(self.split_compile_extended) + self.option_keys.append(_driver.CUjit_option.CU_JIT_MIN_CTA_PER_SM) + if self.no_cache is not None: + self.formatted_options.append(1) # ctypes.c_int32(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) class Linker: @@ -427,10 +446,7 @@ def get_info_log(self) -> str: def _input_type_from_code_type(self, code_type: str): # this list is based on the supported values for code_type in the ObjectCode class definition. # nvJitLink/driver support other options for input type - if _nvjitlink: - input_type = _nvjitlink_input_types.get(code_type) - else: - input_type = _driver_input_types.get(code_type) + input_type = _nvjitlink_input_types.get(code_type) if _nvjitlink else _driver_input_types.get(code_type) if input_type is None: raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index ac7a5012..1851c7ba 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -31,22 +31,22 @@ def compile_ltoir_functions(init_cuda): "options", [ LinkerOptions(arch=ARCH), - # LinkerOptions(arch=ARCH, max_register_count=32), - # LinkerOptions(arch=ARCH, time=True), + LinkerOptions(arch=ARCH, max_register_count=32), + LinkerOptions(arch=ARCH, time=True), LinkerOptions(arch=ARCH, verbose=True), - # LinkerOptions(arch=ARCH, optimization_level=3), - # LinkerOptions(arch=ARCH, debug=True), - # LinkerOptions(arch=ARCH, lineinfo=True), - # LinkerOptions(arch=ARCH, ftz=True), - # LinkerOptions(arch=ARCH, prec_div=True), - # LinkerOptions(arch=ARCH, prec_sqrt=True), - # LinkerOptions(arch=ARCH, fma=True), + LinkerOptions(arch=ARCH, optimization_level=3), + LinkerOptions(arch=ARCH, debug=True), + LinkerOptions(arch=ARCH, lineinfo=True), + LinkerOptions(arch=ARCH, ftz=True), + LinkerOptions(arch=ARCH, prec_div=True), + LinkerOptions(arch=ARCH, prec_sqrt=True), + LinkerOptions(arch=ARCH, fma=True), # LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), # LinkerOptions(arch=ARCH, variables_used=["var1"]), - # LinkerOptions(arch=ARCH, optimize_unused_variables=True), + LinkerOptions(arch=ARCH, optimize_unused_variables=True), # LinkerOptions(arch=ARCH, xptxas=["-v"]), # LinkerOptions(arch=ARCH, split_compile=0), - # LinkerOptions(arch=ARCH, split_compile_extended=1), + LinkerOptions(arch=ARCH, split_compile_extended=1), # LinkerOptions(arch=ARCH, 
no_cache=True), ], ) @@ -62,11 +62,11 @@ def test_linker_init_invalid_arch(): Linker(options) -def test_linker_link_ptx(compile_ltoir_functions): - options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) - linker = Linker(*compile_ltoir_functions, options=options) - linked_code = linker.link("ptx") - assert isinstance(linked_code, ObjectCode) +# def test_linker_link_ptx(compile_ltoir_functions): +# options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) +# linker = Linker(*compile_ltoir_functions, options=options) +# linked_code = linker.link("ptx") +# assert isinstance(linked_code, ObjectCode) def test_linker_link_cubin(compile_ptx_functions): From d7bf4cb304404d6b001fa0e5df479a6d1f9fd514 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 18:28:38 -0800 Subject: [PATCH 047/111] save progress to remote --- cuda_core/cuda/core/experimental/_linker.py | 10 +++------- cuda_core/tests/test_linker.py | 4 ++-- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 304b3771..79328583 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -306,22 +306,18 @@ def _init_driver(self): self.option_keys.append(_driver.CUjit_option.CU_JIT_FMA) if self.kernels_used is not None: for kernel in self.kernels_used: - self.formatted_options.append(kernel) + self.formatted_options.append(kernel.encode()) self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_KERNEL_NAMES) if self.variables_used is not None: for variable in self.variables_used: - self.formatted_options.append(variable) + self.formatted_options.append(variable.encode()) self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_VARIABLE_NAMES) if self.optimize_unused_variables is not None: self.formatted_options.append(1) # ctypes.c_int32(1) self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES) if self.xptxas is not None: for opt in self.xptxas: - self.formatted_options.append(opt) - self.option_keys.append(_driver.CUjit_option.CU_JIT_FAST_COMPILE) - if self.split_compile is not None: - self.formatted_options.append(self.split_compile) - self.option_keys.append(_driver.CUjit_option.CU_JIT_THREADS_PER_BLOCK) + raise NotImplementedError("TODO: implement xptxas option") if self.split_compile_extended is not None: self.formatted_options.append(self.split_compile_extended) self.option_keys.append(_driver.CUjit_option.CU_JIT_MIN_CTA_PER_SM) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 1851c7ba..3937c878 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -41,8 +41,8 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, prec_div=True), LinkerOptions(arch=ARCH, prec_sqrt=True), LinkerOptions(arch=ARCH, fma=True), - # LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), - # LinkerOptions(arch=ARCH, variables_used=["var1"]), + LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), + LinkerOptions(arch=ARCH, variables_used=["var1"]), LinkerOptions(arch=ARCH, optimize_unused_variables=True), # LinkerOptions(arch=ARCH, xptxas=["-v"]), # LinkerOptions(arch=ARCH, split_compile=0), From 84124b590f62c089503b5ab44a9608a83a151ac4 Mon Sep 17 00:00:00 2001 From: sandeepd-nv Date: Wed, 4 Dec 2024 09:49:20 +0530 Subject: [PATCH 048/111] Run cuda_core tests before cuda_binding. 
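The driver script change below runs the cuda_core suite before cuda_bindings.
A minimal Python sketch of the same ordering, assuming the wheels are already
installed:

    import subprocess

    # check=True aborts on the first failing suite, much like the script's
    # `set -xeou pipefail`.
    for project in ("cuda_core", "cuda_bindings"):
        subprocess.run(["python", "-m", "pytest", "tests/"], cwd=project, check=True)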
--- continuous_integration/scripts/test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/continuous_integration/scripts/test b/continuous_integration/scripts/test index cbee6998..3a705c3c 100755 --- a/continuous_integration/scripts/test +++ b/continuous_integration/scripts/test @@ -11,10 +11,10 @@ test_ci() { cd "${CORE_ARTIFACTS_DIR}" pip install *.whl - cd "${REPO_DIR}/cuda_bindings" + cd "${REPO_DIR}/cuda_core" python -m pytest tests/ - cd "${REPO_DIR}/cuda_core" + cd "${REPO_DIR}/cuda_bindings" python -m pytest tests/ } From 1d80ca70f1ef4d0435d4aa49ee97d9ec8b254588 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 08:48:38 -0800 Subject: [PATCH 049/111] fix some known issues before colossus test --- cuda_core/cuda/core/experimental/_device.py | 6 +++++- cuda_core/cuda/core/experimental/_memory.py | 8 ++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 88676cf6..db5f57cf 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -64,7 +64,11 @@ def __new__(cls, device_id=None): dev._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. - if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 1: + if ( + handle_return( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0) + ) + ) == 1: dev._mr = _DefaultAsyncMempool(dev_id) else: dev._mr = _SynchronousMemoryResource(dev_id) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index ac6a78fe..5ff00ba2 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -303,15 +303,11 @@ def __init__(self, dev_id): self._dev_id = dev_id def allocate(self, size, stream=None) -> Buffer: - if stream is None: - stream = default_stream() - ptr = handle_return(cuda.cuMemAlloc(size, stream._handle)) + ptr = handle_return(cuda.cuMemAlloc(size)) return Buffer(ptr, size, self) def deallocate(self, ptr, size, stream=None): - if stream is None: - stream = default_stream() - handle_return(cuda.cuMemFree(ptr, stream._handle)) + handle_return(cuda.cuMemFree(ptr)) @property def is_device_accessible(self) -> bool: From cd7f146bf0f8f13a0327fa8d2b0a315410819350 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 4 Dec 2024 10:23:20 -0800 Subject: [PATCH 050/111] convert line endings from CRLF to LF --- .gitattributes | 2 + .pre-commit-config.yaml | 24 +- .../benchmarks/test_launch_latency.py | 682 +++++++++--------- cuda_bindings/tests/test_nvjitlink.py | 336 ++++----- .../example_tests/test_basic_examples.py | 50 +- cuda_core/tests/example_tests/utils.py | 112 +-- cuda_core/tests/test_device.py | 160 ++-- cuda_core/tests/test_event.py | 92 +-- cuda_core/tests/test_launcher.py | 136 ++-- cuda_core/tests/test_memory.py | 426 +++++------ cuda_core/tests/test_module.py | 96 +-- cuda_core/tests/test_program.py | 132 ++-- cuda_core/tests/test_stream.py | 230 +++--- 13 files changed, 1240 insertions(+), 1238 deletions(-) diff --git a/.gitattributes b/.gitattributes index 00407cdc..aeb32006 100644 --- a/.gitattributes +++ b/.gitattributes @@ -5,3 +5,5 @@ cuda/_version.py export-subst # we do not own any headers checked in, don't touch them *.h binary *.hpp binary +# git should not convert line endings in 
PNG files +*.png binary diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 431bb7c5..c2d246aa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,12 +1,12 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -repos: - - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.4 - hooks: - - id: ruff - args: [--fix, --show-fixes] - - id: ruff-format - -default_language_version: - python: python3 +# Copyright (c) 2024, NVIDIA CORPORATION. + +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.4 + hooks: + - id: ruff + args: [--fix, --show-fixes] + - id: ruff-format + +default_language_version: + python: python3 diff --git a/cuda_bindings/benchmarks/test_launch_latency.py b/cuda_bindings/benchmarks/test_launch_latency.py index 8d70bfe2..f16e971a 100755 --- a/cuda_bindings/benchmarks/test_launch_latency.py +++ b/cuda_bindings/benchmarks/test_launch_latency.py @@ -1,341 +1,341 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. -import ctypes - -import pytest - -from cuda import cuda - -from .kernels import kernel_string -from .perf_test_utils import ASSERT_DRV - - -def launch(kernel, stream, args=(), arg_types=()): - cuda.cuLaunchKernel( - kernel, - 1, - 1, - 1, # grid dim - 1, - 1, - 1, # block dim - 0, - stream, # shared mem and stream - (args, arg_types), - 0, - ) # arguments - - -def launch_packed(kernel, stream, params): - cuda.cuLaunchKernel( - kernel, - 1, - 1, - 1, # grid dim - 1, - 1, - 1, # block dim - 0, - stream, # shared mem and stream - params, - 0, - ) # arguments - - -# Measure launch latency with no parmaeters -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"empty_kernel") - ASSERT_DRV(err) - - benchmark(launch, func, stream) - - cuda.cuCtxSynchronize() - - -# Measure launch latency with a single parameter -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel") - ASSERT_DRV(err) - - err, f = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_float)) - ASSERT_DRV(err) - - benchmark(launch, func, stream, args=(f,), arg_types=(None,)) - - cuda.cuCtxSynchronize() - - (err,) = cuda.cuMemFree(f) - ASSERT_DRV(err) - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") - ASSERT_DRV(err) - - args = [] - arg_types = [None] * 512 - for _ in arg_types: - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - args.append(p) - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - 
cuda.cuCtxSynchronize() - - for p in args: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_bools") - ASSERT_DRV(err) - - args = [True] * 512 - arg_types = [ctypes.c_bool] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_doubles") - ASSERT_DRV(err) - - args = [1.2345] * 512 - arg_types = [ctypes.c_double] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_ints") - ASSERT_DRV(err) - - args = [123] * 512 - arg_types = [ctypes.c_int] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_chars") - ASSERT_DRV(err) - - args = [127] * 512 - arg_types = [ctypes.c_byte] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_longlongs") - ASSERT_DRV(err) - - args = [9223372036854775806] * 512 - arg_types = [ctypes.c_longlong] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_256_args") - ASSERT_DRV(err) - - args = [] - arg_types = [None] * 256 - for _ in arg_types: - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - args.append(p) - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - for p in args: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -# Measure launch latency with many parameters using builtin parameter 
packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_16_args") - ASSERT_DRV(err) - - args = [] - arg_types = [None] * 16 - for _ in arg_types: - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - args.append(p) - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - for p in args: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -# Measure launch latency with many parameters, excluding parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") - ASSERT_DRV(err) - - vals = [] - val_ps = [] - for i in range(512): - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - vals.append(p) - val_ps.append(ctypes.c_void_p(int(vals[i]))) - - packagedParams = (ctypes.c_void_p * 512)() - for i in range(512): - packagedParams[i] = ctypes.addressof(val_ps[i]) - - benchmark(launch_packed, func, stream, packagedParams) - - cuda.cuCtxSynchronize() - - for p in vals: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -def pack_and_launch(kernel, stream, params): - packed_params = (ctypes.c_void_p * len(params))() - ptrs = [0] * len(params) - for i in range(len(params)): - ptrs[i] = ctypes.c_void_p(int(params[i])) - packed_params[i] = ctypes.addressof(ptrs[i]) - - cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_params, 0) - - -# Measure launch latency plus parameter packing using ctypes -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") - ASSERT_DRV(err) - - vals = [] - for i in range(512): - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - vals.append(p) - - benchmark(pack_and_launch, func, stream, vals) - - cuda.cuCtxSynchronize() - - for p in vals: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -# Measure launch latency with a single large struct parameter -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_2048B(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_2048B") - ASSERT_DRV(err) - - class struct_2048B(ctypes.Structure): - _fields_ = [("values", ctypes.c_uint8 * 2048)] - - benchmark(launch, func, stream, args=(struct_2048B(),), arg_types=(None,)) - - cuda.cuCtxSynchronize() +# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. 
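# The benchmarks below exercise the two ways cuda.cuLaunchKernel accepts
# kernel arguments: a (args, arg_types) tuple that the binding packs itself,
# or a pre-built ctypes array of pointers. A minimal sketch of both forms,
# assuming `func` and `stream` are a valid CUfunction/CUstream (illustrative
# values only):
#
#     import ctypes
#     from cuda import cuda
#
#     # builtin packing: per-argument values and ctypes types (None = as-is)
#     cuda.cuLaunchKernel(func, 1, 1, 1, 1, 1, 1, 0, stream,
#                         ((42,), (ctypes.c_int,)), 0)
#
#     # manual packing: an array of pointers to each argument's storage
#     arg = ctypes.c_int(42)
#     params = (ctypes.c_void_p * 1)(ctypes.addressof(arg))
#     cuda.cuLaunchKernel(func, 1, 1, 1, 1, 1, 1, 0, stream, params, 0)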
+import ctypes + +import pytest + +from cuda import cuda + +from .kernels import kernel_string +from .perf_test_utils import ASSERT_DRV + + +def launch(kernel, stream, args=(), arg_types=()): + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + (args, arg_types), + 0, + ) # arguments + + +def launch_packed(kernel, stream, params): + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + params, + 0, + ) # arguments + + +# Measure launch latency with no parmaeters +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"empty_kernel") + ASSERT_DRV(err) + + benchmark(launch, func, stream) + + cuda.cuCtxSynchronize() + + +# Measure launch latency with a single parameter +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel") + ASSERT_DRV(err) + + err, f = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_float)) + ASSERT_DRV(err) + + benchmark(launch, func, stream, args=(f,), arg_types=(None,)) + + cuda.cuCtxSynchronize() + + (err,) = cuda.cuMemFree(f) + ASSERT_DRV(err) + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") + ASSERT_DRV(err) + + args = [] + arg_types = [None] * 512 + for _ in arg_types: + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + args.append(p) + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + for p in args: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_bools") + ASSERT_DRV(err) + + args = [True] * 512 + arg_types = [ctypes.c_bool] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_doubles") + ASSERT_DRV(err) + + args = [1.2345] * 512 + arg_types = [ctypes.c_double] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, 
func = cuda.cuModuleGetFunction(module, b"small_kernel_512_ints") + ASSERT_DRV(err) + + args = [123] * 512 + arg_types = [ctypes.c_int] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_chars") + ASSERT_DRV(err) + + args = [127] * 512 + arg_types = [ctypes.c_byte] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_longlongs") + ASSERT_DRV(err) + + args = [9223372036854775806] * 512 + arg_types = [ctypes.c_longlong] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_256_args") + ASSERT_DRV(err) + + args = [] + arg_types = [None] * 256 + for _ in arg_types: + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + args.append(p) + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + for p in args: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_16_args") + ASSERT_DRV(err) + + args = [] + arg_types = [None] * 16 + for _ in arg_types: + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + args.append(p) + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + for p in args: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +# Measure launch latency with many parameters, excluding parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") + ASSERT_DRV(err) + + vals = [] + val_ps = [] + for i in range(512): + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + vals.append(p) + val_ps.append(ctypes.c_void_p(int(vals[i]))) + + packagedParams = (ctypes.c_void_p * 512)() + for i in range(512): + packagedParams[i] = 
ctypes.addressof(val_ps[i]) + + benchmark(launch_packed, func, stream, packagedParams) + + cuda.cuCtxSynchronize() + + for p in vals: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +def pack_and_launch(kernel, stream, params): + packed_params = (ctypes.c_void_p * len(params))() + ptrs = [0] * len(params) + for i in range(len(params)): + ptrs[i] = ctypes.c_void_p(int(params[i])) + packed_params[i] = ctypes.addressof(ptrs[i]) + + cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_params, 0) + + +# Measure launch latency plus parameter packing using ctypes +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") + ASSERT_DRV(err) + + vals = [] + for i in range(512): + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + vals.append(p) + + benchmark(pack_and_launch, func, stream, vals) + + cuda.cuCtxSynchronize() + + for p in vals: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +# Measure launch latency with a single large struct parameter +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_2048B(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_2048B") + ASSERT_DRV(err) + + class struct_2048B(ctypes.Structure): + _fields_ = [("values", ctypes.c_uint8 * 2048)] + + benchmark(launch, func, stream, args=(struct_2048B(),), arg_types=(None,)) + + cuda.cuCtxSynchronize() diff --git a/cuda_bindings/tests/test_nvjitlink.py b/cuda_bindings/tests/test_nvjitlink.py index d92a3ca7..839c7be1 100644 --- a/cuda_bindings/tests/test_nvjitlink.py +++ b/cuda_bindings/tests/test_nvjitlink.py @@ -1,168 +1,168 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
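# The nvjitlink tests below drive the full linking workflow exposed by
# cuda.bindings.nvjitlink. The core sequence they cover, sketched under the
# assumption that `ptx_bytes` holds valid PTX for the chosen arch:
#
#     from cuda.bindings import nvjitlink
#
#     handle = nvjitlink.create(1, ["-arch=sm_80"])   # num options, options
#     nvjitlink.add_data(handle, nvjitlink.InputType.ANY,
#                        ptx_bytes, len(ptx_bytes), "example")
#     nvjitlink.complete(handle)                      # perform the link
#     cubin = bytearray(nvjitlink.get_linked_cubin_size(handle))
#     nvjitlink.get_linked_cubin(handle, cubin)       # fetch the linked CUBIN
#     nvjitlink.destroy(handle)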
-# -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import pytest - -from cuda.bindings import nvjitlink, nvrtc - -# Establish a handful of compatible architectures and PTX versions to test with -ARCHITECTURES = ["sm_60", "sm_75", "sm_80", "sm_90"] -PTX_VERSIONS = ["5.0", "6.4", "7.0", "8.5"] - - -def ptx_header(version, arch): - return f""" -.version {version} -.target {arch} -.address_size 64 -""" - - -ptx_kernel = """ -.visible .entry _Z6kernelPi( - .param .u64 _Z6kernelPi_param_0 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<3>; - .reg .b64 %rd<3>; - - ld.param.u64 %rd1, [_Z6kernelPi_param_0]; - cvta.to.global.u64 %rd2, %rd1; - mov.u32 %r1, %tid.x; - st.global.u32 [%rd2+0], %r1; - ret; -} -""" - -minimal_ptx_kernel = """ -.func _MinimalKernel() -{ - ret; -} -""" - -ptx_kernel_bytes = [ - (ptx_header(version, arch) + ptx_kernel).encode("utf-8") for version, arch in zip(PTX_VERSIONS, ARCHITECTURES) -] -minimal_ptx_kernel_bytes = [ - (ptx_header(version, arch) + minimal_ptx_kernel).encode("utf-8") - for version, arch in zip(PTX_VERSIONS, ARCHITECTURES) -] - - -# create a valid LTOIR input for testing -@pytest.fixture -def get_dummy_ltoir(): - def CHECK_NVRTC(err): - if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: - raise RuntimeError(f"Nvrtc Error: {err}") - - empty_cplusplus_kernel = "__global__ void A() {}" - err, program_handle = nvrtc.nvrtcCreateProgram(empty_cplusplus_kernel.encode(), b"", 0, [], []) - CHECK_NVRTC(err) - nvrtc.nvrtcCompileProgram(program_handle, 1, [b"-dlto"]) - err, size = nvrtc.nvrtcGetLTOIRSize(program_handle) - CHECK_NVRTC(err) - empty_kernel_ltoir = b" " * size - (err,) = nvrtc.nvrtcGetLTOIR(program_handle, empty_kernel_ltoir) - CHECK_NVRTC(err) - (err,) = nvrtc.nvrtcDestroyProgram(program_handle) - CHECK_NVRTC(err) - return empty_kernel_ltoir - - -def test_unrecognized_option_error(): - with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"): - nvjitlink.create(1, ["-fictitious_option"]) - - -def test_invalid_arch_error(): - with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"): - nvjitlink.create(1, ["-arch=sm_XX"]) - - -@pytest.mark.parametrize("option", ARCHITECTURES) -def test_create_and_destroy(option): - handle = nvjitlink.create(1, [f"-arch={option}"]) - assert handle != 0 - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option", ARCHITECTURES) -def test_complete_empty(option): - handle = nvjitlink.create(1, [f"-arch={option}"]) - nvjitlink.complete(handle) - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) -def test_add_data(option, ptx_bytes): - handle = nvjitlink.create(1, [f"-arch={option}"]) - nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") - nvjitlink.complete(handle) - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) -def test_add_file(option, ptx_bytes, tmp_path): - handle = nvjitlink.create(1, [f"-arch={option}"]) - file_path = tmp_path / "test_file.cubin" - file_path.write_bytes(ptx_bytes) - nvjitlink.add_file(handle, nvjitlink.InputType.ANY, str(file_path)) - nvjitlink.complete(handle) - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option", ARCHITECTURES) -def test_get_error_log(option): - handle = nvjitlink.create(1, [f"-arch={option}"]) - nvjitlink.complete(handle) - log_size = nvjitlink.get_error_log_size(handle) - log = bytearray(log_size) - nvjitlink.get_error_log(handle, log) - assert 
len(log) == log_size - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) -def test_get_info_log(option, ptx_bytes): - handle = nvjitlink.create(1, [f"-arch={option}"]) - nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") - nvjitlink.complete(handle) - log_size = nvjitlink.get_info_log_size(handle) - log = bytearray(log_size) - nvjitlink.get_info_log(handle, log) - assert len(log) == log_size - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) -def test_get_linked_cubin(option, ptx_bytes): - handle = nvjitlink.create(1, [f"-arch={option}"]) - nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") - nvjitlink.complete(handle) - cubin_size = nvjitlink.get_linked_cubin_size(handle) - cubin = bytearray(cubin_size) - nvjitlink.get_linked_cubin(handle, cubin) - assert len(cubin) == cubin_size - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option", ARCHITECTURES) -def test_get_linked_ptx(option, get_dummy_ltoir): - handle = nvjitlink.create(3, [f"-arch={option}", "-lto", "-ptx"]) - nvjitlink.add_data(handle, nvjitlink.InputType.LTOIR, get_dummy_ltoir, len(get_dummy_ltoir), "test_data") - nvjitlink.complete(handle) - ptx_size = nvjitlink.get_linked_ptx_size(handle) - ptx = bytearray(ptx_size) - nvjitlink.get_linked_ptx(handle, ptx) - assert len(ptx) == ptx_size - nvjitlink.destroy(handle) - - -def test_package_version(): - ver = nvjitlink.version() - assert len(ver) == 2 - assert ver >= (12, 0) +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import pytest + +from cuda.bindings import nvjitlink, nvrtc + +# Establish a handful of compatible architectures and PTX versions to test with +ARCHITECTURES = ["sm_60", "sm_75", "sm_80", "sm_90"] +PTX_VERSIONS = ["5.0", "6.4", "7.0", "8.5"] + + +def ptx_header(version, arch): + return f""" +.version {version} +.target {arch} +.address_size 64 +""" + + +ptx_kernel = """ +.visible .entry _Z6kernelPi( + .param .u64 _Z6kernelPi_param_0 +) +{ + .reg .pred %p<2>; + .reg .b32 %r<3>; + .reg .b64 %rd<3>; + + ld.param.u64 %rd1, [_Z6kernelPi_param_0]; + cvta.to.global.u64 %rd2, %rd1; + mov.u32 %r1, %tid.x; + st.global.u32 [%rd2+0], %r1; + ret; +} +""" + +minimal_ptx_kernel = """ +.func _MinimalKernel() +{ + ret; +} +""" + +ptx_kernel_bytes = [ + (ptx_header(version, arch) + ptx_kernel).encode("utf-8") for version, arch in zip(PTX_VERSIONS, ARCHITECTURES) +] +minimal_ptx_kernel_bytes = [ + (ptx_header(version, arch) + minimal_ptx_kernel).encode("utf-8") + for version, arch in zip(PTX_VERSIONS, ARCHITECTURES) +] + + +# create a valid LTOIR input for testing +@pytest.fixture +def get_dummy_ltoir(): + def CHECK_NVRTC(err): + if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: + raise RuntimeError(f"Nvrtc Error: {err}") + + empty_cplusplus_kernel = "__global__ void A() {}" + err, program_handle = nvrtc.nvrtcCreateProgram(empty_cplusplus_kernel.encode(), b"", 0, [], []) + CHECK_NVRTC(err) + nvrtc.nvrtcCompileProgram(program_handle, 1, [b"-dlto"]) + err, size = nvrtc.nvrtcGetLTOIRSize(program_handle) + CHECK_NVRTC(err) + empty_kernel_ltoir = b" " * size + (err,) = nvrtc.nvrtcGetLTOIR(program_handle, empty_kernel_ltoir) + CHECK_NVRTC(err) + (err,) = nvrtc.nvrtcDestroyProgram(program_handle) + CHECK_NVRTC(err) + return empty_kernel_ltoir + + +def 
test_unrecognized_option_error(): + with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"): + nvjitlink.create(1, ["-fictitious_option"]) + + +def test_invalid_arch_error(): + with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"): + nvjitlink.create(1, ["-arch=sm_XX"]) + + +@pytest.mark.parametrize("option", ARCHITECTURES) +def test_create_and_destroy(option): + handle = nvjitlink.create(1, [f"-arch={option}"]) + assert handle != 0 + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option", ARCHITECTURES) +def test_complete_empty(option): + handle = nvjitlink.create(1, [f"-arch={option}"]) + nvjitlink.complete(handle) + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) +def test_add_data(option, ptx_bytes): + handle = nvjitlink.create(1, [f"-arch={option}"]) + nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") + nvjitlink.complete(handle) + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) +def test_add_file(option, ptx_bytes, tmp_path): + handle = nvjitlink.create(1, [f"-arch={option}"]) + file_path = tmp_path / "test_file.cubin" + file_path.write_bytes(ptx_bytes) + nvjitlink.add_file(handle, nvjitlink.InputType.ANY, str(file_path)) + nvjitlink.complete(handle) + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option", ARCHITECTURES) +def test_get_error_log(option): + handle = nvjitlink.create(1, [f"-arch={option}"]) + nvjitlink.complete(handle) + log_size = nvjitlink.get_error_log_size(handle) + log = bytearray(log_size) + nvjitlink.get_error_log(handle, log) + assert len(log) == log_size + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) +def test_get_info_log(option, ptx_bytes): + handle = nvjitlink.create(1, [f"-arch={option}"]) + nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") + nvjitlink.complete(handle) + log_size = nvjitlink.get_info_log_size(handle) + log = bytearray(log_size) + nvjitlink.get_info_log(handle, log) + assert len(log) == log_size + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) +def test_get_linked_cubin(option, ptx_bytes): + handle = nvjitlink.create(1, [f"-arch={option}"]) + nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") + nvjitlink.complete(handle) + cubin_size = nvjitlink.get_linked_cubin_size(handle) + cubin = bytearray(cubin_size) + nvjitlink.get_linked_cubin(handle, cubin) + assert len(cubin) == cubin_size + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option", ARCHITECTURES) +def test_get_linked_ptx(option, get_dummy_ltoir): + handle = nvjitlink.create(3, [f"-arch={option}", "-lto", "-ptx"]) + nvjitlink.add_data(handle, nvjitlink.InputType.LTOIR, get_dummy_ltoir, len(get_dummy_ltoir), "test_data") + nvjitlink.complete(handle) + ptx_size = nvjitlink.get_linked_ptx_size(handle) + ptx = bytearray(ptx_size) + nvjitlink.get_linked_ptx(handle, ptx) + assert len(ptx) == ptx_size + nvjitlink.destroy(handle) + + +def test_package_version(): + ver = nvjitlink.version() + assert len(ver) == 2 + assert ver >= (12, 0) diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py index 9b94ecd3..9a9432cb 100644 --- 
a/cuda_core/tests/example_tests/test_basic_examples.py +++ b/cuda_core/tests/example_tests/test_basic_examples.py @@ -1,25 +1,25 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -# If we have subcategories of examples in the future, this file can be split along those lines - -import glob -import os - -import pytest - -from .utils import run_example - -samples_path = os.path.join(os.path.dirname(__file__), "..", "..", "examples") -sample_files = glob.glob(samples_path + "**/*.py", recursive=True) - - -@pytest.mark.parametrize("example", sample_files) -class TestExamples: - def test_example(self, example, deinit_cuda): - run_example(samples_path, example) +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +# If we have subcategories of examples in the future, this file can be split along those lines + +import glob +import os + +import pytest + +from .utils import run_example + +samples_path = os.path.join(os.path.dirname(__file__), "..", "..", "examples") +sample_files = glob.glob(samples_path + "**/*.py", recursive=True) + + +@pytest.mark.parametrize("example", sample_files) +class TestExamples: + def test_example(self, example, deinit_cuda): + run_example(samples_path, example) diff --git a/cuda_core/tests/example_tests/utils.py b/cuda_core/tests/example_tests/utils.py index f6ac3e15..3d218a91 100644 --- a/cuda_core/tests/example_tests/utils.py +++ b/cuda_core/tests/example_tests/utils.py @@ -1,56 +1,56 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
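# The run_example helper below executes each sample in-process: it swaps in
# the sample's argv and sys.path, exec()s the source, and skips the test when
# an optional dependency such as cupy is missing. Typical use from a test
# (the sample name here is hypothetical):
#
#     run_example(samples_path, "vector_add.py")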
- -import gc -import os -import sys - -import cupy as cp -import pytest - - -class SampleTestError(Exception): - pass - - -def parse_python_script(filepath): - if not filepath.endswith(".py"): - raise ValueError(f"{filepath} not supported") - with open(filepath, encoding="utf-8") as f: - script = f.read() - return script - - -def run_example(samples_path, filename, env=None): - fullpath = os.path.join(samples_path, filename) - script = parse_python_script(fullpath) - try: - old_argv = sys.argv - sys.argv = [fullpath] - old_sys_path = sys.path.copy() - sys.path.append(samples_path) - exec(script, env if env else {}) - except ImportError as e: - # for samples requiring any of optional dependencies - for m in ("cupy",): - if f"No module named '{m}'" in str(e): - pytest.skip(f"{m} not installed, skipping related tests") - break - else: - raise - except Exception as e: - msg = "\n" - msg += f"Got error ({filename}):\n" - msg += str(e) - raise SampleTestError(msg) from e - finally: - sys.path = old_sys_path - sys.argv = old_argv - # further reduce the memory watermark - gc.collect() - cp.get_default_memory_pool().free_all_blocks() +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import gc +import os +import sys + +import cupy as cp +import pytest + + +class SampleTestError(Exception): + pass + + +def parse_python_script(filepath): + if not filepath.endswith(".py"): + raise ValueError(f"{filepath} not supported") + with open(filepath, encoding="utf-8") as f: + script = f.read() + return script + + +def run_example(samples_path, filename, env=None): + fullpath = os.path.join(samples_path, filename) + script = parse_python_script(fullpath) + try: + old_argv = sys.argv + sys.argv = [fullpath] + old_sys_path = sys.path.copy() + sys.path.append(samples_path) + exec(script, env if env else {}) + except ImportError as e: + # for samples requiring any of optional dependencies + for m in ("cupy",): + if f"No module named '{m}'" in str(e): + pytest.skip(f"{m} not installed, skipping related tests") + break + else: + raise + except Exception as e: + msg = "\n" + msg += f"Got error ({filename}):\n" + msg += str(e) + raise SampleTestError(msg) from e + finally: + sys.path = old_sys_path + sys.argv = old_argv + # further reduce the memory watermark + gc.collect() + cp.get_default_memory_pool().free_all_blocks() diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py index afc3ed5b..876299f3 100644 --- a/cuda_core/tests/test_device.py +++ b/cuda_core/tests/test_device.py @@ -1,80 +1,80 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
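# The device tests below validate Device properties against direct driver and
# runtime queries. The underlying per-device attribute pattern, sketched with
# an assumed device id `dev_id` (any cudaDeviceAttr works the same way):
#
#     from cuda import cudart
#     from cuda.core.experimental._utils import handle_return
#
#     pools = handle_return(cudart.cudaDeviceGetAttribute(
#         cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, dev_id))
#     # pools == 1 means stream-ordered memory pools are supported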
- -try: - from cuda.bindings import driver, runtime -except ImportError: - from cuda import cuda as driver - from cuda import cudart as runtime - -from cuda.core.experimental import Device -from cuda.core.experimental._utils import ComputeCapability, handle_return - - -def test_device_set_current(deinit_cuda): - device = Device() - device.set_current() - assert handle_return(driver.cuCtxGetCurrent()) is not None - - -def test_device_repr(): - device = Device(0) - assert str(device).startswith("= 11040: - uuid = handle_return(driver.cuDeviceGetUuid_v2(device.device_id)) - else: - uuid = handle_return(driver.cuDeviceGetUuid(device.device_id)) - uuid = uuid.bytes.hex() - expected_uuid = f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}" - assert device.uuid == expected_uuid - - -def test_name(): - device = Device() - name = handle_return(driver.cuDeviceGetName(128, device.device_id)) - name = name.split(b"\0")[0] - assert device.name == name.decode() - - -def test_compute_capability(): - device = Device() - major = handle_return( - runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device.device_id) - ) - minor = handle_return( - runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device.device_id) - ) - expected_cc = ComputeCapability(major, minor) - assert device.compute_capability == expected_cc +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +try: + from cuda.bindings import driver, runtime +except ImportError: + from cuda import cuda as driver + from cuda import cudart as runtime + +from cuda.core.experimental import Device +from cuda.core.experimental._utils import ComputeCapability, handle_return + + +def test_device_set_current(deinit_cuda): + device = Device() + device.set_current() + assert handle_return(driver.cuCtxGetCurrent()) is not None + + +def test_device_repr(): + device = Device(0) + assert str(device).startswith("= 11040: + uuid = handle_return(driver.cuDeviceGetUuid_v2(device.device_id)) + else: + uuid = handle_return(driver.cuDeviceGetUuid(device.device_id)) + uuid = uuid.bytes.hex() + expected_uuid = f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}" + assert device.uuid == expected_uuid + + +def test_name(): + device = Device() + name = handle_return(driver.cuDeviceGetName(128, device.device_id)) + name = name.split(b"\0")[0] + assert device.name == name.decode() + + +def test_compute_capability(): + device = Device() + major = handle_return( + runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device.device_id) + ) + minor = handle_return( + runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device.device_id) + ) + expected_cc = ComputeCapability(major, minor) + assert device.compute_capability == expected_cc diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 21548078..0d650b4f 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -1,46 +1,46 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. 
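# The event tests below cover EventOptions: enable_timing toggles timestamp
# recording, and busy_waited_sync requests spin-waiting during sync. The
# basic record/sync pattern, in brief:
#
#     from cuda.core.experimental import Device, EventOptions
#
#     stream = Device().create_stream()
#     event = stream.record(options=EventOptions(enable_timing=False))
#     event.sync()          # blocks until the recorded work completes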
-# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -import pytest - -from cuda.core.experimental import Device, EventOptions - - -@pytest.mark.parametrize("enable_timing", [True, False, None]) -def test_timing(init_cuda, enable_timing): - options = EventOptions(enable_timing=enable_timing) - stream = Device().create_stream() - event = stream.record(options=options) - assert event.is_timing_disabled == (not enable_timing if enable_timing is not None else True) - - -def test_is_sync_busy_waited(init_cuda): - options = EventOptions(enable_timing=False, busy_waited_sync=True) - stream = Device().create_stream() - event = stream.record(options=options) - assert event.is_sync_busy_waited is True - - options = EventOptions(enable_timing=False) - stream = Device().create_stream() - event = stream.record(options=options) - assert event.is_sync_busy_waited is False - - -def test_sync(init_cuda): - options = EventOptions(enable_timing=False) - stream = Device().create_stream() - event = stream.record(options=options) - event.sync() - assert event.is_done is True - - -def test_is_done(init_cuda): - options = EventOptions(enable_timing=False) - stream = Device().create_stream() - event = stream.record(options=options) - assert event.is_done is True +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import pytest + +from cuda.core.experimental import Device, EventOptions + + +@pytest.mark.parametrize("enable_timing", [True, False, None]) +def test_timing(init_cuda, enable_timing): + options = EventOptions(enable_timing=enable_timing) + stream = Device().create_stream() + event = stream.record(options=options) + assert event.is_timing_disabled == (not enable_timing if enable_timing is not None else True) + + +def test_is_sync_busy_waited(init_cuda): + options = EventOptions(enable_timing=False, busy_waited_sync=True) + stream = Device().create_stream() + event = stream.record(options=options) + assert event.is_sync_busy_waited is True + + options = EventOptions(enable_timing=False) + stream = Device().create_stream() + event = stream.record(options=options) + assert event.is_sync_busy_waited is False + + +def test_sync(init_cuda): + options = EventOptions(enable_timing=False) + stream = Device().create_stream() + event = stream.record(options=options) + event.sync() + assert event.is_done is True + + +def test_is_done(init_cuda): + options = EventOptions(enable_timing=False) + stream = Device().create_stream() + event = stream.record(options=options) + assert event.is_done is True diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index 874d7f07..08f7e6d3 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -1,68 +1,68 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. 
-# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -import pytest - -from cuda.core.experimental import Device, LaunchConfig, Stream - - -def test_launch_config_init(init_cuda): - config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=0) - assert config.grid == (1, 1, 1) - assert config.block == (1, 1, 1) - assert config.stream is None - assert config.shmem_size == 0 - - config = LaunchConfig(grid=(2, 2, 2), block=(2, 2, 2), stream=Device().create_stream(), shmem_size=1024) - assert config.grid == (2, 2, 2) - assert config.block == (2, 2, 2) - assert isinstance(config.stream, Stream) - assert config.shmem_size == 1024 - - -def test_launch_config_cast_to_3_tuple(): - config = LaunchConfig(grid=1, block=1) - assert config._cast_to_3_tuple(1) == (1, 1, 1) - assert config._cast_to_3_tuple((1, 2)) == (1, 2, 1) - assert config._cast_to_3_tuple((1, 2, 3)) == (1, 2, 3) - - # Edge cases - assert config._cast_to_3_tuple(999) == (999, 1, 1) - assert config._cast_to_3_tuple((999, 888)) == (999, 888, 1) - assert config._cast_to_3_tuple((999, 888, 777)) == (999, 888, 777) - - -def test_launch_config_invalid_values(): - with pytest.raises(ValueError): - LaunchConfig(grid=0, block=1) - - with pytest.raises(ValueError): - LaunchConfig(grid=(0, 1), block=1) - - with pytest.raises(ValueError): - LaunchConfig(grid=(1, 1, 1), block=0) - - with pytest.raises(ValueError): - LaunchConfig(grid=(1, 1, 1), block=(0, 1)) - - -def test_launch_config_stream(init_cuda): - stream = Device().create_stream() - config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=stream, shmem_size=0) - assert config.stream == stream - - with pytest.raises(ValueError): - LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream="invalid_stream", shmem_size=0) - - -def test_launch_config_shmem_size(): - config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=2048) - assert config.shmem_size == 2048 - - config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None) - assert config.shmem_size == 0 +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. 
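# The launcher tests below cover LaunchConfig normalization: grid and block
# accept an int or a tuple of up to three extents and are padded to three
# dimensions. A minimal sketch (extents are illustrative):
#
#     from cuda.core.experimental import Device, LaunchConfig
#
#     cfg = LaunchConfig(grid=(32, 4), block=256,
#                        stream=Device().create_stream(), shmem_size=0)
#     # grid normalizes to (32, 4, 1) and block to (256, 1, 1)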
+ +import pytest + +from cuda.core.experimental import Device, LaunchConfig, Stream + + +def test_launch_config_init(init_cuda): + config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=0) + assert config.grid == (1, 1, 1) + assert config.block == (1, 1, 1) + assert config.stream is None + assert config.shmem_size == 0 + + config = LaunchConfig(grid=(2, 2, 2), block=(2, 2, 2), stream=Device().create_stream(), shmem_size=1024) + assert config.grid == (2, 2, 2) + assert config.block == (2, 2, 2) + assert isinstance(config.stream, Stream) + assert config.shmem_size == 1024 + + +def test_launch_config_cast_to_3_tuple(): + config = LaunchConfig(grid=1, block=1) + assert config._cast_to_3_tuple(1) == (1, 1, 1) + assert config._cast_to_3_tuple((1, 2)) == (1, 2, 1) + assert config._cast_to_3_tuple((1, 2, 3)) == (1, 2, 3) + + # Edge cases + assert config._cast_to_3_tuple(999) == (999, 1, 1) + assert config._cast_to_3_tuple((999, 888)) == (999, 888, 1) + assert config._cast_to_3_tuple((999, 888, 777)) == (999, 888, 777) + + +def test_launch_config_invalid_values(): + with pytest.raises(ValueError): + LaunchConfig(grid=0, block=1) + + with pytest.raises(ValueError): + LaunchConfig(grid=(0, 1), block=1) + + with pytest.raises(ValueError): + LaunchConfig(grid=(1, 1, 1), block=0) + + with pytest.raises(ValueError): + LaunchConfig(grid=(1, 1, 1), block=(0, 1)) + + +def test_launch_config_stream(init_cuda): + stream = Device().create_stream() + config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=stream, shmem_size=0) + assert config.stream == stream + + with pytest.raises(ValueError): + LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream="invalid_stream", shmem_size=0) + + +def test_launch_config_shmem_size(): + config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=2048) + assert config.shmem_size == 2048 + + config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None) + assert config.shmem_size == 0 diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index c78b5673..a48db69b 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1,213 +1,213 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
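# The dummy resources below all implement the MemoryResource protocol that
# cuda.core buffers build on: allocate()/deallocate() plus the
# is_device_accessible, is_host_accessible and device_id properties. The
# minimal shape of a conforming resource, sketched for plain device memory:
#
#     class MyDeviceResource(MemoryResource):
#         def allocate(self, size, stream=None) -> Buffer:
#             return Buffer(ptr=handle_return(driver.cuMemAlloc(size)),
#                           size=size, mr=self)
#         def deallocate(self, ptr, size, stream=None):
#             handle_return(driver.cuMemFree(ptr))
#         @property
#         def is_device_accessible(self):
#             return True
#         # is_host_accessible and device_id follow the same property pattern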
- -try: - from cuda.bindings import driver -except ImportError: - from cuda import cuda as driver - -import ctypes - -from cuda.core.experimental import Device -from cuda.core.experimental._memory import Buffer, MemoryResource -from cuda.core.experimental._utils import handle_return - - -class DummyDeviceMemoryResource(MemoryResource): - def __init__(self, device): - self.device = device - - def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(driver.cuMemAlloc(size)) - return Buffer(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - handle_return(driver.cuMemFree(ptr)) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return False - - @property - def device_id(self) -> int: - return 0 - - -class DummyHostMemoryResource(MemoryResource): - def __init__(self): - pass - - def allocate(self, size, stream=None) -> Buffer: - # Allocate a ctypes buffer of size `size` - ptr = (ctypes.c_byte * size)() - return Buffer(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - # the memory is deallocated per the ctypes deallocation at garbage collection time - pass - - @property - def is_device_accessible(self) -> bool: - return False - - @property - def is_host_accessible(self) -> bool: - return True - - @property - def device_id(self) -> int: - raise RuntimeError("the pinned memory resource is not bound to any GPU") - - -class DummyUnifiedMemoryResource(MemoryResource): - def __init__(self, device): - self.device = device - - def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)) - return Buffer(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - handle_return(driver.cuMemFree(ptr)) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return True - - @property - def device_id(self) -> int: - return 0 - - -class DummyPinnedMemoryResource(MemoryResource): - def __init__(self, device): - self.device = device - - def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(driver.cuMemAllocHost(size)) - return Buffer(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - handle_return(driver.cuMemFreeHost(ptr)) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return True - - @property - def device_id(self) -> int: - raise RuntimeError("the pinned memory resource is not bound to any GPU") - - -def buffer_initialization(dummy_mr: MemoryResource): - buffer = dummy_mr.allocate(size=1024) - assert buffer.handle != 0 - assert buffer.size == 1024 - assert buffer.memory_resource == dummy_mr - assert buffer.is_device_accessible == dummy_mr.is_device_accessible - assert buffer.is_host_accessible == dummy_mr.is_host_accessible - buffer.close() - - -def test_buffer_initialization(): - device = Device() - device.set_current() - buffer_initialization(DummyDeviceMemoryResource(device)) - buffer_initialization(DummyHostMemoryResource()) - buffer_initialization(DummyUnifiedMemoryResource(device)) - buffer_initialization(DummyPinnedMemoryResource(device)) - - -def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False): - src_buffer = dummy_mr.allocate(size=1024) - dst_buffer = dummy_mr.allocate(size=1024) - stream = device.create_stream() - - if check: 
- src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) - for i in range(1024): - src_ptr[i] = ctypes.c_byte(i) - - src_buffer.copy_to(dst_buffer, stream=stream) - device.sync() - - if check: - dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte)) - - for i in range(10): - assert dst_ptr[i] == src_ptr[i] - - dst_buffer.close() - src_buffer.close() - - -def test_buffer_copy_to(): - device = Device() - device.set_current() - buffer_copy_to(DummyDeviceMemoryResource(device), device) - buffer_copy_to(DummyUnifiedMemoryResource(device), device) - buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True) - - -def buffer_copy_from(dummy_mr: MemoryResource, device, check=False): - src_buffer = dummy_mr.allocate(size=1024) - dst_buffer = dummy_mr.allocate(size=1024) - stream = device.create_stream() - - if check: - src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) - for i in range(1024): - src_ptr[i] = ctypes.c_byte(i) - - dst_buffer.copy_from(src_buffer, stream=stream) - device.sync() - - if check: - dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte)) - - for i in range(10): - assert dst_ptr[i] == src_ptr[i] - - dst_buffer.close() - src_buffer.close() - - -def test_buffer_copy_from(): - device = Device() - device.set_current() - buffer_copy_from(DummyDeviceMemoryResource(device), device) - buffer_copy_from(DummyUnifiedMemoryResource(device), device) - buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True) - - -def buffer_close(dummy_mr: MemoryResource): - buffer = dummy_mr.allocate(size=1024) - buffer.close() - assert buffer.handle == 0 - assert buffer.memory_resource is None - - -def test_buffer_close(): - device = Device() - device.set_current() - buffer_close(DummyDeviceMemoryResource(device)) - buffer_close(DummyHostMemoryResource()) - buffer_close(DummyUnifiedMemoryResource(device)) - buffer_close(DummyPinnedMemoryResource(device)) +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. 
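# The copy pattern these tests verify, in brief (both buffers must be the
# same size, and the device is synchronized before the result is inspected):
#
#     stream = device.create_stream()
#     src_buffer.copy_to(dst_buffer, stream=stream)   # or dst.copy_from(src)
#     device.sync()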
+ +try: + from cuda.bindings import driver +except ImportError: + from cuda import cuda as driver + +import ctypes + +from cuda.core.experimental import Device +from cuda.core.experimental._memory import Buffer, MemoryResource +from cuda.core.experimental._utils import handle_return + + +class DummyDeviceMemoryResource(MemoryResource): + def __init__(self, device): + self.device = device + + def allocate(self, size, stream=None) -> Buffer: + ptr = handle_return(driver.cuMemAlloc(size)) + return Buffer(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + handle_return(driver.cuMemFree(ptr)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return False + + @property + def device_id(self) -> int: + return 0 + + +class DummyHostMemoryResource(MemoryResource): + def __init__(self): + pass + + def allocate(self, size, stream=None) -> Buffer: + # Allocate a ctypes buffer of size `size` + ptr = (ctypes.c_byte * size)() + return Buffer(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + # the memory is deallocated per the ctypes deallocation at garbage collection time + pass + + @property + def is_device_accessible(self) -> bool: + return False + + @property + def is_host_accessible(self) -> bool: + return True + + @property + def device_id(self) -> int: + raise RuntimeError("the pinned memory resource is not bound to any GPU") + + +class DummyUnifiedMemoryResource(MemoryResource): + def __init__(self, device): + self.device = device + + def allocate(self, size, stream=None) -> Buffer: + ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)) + return Buffer(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + handle_return(driver.cuMemFree(ptr)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return True + + @property + def device_id(self) -> int: + return 0 + + +class DummyPinnedMemoryResource(MemoryResource): + def __init__(self, device): + self.device = device + + def allocate(self, size, stream=None) -> Buffer: + ptr = handle_return(driver.cuMemAllocHost(size)) + return Buffer(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + handle_return(driver.cuMemFreeHost(ptr)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return True + + @property + def device_id(self) -> int: + raise RuntimeError("the pinned memory resource is not bound to any GPU") + + +def buffer_initialization(dummy_mr: MemoryResource): + buffer = dummy_mr.allocate(size=1024) + assert buffer.handle != 0 + assert buffer.size == 1024 + assert buffer.memory_resource == dummy_mr + assert buffer.is_device_accessible == dummy_mr.is_device_accessible + assert buffer.is_host_accessible == dummy_mr.is_host_accessible + buffer.close() + + +def test_buffer_initialization(): + device = Device() + device.set_current() + buffer_initialization(DummyDeviceMemoryResource(device)) + buffer_initialization(DummyHostMemoryResource()) + buffer_initialization(DummyUnifiedMemoryResource(device)) + buffer_initialization(DummyPinnedMemoryResource(device)) + + +def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False): + src_buffer = dummy_mr.allocate(size=1024) + dst_buffer = dummy_mr.allocate(size=1024) + stream = device.create_stream() + + if check: 
+ src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) + for i in range(1024): + src_ptr[i] = ctypes.c_byte(i) + + src_buffer.copy_to(dst_buffer, stream=stream) + device.sync() + + if check: + dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte)) + + for i in range(10): + assert dst_ptr[i] == src_ptr[i] + + dst_buffer.close() + src_buffer.close() + + +def test_buffer_copy_to(): + device = Device() + device.set_current() + buffer_copy_to(DummyDeviceMemoryResource(device), device) + buffer_copy_to(DummyUnifiedMemoryResource(device), device) + buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True) + + +def buffer_copy_from(dummy_mr: MemoryResource, device, check=False): + src_buffer = dummy_mr.allocate(size=1024) + dst_buffer = dummy_mr.allocate(size=1024) + stream = device.create_stream() + + if check: + src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) + for i in range(1024): + src_ptr[i] = ctypes.c_byte(i) + + dst_buffer.copy_from(src_buffer, stream=stream) + device.sync() + + if check: + dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte)) + + for i in range(10): + assert dst_ptr[i] == src_ptr[i] + + dst_buffer.close() + src_buffer.close() + + +def test_buffer_copy_from(): + device = Device() + device.set_current() + buffer_copy_from(DummyDeviceMemoryResource(device), device) + buffer_copy_from(DummyUnifiedMemoryResource(device), device) + buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True) + + +def buffer_close(dummy_mr: MemoryResource): + buffer = dummy_mr.allocate(size=1024) + buffer.close() + assert buffer.handle == 0 + assert buffer.memory_resource is None + + +def test_buffer_close(): + device = Device() + device.set_current() + buffer_close(DummyDeviceMemoryResource(device)) + buffer_close(DummyHostMemoryResource()) + buffer_close(DummyUnifiedMemoryResource(device)) + buffer_close(DummyPinnedMemoryResource(device)) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 5f0b6056..a976726f 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -1,48 +1,48 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
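# The module tests below construct ObjectCode directly from bytes. Supported
# code types are "cubin", "ptx" and "fatbin"; any other type raises
# ValueError. In brief (dummy payload, as in the tests):
#
#     obj = ObjectCode(b"dummy_data", "ptx")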
- -import importlib - -import pytest - -from cuda.core.experimental._module import ObjectCode - - -@pytest.mark.skipif( - int(importlib.metadata.version("cuda-python").split(".")[0]) < 12, - reason="Module loading for older drivers validate require valid module code.", -) -def test_object_code_initialization(): - # Test with supported code types - for code_type in ["cubin", "ptx", "fatbin"]: - module_data = b"dummy_data" - obj_code = ObjectCode(module_data, code_type) - assert obj_code._code_type == code_type - assert obj_code._module == module_data - assert obj_code._handle is not None - - # Test with unsupported code type - with pytest.raises(ValueError): - ObjectCode(b"dummy_data", "unsupported_code_type") - - -# TODO add ObjectCode tests which provide the appropriate data for cuLibraryLoadFromFile -def test_object_code_initialization_with_str(): - assert True - - -def test_object_code_initialization_with_jit_options(): - assert True - - -def test_object_code_get_kernel(): - assert True - - -def test_kernel_from_obj(): - assert True +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import importlib + +import pytest + +from cuda.core.experimental._module import ObjectCode + + +@pytest.mark.skipif( + int(importlib.metadata.version("cuda-python").split(".")[0]) < 12, + reason="Module loading for older drivers validate require valid module code.", +) +def test_object_code_initialization(): + # Test with supported code types + for code_type in ["cubin", "ptx", "fatbin"]: + module_data = b"dummy_data" + obj_code = ObjectCode(module_data, code_type) + assert obj_code._code_type == code_type + assert obj_code._module == module_data + assert obj_code._handle is not None + + # Test with unsupported code type + with pytest.raises(ValueError): + ObjectCode(b"dummy_data", "unsupported_code_type") + + +# TODO add ObjectCode tests which provide the appropriate data for cuLibraryLoadFromFile +def test_object_code_initialization_with_str(): + assert True + + +def test_object_code_initialization_with_jit_options(): + assert True + + +def test_object_code_get_kernel(): + assert True + + +def test_kernel_from_obj(): + assert True diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index af94a7ba..95c4d377 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -1,66 +1,66 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
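# The program tests below cover the nvrtc-backed flow from C++ source to a
# loadable kernel. In brief:
#
#     prog = Program('extern "C" __global__ void my_kernel() {}', "c++")
#     mod = prog.compile("ptx")             # returns an ObjectCode
#     kernel = mod.get_kernel("my_kernel")  # returns a Kernel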
- -import pytest - -from cuda.core.experimental import Program -from cuda.core.experimental._module import Kernel, ObjectCode - - -def test_program_init_valid_code_type(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - assert program.backend == "nvrtc" - assert program.handle is not None - - -def test_program_init_invalid_code_type(): - code = 'extern "C" __global__ void my_kernel() {}' - with pytest.raises(NotImplementedError): - Program(code, "python") - - -def test_program_init_invalid_code_format(): - code = 12345 - with pytest.raises(TypeError): - Program(code, "c++") - - -def test_program_compile_valid_target_type(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - object_code = program.compile("ptx") - kernel = object_code.get_kernel("my_kernel") - assert isinstance(object_code, ObjectCode) - assert isinstance(kernel, Kernel) - - -def test_program_compile_invalid_target_type(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - with pytest.raises(NotImplementedError): - program.compile("invalid_target") - - -def test_program_backend_property(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - assert program.backend == "nvrtc" - - -def test_program_handle_property(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - assert program.handle is not None - - -def test_program_close(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - program.close() - assert program.handle is None +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. 
+ +import pytest + +from cuda.core.experimental import Program +from cuda.core.experimental._module import Kernel, ObjectCode + + +def test_program_init_valid_code_type(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + assert program.backend == "nvrtc" + assert program.handle is not None + + +def test_program_init_invalid_code_type(): + code = 'extern "C" __global__ void my_kernel() {}' + with pytest.raises(NotImplementedError): + Program(code, "python") + + +def test_program_init_invalid_code_format(): + code = 12345 + with pytest.raises(TypeError): + Program(code, "c++") + + +def test_program_compile_valid_target_type(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + object_code = program.compile("ptx") + kernel = object_code.get_kernel("my_kernel") + assert isinstance(object_code, ObjectCode) + assert isinstance(kernel, Kernel) + + +def test_program_compile_invalid_target_type(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + with pytest.raises(NotImplementedError): + program.compile("invalid_target") + + +def test_program_backend_property(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + assert program.backend == "nvrtc" + + +def test_program_handle_property(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + assert program.handle is not None + + +def test_program_close(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + program.close() + assert program.handle is None diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index 03cdd852..9c661192 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -1,115 +1,115 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
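# A condensed sketch of the compile-and-lookup flow the test_program.py tests
# above exercise. It needs a working NVIDIA driver and NVRTC at runtime; every
# name comes from the tests themselves, nothing here is new API.
from cuda.core.experimental import Device, Program

def compile_and_get_kernel():
    Device().set_current()  # the tests do this via the init_cuda fixture
    code = 'extern "C" __global__ void my_kernel() {}'
    program = Program(code, "c++")  # backend resolves to "nvrtc"
    object_code = program.compile("ptx")  # an ObjectCode instance
    return object_code.get_kernel("my_kernel")  # a launchable Kernel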
- -import pytest - -from cuda.core.experimental import Device, Stream, StreamOptions -from cuda.core.experimental._event import Event -from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM, default_stream - - -def test_stream_init(): - with pytest.raises(NotImplementedError): - Stream() - - -def test_stream_init_with_options(init_cuda): - stream = Device().create_stream(options=StreamOptions(nonblocking=True, priority=0)) - assert stream.is_nonblocking is True - assert stream.priority == 0 - - -def test_stream_handle(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - assert isinstance(stream.handle, int) - - -def test_stream_is_nonblocking(init_cuda): - stream = Device().create_stream(options=StreamOptions(nonblocking=True)) - assert stream.is_nonblocking is True - - -def test_stream_priority(init_cuda): - stream = Device().create_stream(options=StreamOptions(priority=0)) - assert stream.priority == 0 - stream = Device().create_stream(options=StreamOptions(priority=-1)) - assert stream.priority == -1 - with pytest.raises(ValueError): - stream = Device().create_stream(options=StreamOptions(priority=1)) - - -def test_stream_sync(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - stream.sync() # Should not raise any exceptions - - -def test_stream_record(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - event = stream.record() - assert isinstance(event, Event) - - -def test_stream_record_invalid_event(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - with pytest.raises(TypeError): - stream.record(event="invalid_event") - - -def test_stream_wait_event(init_cuda): - s1 = Device().create_stream() - s2 = Device().create_stream() - e1 = s1.record() - s2.wait(e1) # Should not raise any exceptions - s2.sync() - - -def test_stream_wait_invalid_event(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - with pytest.raises(ValueError): - stream.wait(event_or_stream="invalid_event") - - -def test_stream_device(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - device = stream.device - assert isinstance(device, Device) - - -def test_stream_context(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - context = stream.context - assert context is not None - - -def test_stream_from_foreign_stream(init_cuda): - device = Device() - other_stream = device.create_stream(options=StreamOptions()) - stream = device.create_stream(obj=other_stream) - assert other_stream.handle == stream.handle - device = stream.device - assert isinstance(device, Device) - context = stream.context - assert context is not None - - -def test_stream_from_handle(): - stream = Stream.from_handle(0) - assert isinstance(stream, Stream) - - -def test_legacy_default_stream(): - assert isinstance(LEGACY_DEFAULT_STREAM, Stream) - - -def test_per_thread_default_stream(): - assert isinstance(PER_THREAD_DEFAULT_STREAM, Stream) - - -def test_default_stream(): - stream = default_stream() - assert isinstance(stream, Stream) +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. 
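# Sketch of the cross-stream ordering idiom exercised by test_stream_wait_event
# in this file: record an event on one stream, make a second stream wait on it,
# then synchronize. Requires a live CUDA device; the calls mirror the test.
from cuda.core.experimental import Device

def ordered_streams():
    device = Device()
    device.set_current()
    s1 = device.create_stream()
    s2 = device.create_stream()
    e1 = s1.record()  # snapshot s1's position as an Event
    s2.wait(e1)       # work queued on s2 after this waits for e1
    s2.sync()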
+ +import pytest + +from cuda.core.experimental import Device, Stream, StreamOptions +from cuda.core.experimental._event import Event +from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM, default_stream + + +def test_stream_init(): + with pytest.raises(NotImplementedError): + Stream() + + +def test_stream_init_with_options(init_cuda): + stream = Device().create_stream(options=StreamOptions(nonblocking=True, priority=0)) + assert stream.is_nonblocking is True + assert stream.priority == 0 + + +def test_stream_handle(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + assert isinstance(stream.handle, int) + + +def test_stream_is_nonblocking(init_cuda): + stream = Device().create_stream(options=StreamOptions(nonblocking=True)) + assert stream.is_nonblocking is True + + +def test_stream_priority(init_cuda): + stream = Device().create_stream(options=StreamOptions(priority=0)) + assert stream.priority == 0 + stream = Device().create_stream(options=StreamOptions(priority=-1)) + assert stream.priority == -1 + with pytest.raises(ValueError): + stream = Device().create_stream(options=StreamOptions(priority=1)) + + +def test_stream_sync(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + stream.sync() # Should not raise any exceptions + + +def test_stream_record(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + event = stream.record() + assert isinstance(event, Event) + + +def test_stream_record_invalid_event(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + with pytest.raises(TypeError): + stream.record(event="invalid_event") + + +def test_stream_wait_event(init_cuda): + s1 = Device().create_stream() + s2 = Device().create_stream() + e1 = s1.record() + s2.wait(e1) # Should not raise any exceptions + s2.sync() + + +def test_stream_wait_invalid_event(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + with pytest.raises(ValueError): + stream.wait(event_or_stream="invalid_event") + + +def test_stream_device(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + device = stream.device + assert isinstance(device, Device) + + +def test_stream_context(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + context = stream.context + assert context is not None + + +def test_stream_from_foreign_stream(init_cuda): + device = Device() + other_stream = device.create_stream(options=StreamOptions()) + stream = device.create_stream(obj=other_stream) + assert other_stream.handle == stream.handle + device = stream.device + assert isinstance(device, Device) + context = stream.context + assert context is not None + + +def test_stream_from_handle(): + stream = Stream.from_handle(0) + assert isinstance(stream, Stream) + + +def test_legacy_default_stream(): + assert isinstance(LEGACY_DEFAULT_STREAM, Stream) + + +def test_per_thread_default_stream(): + assert isinstance(PER_THREAD_DEFAULT_STREAM, Stream) + + +def test_default_stream(): + stream = default_stream() + assert isinstance(stream, Stream) From 702fbaa550f1b40f14fa35a656bcfc5817b96ff9 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 11:54:23 -0800 Subject: [PATCH 051/111] handle culink and nvjitlink differences in the backend and test --- cuda_core/cuda/core/experimental/_linker.py | 50 ++++++++------------ cuda_core/tests/test_linker.py | 51 +++++++++++++++------ 2 files changed, 55 insertions(+), 46 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py 
b/cuda_core/cuda/core/experimental/_linker.py
index 79328583..39d6cd27 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -29,7 +29,6 @@ def _lazy_init():
     _driver_ver = handle_return(cuda.cuDriverGetVersion())
     _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10)
     try:
-        raise ImportError
         from cuda.bindings import nvjitlink
         from cuda.bindings._internal import nvjitlink as inner_nvjitlink
     except ImportError:
@@ -247,7 +246,7 @@ def _init_nvjitlink(self):
             self.formatted_options.append(f"-split-compile={self.split_compile}")
         if self.split_compile_extended is not None:
             self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}")
-        if self.no_cache is not None:
+        if self.no_cache is True:
             self.formatted_options.append("-no-cache")
 
     def _init_driver(self):
@@ -272,57 +271,46 @@ def _init_driver(self):
             self.formatted_options.append(self.max_register_count)
             self.option_keys.append(_driver.CUjit_option.CU_JIT_MAX_REGISTERS)
         if self.time is not None:
-            self.formatted_options.append(1)  # ctypes.c_int32(1)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_WALL_TIME)
+            raise ValueError("time option is not supported by the driver API")
         if self.verbose is not None:
-            self.formatted_options.append(1)  # ctypes.c_int32(1)
+            self.formatted_options.append(1)
             self.option_keys.append(_driver.CUjit_option.CU_JIT_LOG_VERBOSE)
         if self.link_time_optimization is not None:
-            self.formatted_options.append(1)  # ctypes.c_int32(1)
+            self.formatted_options.append(1)
             self.option_keys.append(_driver.CUjit_option.CU_JIT_LTO)
         if self.ptx is not None:
-            self.formatted_options.append(1)  # ctypes.c_int32(1)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO)
+            raise ValueError("ptx option is not supported by the driver API")
         if self.optimization_level is not None:
             self.formatted_options.append(self.optimization_level)
             self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZATION_LEVEL)
         if self.debug is not None:
-            self.formatted_options.append(1)  # ctypes.c_int32(1)
+            self.formatted_options.append(1)
             self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_DEBUG_INFO)
         if self.lineinfo is not None:
-            self.formatted_options.append(1)  # ctypes.c_int32(1)
+            self.formatted_options.append(1)
             self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO)
         if self.ftz is not None:
-            self.formatted_options.append(1 if self.ftz else 0)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_FTZ)
+            raise ValueError("ftz option is deprecated in the driver API")
         if self.prec_div is not None:
-            self.formatted_options.append(1 if self.prec_div else 0)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_PREC_DIV)
+            raise ValueError("prec_div option is deprecated in the driver API")
         if self.prec_sqrt is not None:
-            self.formatted_options.append(1 if self.prec_sqrt else 0)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_PREC_SQRT)
+            raise ValueError("prec_sqrt option is deprecated in the driver API")
         if self.fma is not None:
-            self.formatted_options.append(1 if self.fma else 0)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_FMA)
+            raise ValueError("fma option is deprecated in the driver API")
         if self.kernels_used is not None:
-            for kernel in self.kernels_used:
-                self.formatted_options.append(kernel.encode())
-                self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_KERNEL_NAMES)
+            raise ValueError("kernels_used is deprecated in the driver API")
         if self.variables_used is not None:
-            for
variable in self.variables_used: - self.formatted_options.append(variable.encode()) - self.option_keys.append(_driver.CUjit_option.CU_JIT_REFERENCED_VARIABLE_NAMES) + raise ValueError("variables_used is deprecated in the driver API") if self.optimize_unused_variables is not None: - self.formatted_options.append(1) # ctypes.c_int32(1) - self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES) + raise ValueError("optimize_unused_variables is deprecated in the driver API") if self.xptxas is not None: - for opt in self.xptxas: - raise NotImplementedError("TODO: implement xptxas option") + raise ValueError("xptxas option is not supported by the driver API") + if self.split_compile is not None: + raise ValueError("split_compile option is not supported by the driver API") if self.split_compile_extended is not None: - self.formatted_options.append(self.split_compile_extended) - self.option_keys.append(_driver.CUjit_option.CU_JIT_MIN_CTA_PER_SM) + raise ValueError("split_compile_extended option is not supported by the driver API") if self.no_cache is not None: - self.formatted_options.append(1) # ctypes.c_int32(1) + self.formatted_options.append(_driver.CUjit_cacheMode.CU_JIT_CACHE_OPTION_NONE) self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 3937c878..db9ff657 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -8,6 +8,17 @@ basic_kernel = "__device__ int B() { return 0; }" addition_kernel = "__device__ int C(int a, int b) { return a + b; }" +try: + from cuda.bindings import nvjitlink # noqa F401 + from cuda.bindings._internal import nvjitlink as inner_nvjitlink +except ImportError: + # binding is not available + culink_backend = True +else: + if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: + # binding is available, but nvJitLink is not installed + culink_backend = True + @pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): @@ -27,27 +38,36 @@ def compile_ltoir_functions(init_cuda): return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir +culink_options = [ + LinkerOptions(arch=ARCH), + LinkerOptions(arch=ARCH, max_register_count=32), + LinkerOptions(arch=ARCH, verbose=True), + LinkerOptions(arch=ARCH, optimization_level=3), + LinkerOptions(arch=ARCH, debug=True), + LinkerOptions(arch=ARCH, lineinfo=True), + LinkerOptions(arch=ARCH, no_cache=True), +] + + @pytest.mark.parametrize( "options", - [ - LinkerOptions(arch=ARCH), - LinkerOptions(arch=ARCH, max_register_count=32), + culink_options + if culink_backend + else culink_options + + [ LinkerOptions(arch=ARCH, time=True), - LinkerOptions(arch=ARCH, verbose=True), - LinkerOptions(arch=ARCH, optimization_level=3), - LinkerOptions(arch=ARCH, debug=True), - LinkerOptions(arch=ARCH, lineinfo=True), LinkerOptions(arch=ARCH, ftz=True), LinkerOptions(arch=ARCH, prec_div=True), LinkerOptions(arch=ARCH, prec_sqrt=True), LinkerOptions(arch=ARCH, fma=True), LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), + LinkerOptions(arch=ARCH, kernels_used=["kernel1", "kernel2"]), LinkerOptions(arch=ARCH, variables_used=["var1"]), + LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]), LinkerOptions(arch=ARCH, optimize_unused_variables=True), - # LinkerOptions(arch=ARCH, xptxas=["-v"]), - # LinkerOptions(arch=ARCH, split_compile=0), + LinkerOptions(arch=ARCH, xptxas=["-v"]), + LinkerOptions(arch=ARCH, split_compile=0), LinkerOptions(arch=ARCH, 
split_compile_extended=1),
-        # LinkerOptions(arch=ARCH, no_cache=True),
     ],
 )
 def test_linker_init(compile_ptx_functions, options):
@@ -62,11 +82,12 @@ def test_linker_init_invalid_arch():
         Linker(options)
 
 
-# def test_linker_link_ptx(compile_ltoir_functions):
-#     options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True)
-#     linker = Linker(*compile_ltoir_functions, options=options)
-#     linked_code = linker.link("ptx")
-#     assert isinstance(linked_code, ObjectCode)
+@pytest.mark.skipif(culink_backend, reason="culink does not support ptx option")
+def test_linker_link_ptx(compile_ltoir_functions):
+    options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True)
+    linker = Linker(*compile_ltoir_functions, options=options)
+    linked_code = linker.link("ptx")
+    assert isinstance(linked_code, ObjectCode)
 
 
 def test_linker_link_cubin(compile_ptx_functions):
From 996ab39a58d1e9495e9ba946527164879fc648f8 Mon Sep 17 00:00:00 2001
From: ksimpson
Date: Wed, 4 Dec 2024 13:45:05 -0800
Subject: [PATCH 052/111] update line endings

---
 cuda_core/cuda/core/experimental/_linker.py | 888 ++++++++++----------
 cuda_core/tests/test_linker.py              | 240 +++---
 2 files changed, 564 insertions(+), 564 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
index 39d6cd27..7d95d371 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -1,444 +1,444 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
-
-import ctypes
-import weakref
-from dataclasses import dataclass
-from typing import List, Optional
-
-from cuda import cuda
-from cuda.core.experimental._module import ObjectCode
-from cuda.core.experimental._utils import check_or_create_options, handle_return
-
-# TODO: revisit this treatment for py313t builds
-_driver = None  # populated if nvJitLink cannot be used
-_driver_input_types = None  # populated if nvJitLink cannot be used
-_driver_ver = None
-_inited = False
-_nvjitlink = None  # populated if nvJitLink can be used
-_nvjitlink_input_types = None  # populated if nvJitLink can be used
-
-
-def _lazy_init():
-    global _inited
-    if _inited:
-        return
-
-    global _driver, _driver_input_types, _driver_ver, _nvjitlink, _nvjitlink_input_types
-    _driver_ver = handle_return(cuda.cuDriverGetVersion())
-    _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10)
-    try:
-        from cuda.bindings import nvjitlink
-        from cuda.bindings._internal import nvjitlink as inner_nvjitlink
-    except ImportError:
-        # binding is not available
-        nvjitlink = None
-    else:
-        if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0:
-            # binding is available, but nvJitLink is not installed
-            nvjitlink = None
-        elif _driver_ver > nvjitlink.version():
-            # TODO: nvJitLink is not new enough, warn?
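# Worked example (illustrative values only) of the version handling above:
# cuDriverGetVersion() returns 1000 * major + 10 * minor, so a CUDA 12.4
# driver comes back as 12040, and the tuple form makes comparisons against
# nvjitlink.version() lexicographic.
encoded = 12040  # assumed encoded value for a CUDA 12.4 driver
major, minor = encoded // 1000, (encoded % 1000) // 10
assert (major, minor) == (12, 4)
assert (12, 4) > (12, 3)  # plain tuple comparison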
- pass - if nvjitlink: - _nvjitlink = nvjitlink - _nvjitlink_input_types = { - "ptx": _nvjitlink.InputType.PTX, - "cubin": _nvjitlink.InputType.CUBIN, - "fatbin": _nvjitlink.InputType.FATBIN, - "ltoir": _nvjitlink.InputType.LTOIR, - "object": _nvjitlink.InputType.OBJECT, - } - else: - from cuda import cuda as _driver - - _driver_input_types = { - "ptx": _driver.CUjitInputType.CU_JIT_INPUT_PTX, - "cubin": _driver.CUjitInputType.CU_JIT_INPUT_CUBIN, - "fatbin": _driver.CUjitInputType.CU_JIT_INPUT_FATBINARY, - "object": _driver.CUjitInputType.CU_JIT_INPUT_OBJECT, - } - _inited = True - - -@dataclass -class LinkerOptions: - """Customizable :obj:`LinkerOptions` for nvJitLink. - - Attributes - ---------- - arch : str - Pass SM architecture value. Can use compute_ value instead if only generating PTX. - This is a required option. - Acceptable value type: str - Maps to: -arch=sm_ - max_register_count : int, optional - Maximum register count. - Default: None - Acceptable value type: int - Maps to: -maxrregcount= - time : bool, optional - Print timing information to InfoLog. - Default: False - Acceptable value type: bool - Maps to: -time - verbose : bool, optional - Print verbose messages to InfoLog. - Default: False - Acceptable value type: bool - Maps to: -verbose - link_time_optimization : bool, optional - Perform link time optimization. - Default: False - Acceptable value type: bool - Maps to: -lto - ptx : bool, optional - Emit PTX after linking instead of CUBIN; only supported with -lto. - Default: False - Acceptable value type: bool - Maps to: -ptx - optimization_level : int, optional - Set optimization level. Only 0 and 3 are accepted. - Default: None - Acceptable value type: int - Maps to: -O - debug : bool, optional - Generate debug information. - Default: False - Acceptable value type: bool - Maps to: -g - lineinfo : bool, optional - Generate line information. - Default: False - Acceptable value type: bool - Maps to: -lineinfo - ftz : bool, optional - Flush denormal values to zero. - Default: False - Acceptable value type: bool - Maps to: -ftz= - prec_div : bool, optional - Use precise division. - Default: True - Acceptable value type: bool - Maps to: -prec-div= - prec_sqrt : bool, optional - Use precise square root. - Default: True - Acceptable value type: bool - Maps to: -prec-sqrt= - fma : bool, optional - Use fast multiply-add. - Default: True - Acceptable value type: bool - Maps to: -fma= - kernels_used : List[str], optional - Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple - times. - Default: None - Acceptable value type: list of str - Maps to: -kernels-used= - variables_used : List[str], optional - Pass list of variables that are used; any not in the list can be removed. This option can be specified multiple - times. - Default: None - Acceptable value type: list of str - Maps to: -variables-used= - optimize_unused_variables : bool, optional - Assume that if a variable is not referenced in device code, it can be removed. - Default: False - Acceptable value type: bool - Maps to: -optimize-unused-variables - xptxas : List[str], optional - Pass options to PTXAS. This option can be called multiple times. - Default: None - Acceptable value type: list of str - Maps to: -Xptxas= - split_compile : int, optional - Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split - compilation (default). 
- Default: 1 - Acceptable value type: int - Maps to: -split-compile= - split_compile_extended : int, optional - A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. - Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This - option can potentially impact performance of the compiled binary. - Default: 1 - Acceptable value type: int - Maps to: -split-compile-extended= - no_cache : bool, optional - Do not cache the intermediate steps of nvJitLink. - Default: False - Acceptable value type: bool - Maps to: -no-cache - """ - - arch: str - max_register_count: Optional[int] = None - time: Optional[bool] = None - verbose: Optional[bool] = None - link_time_optimization: Optional[bool] = None - ptx: Optional[bool] = None - optimization_level: Optional[int] = None - debug: Optional[bool] = None - lineinfo: Optional[bool] = None - ftz: Optional[bool] = None - prec_div: Optional[bool] = None - prec_sqrt: Optional[bool] = None - fma: Optional[bool] = None - kernels_used: Optional[List[str]] = None - variables_used: Optional[List[str]] = None - optimize_unused_variables: Optional[bool] = None - xptxas: Optional[List[str]] = None - split_compile: Optional[int] = None - split_compile_extended: Optional[int] = None - no_cache: Optional[bool] = None - - def __post_init__(self): - _lazy_init() - self.formatted_options = [] - if _nvjitlink: - self._init_nvjitlink() - else: - self._init_driver() - - def _init_nvjitlink(self): - if self.arch is not None: - self.formatted_options.append(f"-arch={self.arch}") - if self.max_register_count is not None: - self.formatted_options.append(f"-maxrregcount={self.max_register_count}") - if self.time is not None: - self.formatted_options.append("-time") - if self.verbose is not None: - self.formatted_options.append("-verbose") - if self.link_time_optimization is not None: - self.formatted_options.append("-lto") - if self.ptx is not None: - self.formatted_options.append("-ptx") - if self.optimization_level is not None: - self.formatted_options.append(f"-O{self.optimization_level}") - if self.debug is not None: - self.formatted_options.append("-g") - if self.lineinfo is not None: - self.formatted_options.append("-lineinfo") - if self.ftz is not None: - self.formatted_options.append(f"-ftz={'true' if self.ftz else 'false'}") - if self.prec_div is not None: - self.formatted_options.append(f"-prec-div={'true' if self.prec_div else 'false'}") - if self.prec_sqrt is not None: - self.formatted_options.append(f"-prec-sqrt={'true' if self.prec_sqrt else 'false'}") - if self.fma is not None: - self.formatted_options.append(f"-fma={'true' if self.fma else 'false'}") - if self.kernels_used is not None: - for kernel in self.kernels_used: - self.formatted_options.append(f"-kernels-used={kernel}") - if self.variables_used is not None: - for variable in self.variables_used: - self.formatted_options.append(f"-variables-used={variable}") - if self.optimize_unused_variables is not None: - self.formatted_options.append("-optimize-unused-variables") - if self.xptxas is not None: - for opt in self.xptxas: - self.formatted_options.append(f"-Xptxas={opt}") - if self.split_compile is not None: - self.formatted_options.append(f"-split-compile={self.split_compile}") - if self.split_compile_extended is not None: - self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}") - if self.no_cache is True: - self.formatted_options.append("-no-cache") - - def 
_init_driver(self):
-        self.option_keys = []
-        # allocate 4 MiB each for info/error logs
-        size = 4194304
-        self.formatted_options.extend((bytearray(size), size, bytearray(size), size))
-        self.option_keys.extend(
-            (
-                _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER,
-                _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
-                _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER,
-                _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
-            )
-        )
-
-        if self.arch is not None:
-            arch = self.arch.split("_")[-1].upper()
-            self.formatted_options.append(getattr(_driver.CUjit_target, f"CU_TARGET_COMPUTE_{arch}"))
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_TARGET)
-        if self.max_register_count is not None:
-            self.formatted_options.append(self.max_register_count)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_MAX_REGISTERS)
-        if self.time is not None:
-            raise ValueError("time option is not supported by the driver API")
-        if self.verbose is not None:
-            self.formatted_options.append(1)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_LOG_VERBOSE)
-        if self.link_time_optimization is not None:
-            self.formatted_options.append(1)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_LTO)
-        if self.ptx is not None:
-            raise ValueError("ptx option is not supported by the driver API")
-        if self.optimization_level is not None:
-            self.formatted_options.append(self.optimization_level)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZATION_LEVEL)
-        if self.debug is not None:
-            self.formatted_options.append(1)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_DEBUG_INFO)
-        if self.lineinfo is not None:
-            self.formatted_options.append(1)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO)
-        if self.ftz is not None:
-            raise ValueError("ftz option is deprecated in the driver API")
-        if self.prec_div is not None:
-            raise ValueError("prec_div option is deprecated in the driver API")
-        if self.prec_sqrt is not None:
-            raise ValueError("prec_sqrt option is deprecated in the driver API")
-        if self.fma is not None:
-            raise ValueError("fma option is deprecated in the driver API")
-        if self.kernels_used is not None:
-            raise ValueError("kernels_used is deprecated in the driver API")
-        if self.variables_used is not None:
-            raise ValueError("variables_used is deprecated in the driver API")
-        if self.optimize_unused_variables is not None:
-            raise ValueError("optimize_unused_variables is deprecated in the driver API")
-        if self.xptxas is not None:
-            raise ValueError("xptxas option is not supported by the driver API")
-        if self.split_compile is not None:
-            raise ValueError("split_compile option is not supported by the driver API")
-        if self.split_compile_extended is not None:
-            raise ValueError("split_compile_extended option is not supported by the driver API")
-        if self.no_cache is not None:
-            self.formatted_options.append(_driver.CUjit_cacheMode.CU_JIT_CACHE_OPTION_NONE)
-            self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE)
-
-
-class Linker:
-    """
-    Linker class for managing the linking of object codes with specified options.
-
-    Parameters
-    ----------
-    object_codes : ObjectCode
-        One or more ObjectCode objects to be linked.
-    options : LinkerOptions, optional
-        Options for the linker. If not provided, default options will be used.
- """ - - class _MembersNeededForFinalize: - __slots__ = ("handle", "use_nvjitlink") - - def __init__(self, program_obj, handle, use_nvjitlink): - self.handle = handle - self.use_nvjitlink = use_nvjitlink - weakref.finalize(program_obj, self.close) - - def close(self): - if self.handle is not None: - if self.use_nvjitlink: - _nvjitlink.destroy(self.handle) - else: - handle_return(_driver.cuLinkDestroy(self.handle)) - self.handle = None - - __slots__ = ("__weakref__", "_mnff", "_options") - - def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): - if len(object_codes) == 0: - raise ValueError("At least one ObjectCode object must be provided") - - self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") - if _nvjitlink: - handle = _nvjitlink.create(len(options.formatted_options), options.formatted_options) - use_nvjitlink = True - else: - handle = handle_return( - _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options) - ) - use_nvjitlink = False - self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink) - - for code in object_codes: - assert isinstance(code, ObjectCode) - self._add_code_object(code) - - def _add_code_object(self, object_code: ObjectCode): - data = object_code._module - assert isinstance(data, bytes) - if _nvjitlink: - _nvjitlink.add_data( - self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), - data, - len(data), - f"{object_code._handle}_{object_code._code_type}", - ) - else: - handle_return( - _driver.cuLinkAddData( - self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), - data, - len(data), - f"{object_code._handle}_{object_code._code_type}".encode(), - 0, - None, - None, - ) - ) - - def link(self, target_type) -> ObjectCode: - if target_type not in ("cubin", "ptx"): - raise ValueError(f"Unsupported target type: {target_type}") - if _nvjitlink: - _nvjitlink.complete(self._mnff.handle) - if target_type == "cubin": - get_size = _nvjitlink.get_linked_cubin_size - get_code = _nvjitlink.get_linked_cubin - else: - get_size = _nvjitlink.get_linked_ptx_size - get_code = _nvjitlink.get_linked_ptx - - size = get_size(self._mnff.handle) - code = bytearray(size) - get_code(self._mnff.handle, code) - else: - addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) - code = (ctypes.c_char * size).from_address(addr) - - return ObjectCode(bytes(code), target_type) - - def get_error_log(self) -> str: - if _nvjitlink: - log_size = _nvjitlink.get_error_log_size(self._mnff.handle) - log = bytearray(log_size) - _nvjitlink.get_error_log(self._mnff.handle, log) - else: - log = self._options.formatted_options[2] - return log.decode() - - def get_info_log(self) -> str: - if _nvjitlink: - log_size = _nvjitlink.get_info_log_size(self._mnff.handle) - log = bytearray(log_size) - _nvjitlink.get_info_log(self._mnff.handle, log) - else: - log = self._options.formatted_options[0] - return log.decode() - - def _input_type_from_code_type(self, code_type: str): - # this list is based on the supported values for code_type in the ObjectCode class definition. 
-        # nvJitLink/driver support other options for input type
-        input_type = _nvjitlink_input_types.get(code_type) if _nvjitlink else _driver_input_types.get(code_type)
-
-        if input_type is None:
-            raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}")
-        return input_type
-
-    @property
-    def handle(self) -> int:
-        return self._mnff.handle
-
-    def close(self):
-        self._mnff.close()
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import ctypes
+import weakref
+from dataclasses import dataclass
+from typing import List, Optional
+
+from cuda import cuda
+from cuda.core.experimental._module import ObjectCode
+from cuda.core.experimental._utils import check_or_create_options, handle_return
+
+# TODO: revisit this treatment for py313t builds
+_driver = None  # populated if nvJitLink cannot be used
+_driver_input_types = None  # populated if nvJitLink cannot be used
+_driver_ver = None
+_inited = False
+_nvjitlink = None  # populated if nvJitLink can be used
+_nvjitlink_input_types = None  # populated if nvJitLink can be used
+
+
+def _lazy_init():
+    global _inited
+    if _inited:
+        return
+
+    global _driver, _driver_input_types, _driver_ver, _nvjitlink, _nvjitlink_input_types
+    _driver_ver = handle_return(cuda.cuDriverGetVersion())
+    _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10)
+    try:
+        from cuda.bindings import nvjitlink
+        from cuda.bindings._internal import nvjitlink as inner_nvjitlink
+    except ImportError:
+        # binding is not available
+        nvjitlink = None
+    else:
+        if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0:
+            # binding is available, but nvJitLink is not installed
+            nvjitlink = None
+        elif _driver_ver > nvjitlink.version():
+            # TODO: nvJitLink is not new enough, warn?
+            pass
+    if nvjitlink:
+        _nvjitlink = nvjitlink
+        _nvjitlink_input_types = {
+            "ptx": _nvjitlink.InputType.PTX,
+            "cubin": _nvjitlink.InputType.CUBIN,
+            "fatbin": _nvjitlink.InputType.FATBIN,
+            "ltoir": _nvjitlink.InputType.LTOIR,
+            "object": _nvjitlink.InputType.OBJECT,
+        }
+    else:
+        from cuda import cuda as _driver
+
+        _driver_input_types = {
+            "ptx": _driver.CUjitInputType.CU_JIT_INPUT_PTX,
+            "cubin": _driver.CUjitInputType.CU_JIT_INPUT_CUBIN,
+            "fatbin": _driver.CUjitInputType.CU_JIT_INPUT_FATBINARY,
+            "object": _driver.CUjitInputType.CU_JIT_INPUT_OBJECT,
+        }
+    _inited = True
+
+
+@dataclass
+class LinkerOptions:
+    """Customizable :obj:`LinkerOptions` for nvJitLink.
+
+    Attributes
+    ----------
+    arch : str
+        Pass SM architecture value. Can use compute_ value instead if only generating PTX.
+        This is a required option.
+        Acceptable value type: str
+        Maps to: -arch=sm_
+    max_register_count : int, optional
+        Maximum register count.
+        Default: None
+        Acceptable value type: int
+        Maps to: -maxrregcount=
+    time : bool, optional
+        Print timing information to InfoLog.
+        Default: False
+        Acceptable value type: bool
+        Maps to: -time
+    verbose : bool, optional
+        Print verbose messages to InfoLog.
+        Default: False
+        Acceptable value type: bool
+        Maps to: -verbose
+    link_time_optimization : bool, optional
+        Perform link time optimization.
+        Default: False
+        Acceptable value type: bool
+        Maps to: -lto
+    ptx : bool, optional
+        Emit PTX after linking instead of CUBIN; only supported with -lto.
+        Default: False
+        Acceptable value type: bool
+        Maps to: -ptx
+    optimization_level : int, optional
+        Set optimization level. Only 0 and 3 are accepted.
+ Default: None + Acceptable value type: int + Maps to: -O + debug : bool, optional + Generate debug information. + Default: False + Acceptable value type: bool + Maps to: -g + lineinfo : bool, optional + Generate line information. + Default: False + Acceptable value type: bool + Maps to: -lineinfo + ftz : bool, optional + Flush denormal values to zero. + Default: False + Acceptable value type: bool + Maps to: -ftz= + prec_div : bool, optional + Use precise division. + Default: True + Acceptable value type: bool + Maps to: -prec-div= + prec_sqrt : bool, optional + Use precise square root. + Default: True + Acceptable value type: bool + Maps to: -prec-sqrt= + fma : bool, optional + Use fast multiply-add. + Default: True + Acceptable value type: bool + Maps to: -fma= + kernels_used : List[str], optional + Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple + times. + Default: None + Acceptable value type: list of str + Maps to: -kernels-used= + variables_used : List[str], optional + Pass list of variables that are used; any not in the list can be removed. This option can be specified multiple + times. + Default: None + Acceptable value type: list of str + Maps to: -variables-used= + optimize_unused_variables : bool, optional + Assume that if a variable is not referenced in device code, it can be removed. + Default: False + Acceptable value type: bool + Maps to: -optimize-unused-variables + xptxas : List[str], optional + Pass options to PTXAS. This option can be called multiple times. + Default: None + Acceptable value type: list of str + Maps to: -Xptxas= + split_compile : int, optional + Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split + compilation (default). + Default: 1 + Acceptable value type: int + Maps to: -split-compile= + split_compile_extended : int, optional + A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. + Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This + option can potentially impact performance of the compiled binary. + Default: 1 + Acceptable value type: int + Maps to: -split-compile-extended= + no_cache : bool, optional + Do not cache the intermediate steps of nvJitLink. 
+        Default: False
+        Acceptable value type: bool
+        Maps to: -no-cache
+    """
+
+    arch: str
+    max_register_count: Optional[int] = None
+    time: Optional[bool] = None
+    verbose: Optional[bool] = None
+    link_time_optimization: Optional[bool] = None
+    ptx: Optional[bool] = None
+    optimization_level: Optional[int] = None
+    debug: Optional[bool] = None
+    lineinfo: Optional[bool] = None
+    ftz: Optional[bool] = None
+    prec_div: Optional[bool] = None
+    prec_sqrt: Optional[bool] = None
+    fma: Optional[bool] = None
+    kernels_used: Optional[List[str]] = None
+    variables_used: Optional[List[str]] = None
+    optimize_unused_variables: Optional[bool] = None
+    xptxas: Optional[List[str]] = None
+    split_compile: Optional[int] = None
+    split_compile_extended: Optional[int] = None
+    no_cache: Optional[bool] = None
+
+    def __post_init__(self):
+        _lazy_init()
+        self.formatted_options = []
+        if _nvjitlink:
+            self._init_nvjitlink()
+        else:
+            self._init_driver()
+
+    def _init_nvjitlink(self):
+        if self.arch is not None:
+            self.formatted_options.append(f"-arch={self.arch}")
+        if self.max_register_count is not None:
+            self.formatted_options.append(f"-maxrregcount={self.max_register_count}")
+        if self.time is not None:
+            self.formatted_options.append("-time")
+        if self.verbose is not None:
+            self.formatted_options.append("-verbose")
+        if self.link_time_optimization is not None:
+            self.formatted_options.append("-lto")
+        if self.ptx is not None:
+            self.formatted_options.append("-ptx")
+        if self.optimization_level is not None:
+            self.formatted_options.append(f"-O{self.optimization_level}")
+        if self.debug is not None:
+            self.formatted_options.append("-g")
+        if self.lineinfo is not None:
+            self.formatted_options.append("-lineinfo")
+        if self.ftz is not None:
+            self.formatted_options.append(f"-ftz={'true' if self.ftz else 'false'}")
+        if self.prec_div is not None:
+            self.formatted_options.append(f"-prec-div={'true' if self.prec_div else 'false'}")
+        if self.prec_sqrt is not None:
+            self.formatted_options.append(f"-prec-sqrt={'true' if self.prec_sqrt else 'false'}")
+        if self.fma is not None:
+            self.formatted_options.append(f"-fma={'true' if self.fma else 'false'}")
+        if self.kernels_used is not None:
+            for kernel in self.kernels_used:
+                self.formatted_options.append(f"-kernels-used={kernel}")
+        if self.variables_used is not None:
+            for variable in self.variables_used:
+                self.formatted_options.append(f"-variables-used={variable}")
+        if self.optimize_unused_variables is not None:
+            self.formatted_options.append("-optimize-unused-variables")
+        if self.xptxas is not None:
+            for opt in self.xptxas:
+                self.formatted_options.append(f"-Xptxas={opt}")
+        if self.split_compile is not None:
+            self.formatted_options.append(f"-split-compile={self.split_compile}")
+        if self.split_compile_extended is not None:
+            self.formatted_options.append(f"-split-compile-extended={self.split_compile_extended}")
+        if self.no_cache is True:
+            self.formatted_options.append("-no-cache")
+
+    def _init_driver(self):
+        self.option_keys = []
+        # allocate 4 MiB each for info/error logs
+        size = 4194304
+        self.formatted_options.extend((bytearray(size), size, bytearray(size), size))
+        self.option_keys.extend(
+            (
+                _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER,
+                _driver.CUjit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
+                _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER,
+                _driver.CUjit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+            )
+        )
+
+        if self.arch is not None:
+            arch = self.arch.split("_")[-1].upper()
+            self.formatted_options.append(getattr(_driver.CUjit_target,
f"CU_TARGET_COMPUTE_{arch}")) + self.option_keys.append(_driver.CUjit_option.CU_JIT_TARGET) + if self.max_register_count is not None: + self.formatted_options.append(self.max_register_count) + self.option_keys.append(_driver.CUjit_option.CU_JIT_MAX_REGISTERS) + if self.time is not None: + raise ValueError("time option is not supported by the driver API") + if self.verbose is not None: + self.formatted_options.append(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_LOG_VERBOSE) + if self.link_time_optimization is not None: + self.formatted_options.append(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_LTO) + if self.ptx is not None: + raise ValueError("ptx option is not supported by the driver API") + if self.optimization_level is not None: + self.formatted_options.append(self.optimization_level) + self.option_keys.append(_driver.CUjit_option.CU_JIT_OPTIMIZATION_LEVEL) + if self.debug is not None: + self.formatted_options.append(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_DEBUG_INFO) + if self.lineinfo is not None: + self.formatted_options.append(1) + self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO) + if self.ftz is not None: + raise ValueError("ftz option is deprecated in the driver API") + if self.prec_div is not None: + raise ValueError("prec_div option is deprecated in the driver API") + if self.prec_sqrt is not None: + raise ValueError("prec_sqrt option is deprecated in the driver API") + if self.fma is not None: + raise ValueError("fma options is deprecated in the driver API") + if self.kernels_used is not None: + raise ValueError("kernels_used is deprecated in the driver API") + if self.variables_used is not None: + raise ValueError("variables_used is deprecated in the driver API") + if self.optimize_unused_variables is not None: + raise ValueError("optimize_unused_variables is deprecated in the driver API") + if self.xptxas is not None: + raise ValueError("xptxas option is not supported by the driver API") + if self.split_compile is not None: + raise ValueError("split_compile option is not supported by the driver API") + if self.split_compile_extended is not None: + raise ValueError("split_compile_extended option is not supported by the driver API") + if self.no_cache is not None: + self.formatted_options.append(_driver.CUjit_cacheMode.CU_JIT_CACHE_OPTION_NONE) + self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) + + +class Linker: + """ + Linker class for managing the linking of object codes with specified options. + + Parameters + ---------- + object_codes : ObjectCode + One or more ObjectCode objects to be linked. + options : LinkerOptions, optional + Options for the linker. If not provided, default options will be used. 
+ """ + + class _MembersNeededForFinalize: + __slots__ = ("handle", "use_nvjitlink") + + def __init__(self, program_obj, handle, use_nvjitlink): + self.handle = handle + self.use_nvjitlink = use_nvjitlink + weakref.finalize(program_obj, self.close) + + def close(self): + if self.handle is not None: + if self.use_nvjitlink: + _nvjitlink.destroy(self.handle) + else: + handle_return(_driver.cuLinkDestroy(self.handle)) + self.handle = None + + __slots__ = ("__weakref__", "_mnff", "_options") + + def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): + if len(object_codes) == 0: + raise ValueError("At least one ObjectCode object must be provided") + + self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") + if _nvjitlink: + handle = _nvjitlink.create(len(options.formatted_options), options.formatted_options) + use_nvjitlink = True + else: + handle = handle_return( + _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options) + ) + use_nvjitlink = False + self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink) + + for code in object_codes: + assert isinstance(code, ObjectCode) + self._add_code_object(code) + + def _add_code_object(self, object_code: ObjectCode): + data = object_code._module + assert isinstance(data, bytes) + if _nvjitlink: + _nvjitlink.add_data( + self._mnff.handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}", + ) + else: + handle_return( + _driver.cuLinkAddData( + self._mnff.handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}".encode(), + 0, + None, + None, + ) + ) + + def link(self, target_type) -> ObjectCode: + if target_type not in ("cubin", "ptx"): + raise ValueError(f"Unsupported target type: {target_type}") + if _nvjitlink: + _nvjitlink.complete(self._mnff.handle) + if target_type == "cubin": + get_size = _nvjitlink.get_linked_cubin_size + get_code = _nvjitlink.get_linked_cubin + else: + get_size = _nvjitlink.get_linked_ptx_size + get_code = _nvjitlink.get_linked_ptx + + size = get_size(self._mnff.handle) + code = bytearray(size) + get_code(self._mnff.handle, code) + else: + addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) + code = (ctypes.c_char * size).from_address(addr) + + return ObjectCode(bytes(code), target_type) + + def get_error_log(self) -> str: + if _nvjitlink: + log_size = _nvjitlink.get_error_log_size(self._mnff.handle) + log = bytearray(log_size) + _nvjitlink.get_error_log(self._mnff.handle, log) + else: + log = self._options.formatted_options[2] + return log.decode() + + def get_info_log(self) -> str: + if _nvjitlink: + log_size = _nvjitlink.get_info_log_size(self._mnff.handle) + log = bytearray(log_size) + _nvjitlink.get_info_log(self._mnff.handle, log) + else: + log = self._options.formatted_options[0] + return log.decode() + + def _input_type_from_code_type(self, code_type: str): + # this list is based on the supported values for code_type in the ObjectCode class definition. 
+ # nvJitLink/driver support other options for input type + input_type = _nvjitlink_input_types.get(code_type) if _nvjitlink else _driver_input_types.get(code_type) + + if input_type is None: + raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") + return input_type + + @property + def handle(self) -> int: + return self._mnff.handle + + def close(self): + self._mnff.close() diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index db9ff657..15496b59 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -1,120 +1,120 @@ -import pytest - -from cuda.core.experimental import Linker, LinkerOptions, Program -from cuda.core.experimental._module import ObjectCode - -ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper -empty_kernel = "__device__ void A() {}" -basic_kernel = "__device__ int B() { return 0; }" -addition_kernel = "__device__ int C(int a, int b) { return a + b; }" - -try: - from cuda.bindings import nvjitlink # noqa F401 - from cuda.bindings._internal import nvjitlink as inner_nvjitlink -except ImportError: - # binding is not available - culink_backend = True -else: - if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: - # binding is available, but nvJitLink is not installed - culink_backend = True - - -@pytest.fixture(scope="function") -def compile_ptx_functions(init_cuda): - object_code_a_ptx = Program(empty_kernel, "c++").compile("ptx") - object_code_b_ptx = Program(basic_kernel, "c++").compile("ptx") - object_code_c_ptx = Program(addition_kernel, "c++").compile("ptx") - - return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx - - -@pytest.fixture(scope="function") -def compile_ltoir_functions(init_cuda): - object_code_a_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) - object_code_b_ltoir = Program(basic_kernel, "c++").compile("ltoir", options=("-dlto",)) - object_code_c_ltoir = Program(addition_kernel, "c++").compile("ltoir", options=("-dlto",)) - - return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir - - -culink_options = [ - LinkerOptions(arch=ARCH), - LinkerOptions(arch=ARCH, max_register_count=32), - LinkerOptions(arch=ARCH, verbose=True), - LinkerOptions(arch=ARCH, optimization_level=3), - LinkerOptions(arch=ARCH, debug=True), - LinkerOptions(arch=ARCH, lineinfo=True), - LinkerOptions(arch=ARCH, no_cache=True), -] - - -@pytest.mark.parametrize( - "options", - culink_options - if culink_backend - else culink_options - + [ - LinkerOptions(arch=ARCH, time=True), - LinkerOptions(arch=ARCH, ftz=True), - LinkerOptions(arch=ARCH, prec_div=True), - LinkerOptions(arch=ARCH, prec_sqrt=True), - LinkerOptions(arch=ARCH, fma=True), - LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), - LinkerOptions(arch=ARCH, kernels_used=["kernel1", "kernel2"]), - LinkerOptions(arch=ARCH, variables_used=["var1"]), - LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]), - LinkerOptions(arch=ARCH, optimize_unused_variables=True), - LinkerOptions(arch=ARCH, xptxas=["-v"]), - LinkerOptions(arch=ARCH, split_compile=0), - LinkerOptions(arch=ARCH, split_compile_extended=1), - ], -) -def test_linker_init(compile_ptx_functions, options): - linker = Linker(*compile_ptx_functions, options=options) - object_code = linker.link("cubin") - assert isinstance(object_code, ObjectCode) - - -def test_linker_init_invalid_arch(): - options = LinkerOptions(arch=None) - with pytest.raises(TypeError): - Linker(options) - - -@pytest.mark.skipif(culink_backend, 
reason="culink does not support ptx option") -def test_linker_link_ptx(compile_ltoir_functions): - options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) - linker = Linker(*compile_ltoir_functions, options=options) - linked_code = linker.link("ptx") - assert isinstance(linked_code, ObjectCode) - - -def test_linker_link_cubin(compile_ptx_functions): - options = LinkerOptions(arch=ARCH) - linker = Linker(*compile_ptx_functions, options=options) - linked_code = linker.link("cubin") - assert isinstance(linked_code, ObjectCode) - - -def test_linker_link_invalid_target_type(compile_ptx_functions): - options = LinkerOptions(arch=ARCH) - linker = Linker(*compile_ptx_functions, options=options) - with pytest.raises(ValueError): - linker.link("invalid_target") - - -def test_linker_get_error_log(compile_ptx_functions): - options = LinkerOptions(arch=ARCH) - linker = Linker(*compile_ptx_functions, options=options) - linker.link("cubin") - log = linker.get_error_log() - assert isinstance(log, str) - - -def test_linker_get_info_log(compile_ptx_functions): - options = LinkerOptions(arch=ARCH) - linker = Linker(*compile_ptx_functions, options=options) - linker.link("cubin") - log = linker.get_info_log() - assert isinstance(log, str) +import pytest + +from cuda.core.experimental import Linker, LinkerOptions, Program +from cuda.core.experimental._module import ObjectCode + +ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper +empty_kernel = "__device__ void A() {}" +basic_kernel = "__device__ int B() { return 0; }" +addition_kernel = "__device__ int C(int a, int b) { return a + b; }" + +try: + from cuda.bindings import nvjitlink # noqa F401 + from cuda.bindings._internal import nvjitlink as inner_nvjitlink +except ImportError: + # binding is not available + culink_backend = True +else: + if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: + # binding is available, but nvJitLink is not installed + culink_backend = True + + +@pytest.fixture(scope="function") +def compile_ptx_functions(init_cuda): + object_code_a_ptx = Program(empty_kernel, "c++").compile("ptx") + object_code_b_ptx = Program(basic_kernel, "c++").compile("ptx") + object_code_c_ptx = Program(addition_kernel, "c++").compile("ptx") + + return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx + + +@pytest.fixture(scope="function") +def compile_ltoir_functions(init_cuda): + object_code_a_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_b_ltoir = Program(basic_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_c_ltoir = Program(addition_kernel, "c++").compile("ltoir", options=("-dlto",)) + + return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir + + +culink_options = [ + LinkerOptions(arch=ARCH), + LinkerOptions(arch=ARCH, max_register_count=32), + LinkerOptions(arch=ARCH, verbose=True), + LinkerOptions(arch=ARCH, optimization_level=3), + LinkerOptions(arch=ARCH, debug=True), + LinkerOptions(arch=ARCH, lineinfo=True), + LinkerOptions(arch=ARCH, no_cache=True), +] + + +@pytest.mark.parametrize( + "options", + culink_options + if culink_backend + else culink_options + + [ + LinkerOptions(arch=ARCH, time=True), + LinkerOptions(arch=ARCH, ftz=True), + LinkerOptions(arch=ARCH, prec_div=True), + LinkerOptions(arch=ARCH, prec_sqrt=True), + LinkerOptions(arch=ARCH, fma=True), + LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), + LinkerOptions(arch=ARCH, kernels_used=["kernel1", "kernel2"]), + LinkerOptions(arch=ARCH, 
variables_used=["var1"]), + LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]), + LinkerOptions(arch=ARCH, optimize_unused_variables=True), + LinkerOptions(arch=ARCH, xptxas=["-v"]), + LinkerOptions(arch=ARCH, split_compile=0), + LinkerOptions(arch=ARCH, split_compile_extended=1), + ], +) +def test_linker_init(compile_ptx_functions, options): + linker = Linker(*compile_ptx_functions, options=options) + object_code = linker.link("cubin") + assert isinstance(object_code, ObjectCode) + + +def test_linker_init_invalid_arch(): + options = LinkerOptions(arch=None) + with pytest.raises(TypeError): + Linker(options) + + +@pytest.mark.skipif(culink_backend, reason="culink does not support ptx option") +def test_linker_link_ptx(compile_ltoir_functions): + options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) + linker = Linker(*compile_ltoir_functions, options=options) + linked_code = linker.link("ptx") + assert isinstance(linked_code, ObjectCode) + + +def test_linker_link_cubin(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linked_code = linker.link("cubin") + assert isinstance(linked_code, ObjectCode) + + +def test_linker_link_invalid_target_type(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + with pytest.raises(ValueError): + linker.link("invalid_target") + + +def test_linker_get_error_log(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linker.link("cubin") + log = linker.get_error_log() + assert isinstance(log, str) + + +def test_linker_get_info_log(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, options=options) + linker.link("cubin") + log = linker.get_info_log() + assert isinstance(log, str) From 8ed625615a6bb6cd700b818620ab923c9a72ce38 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 14:59:17 -0800 Subject: [PATCH 053/111] update the test --- cuda_core/cuda/core/experimental/_linker.py | 2 +- cuda_core/docs/source/api.rst | 3 -- cuda_core/tests/test_linker.py | 39 ++++++++++++++------- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 7d95d371..bf232cad 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -309,7 +309,7 @@ def _init_driver(self): raise ValueError("split_compile option is not supported by the driver API") if self.split_compile_extended is not None: raise ValueError("split_compile_extended option is not supported by the driver API") - if self.no_cache is not None: + if self.no_cache is True: self.formatted_options.append(_driver.CUjit_cacheMode.CU_JIT_CACHE_OPTION_NONE) self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index a6a34e40..c3e66b52 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -31,13 +31,11 @@ CUDA compilation toolchain :toctree: generated/ Program -<<<<<<< HEAD Linker :template: dataclass.rst LinkerOptions -======= .. 
module:: cuda.core.experimental.utils @@ -53,4 +51,3 @@ Utility functions :template: dataclass.rst StridedMemoryView ->>>>>>> origin/main diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 15496b59..f5dc33dd 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,10 +4,15 @@ from cuda.core.experimental._module import ObjectCode ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper -empty_kernel = "__device__ void A() {}" -basic_kernel = "__device__ int B() { return 0; }" -addition_kernel = "__device__ int C(int a, int b) { return a + b; }" - +device_function_a = """ +__device__ int B(); +__device__ int C(int a, int b); +__device__ void A() { int result = C(B(), 1);} +""" +device_function_b = "__device__ int B() { return 0; }" +device_function_c = "__device__ int C(int a, int b) { return a + b; }" + +culink_backend = False try: from cuda.bindings import nvjitlink # noqa F401 from cuda.bindings._internal import nvjitlink as inner_nvjitlink @@ -22,18 +27,18 @@ @pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): - object_code_a_ptx = Program(empty_kernel, "c++").compile("ptx") - object_code_b_ptx = Program(basic_kernel, "c++").compile("ptx") - object_code_c_ptx = Program(addition_kernel, "c++").compile("ptx") + object_code_b_ptx = Program(device_function_b, "c++").compile("ptx") + object_code_c_ptx = Program(device_function_c, "c++").compile("ptx") + object_code_a_ptx = Program(device_function_a, "c++").compile("ptx") return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx @pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): - object_code_a_ltoir = Program(empty_kernel, "c++").compile("ltoir", options=("-dlto",)) - object_code_b_ltoir = Program(basic_kernel, "c++").compile("ltoir", options=("-dlto",)) - object_code_c_ltoir = Program(addition_kernel, "c++").compile("ltoir", options=("-dlto",)) + object_code_b_ltoir = Program(device_function_b, "c++").compile("ltoir", options=("-dlto",)) + object_code_c_ltoir = Program(device_function_c, "c++").compile("ltoir", options=("-dlto",)) + object_code_a_ltoir = Program(device_function_a, "c++").compile("ltoir", options=("-dlto",)) return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir @@ -60,8 +65,8 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, prec_div=True), LinkerOptions(arch=ARCH, prec_sqrt=True), LinkerOptions(arch=ARCH, fma=True), - LinkerOptions(arch=ARCH, kernels_used=["kernel1"]), - LinkerOptions(arch=ARCH, kernels_used=["kernel1", "kernel2"]), + LinkerOptions(arch=ARCH, kernels_used=["A"]), + LinkerOptions(arch=ARCH, kernels_used=["C", "B"]), LinkerOptions(arch=ARCH, variables_used=["var1"]), LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]), LinkerOptions(arch=ARCH, optimize_unused_variables=True), @@ -83,13 +88,21 @@ def test_linker_init_invalid_arch(): @pytest.mark.skipif(culink_backend, reason="culink does not support ptx option") -def test_linker_link_ptx(compile_ltoir_functions): +def test_linker_link_ptx_nvjitlink(compile_ltoir_functions): options = LinkerOptions(arch=ARCH, link_time_optimization=True, ptx=True) linker = Linker(*compile_ltoir_functions, options=options) linked_code = linker.link("ptx") assert isinstance(linked_code, ObjectCode) +@pytest.mark.skipif(not culink_backend, reason="nvjitlink requires lto for ptx linking") +def test_linker_link_ptx_culink(compile_ptx_functions): + options = LinkerOptions(arch=ARCH) + linker = Linker(*compile_ptx_functions, 
options=options) + linked_code = linker.link("ptx") + assert isinstance(linked_code, ObjectCode) + + def test_linker_link_cubin(compile_ptx_functions): options = LinkerOptions(arch=ARCH) linker = Linker(*compile_ptx_functions, options=options) From 188ae6223fbb2a8dc567551627483b75230149f8 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 15:01:38 -0800 Subject: [PATCH 054/111] update the test --- cuda_core/tests/test_linker.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index f5dc33dd..b4008ab6 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,6 +4,8 @@ from cuda.core.experimental._module import ObjectCode ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper + + device_function_a = """ __device__ int B(); __device__ int C(int a, int b); From 0522d2b71fb682199dfd801f61501aebc030f82e Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 15:10:51 -0800 Subject: [PATCH 055/111] update the documentation to touch on LinkerOptions vs CUDA version --- cuda_core/docs/source/release/0.1.1-notes.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md index 6e491a62..29694f4a 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.md +++ b/cuda_core/docs/source/release/0.1.1-notes.md @@ -6,9 +6,13 @@ Released on Nov , 2024 - Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete implementation of DLPack & CUDA Array Interface supports. -- Addition of the Linker class which gives object oriented and pythonic access to the nvJitLink API. +- Addition of the Linker class which gives object oriented and pythonic access to the nvJitLink or cuLink API + depending on your CUDA version. ## Limitations - All APIs are currently *experimental* and subject to change without deprecation notice. Please kindly share your feedbacks with us so that we can make `cuda.core` better! +- Some LinkerOptions are only available when using a modern version of CUDA. When using CUDA <12, + the backend is the cuLink api which supports only a subset of the options that nvjitlink does. 
+  Further, some options aren't available on CUDA versions <12.6

From 06b77e1ef0743fe2e585ef186c5d555a77012132 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Thu, 5 Dec 2024 04:20:36 +0000
Subject: [PATCH 056/111] use cpu8 runners for build; remove unnecessary mac
 condition
---
 .github/workflows/gh-build-and-test.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml
index 430fbf5b..ffc6f959 100644
--- a/.github/workflows/gh-build-and-test.yml
+++ b/.github/workflows/gh-build-and-test.yml
@@ -27,9 +27,11 @@ jobs:
     with:
       client-repo: ${{ github.event.repository.name }}
       target-device: ${{ inputs.target-device }}
-      runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu16') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }}
+      runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu8') ||
+                   (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') }}
       build-type: ${{ inputs.build-type }}
-      use-container: ${{ inputs.host-platform == 'linux-x64' || inputs.host-platform == 'linux-aarch64'}}
+      use-container: ${{ inputs.host-platform == 'linux-x64' ||
+                         inputs.host-platform == 'linux-aarch64'}}
       host-platform: ${{ inputs.host-platform }}
       dependencies-file: ""
       build-mode: ${{ inputs.build-mode }}

From 61be96cfce5c64ad0ad96e53fc3bb6e474c136d9 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Thu, 5 Dec 2024 04:23:16 +0000
Subject: [PATCH 057/111] always require manual CI triggering
---
 .github/copy-pr-bot.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
index 895ba83e..2771228b 100644
--- a/.github/copy-pr-bot.yaml
+++ b/.github/copy-pr-bot.yaml
@@ -2,3 +2,6 @@
 # https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
 
 enabled: true
+# always require manual CI triggering, ignoring signed commits
+auto_sync_draft: false
+auto_sync_ready: false

From df8cbea9727a7438b9013d2c695a8d6a21560537 Mon Sep 17 00:00:00 2001
From: ksimpson
Date: Thu, 5 Dec 2024 10:38:26 -0800
Subject: [PATCH 058/111] use rdc for nvrtc compilation and improve exception
 reporting by dumping the log
---
 cuda_core/cuda/core/experimental/_linker.py | 45 +++++++++++++++++----
 cuda_core/tests/test_linker.py              | 12 +++---
 2 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
index bf232cad..b6c28ba5 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -64,7 +64,10 @@ def _lazy_init():
 
 @dataclass
 class LinkerOptions:
-    """Customizable :obj:`LinkerOptions` for nvJitLink.
+    """Customizable :obj:`LinkerOptions` for nvJitLink or driver API. Some options are only available
+    when using the cuda.bindings.nvjitlink backend. Some options are only available when using newer
+    or older versions of CUDA.
+
 
     Attributes
     ----------
@@ -350,11 +353,16 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None):
         self._options = options = check_or_create_options(LinkerOptions, options, "Linker options")
 
         if _nvjitlink:
-            handle = _nvjitlink.create(len(options.formatted_options), options.formatted_options)
+            handle = self._exception_manager(
+                lambda: _nvjitlink.create(len(options.formatted_options), options.formatted_options)
+            )
+
             use_nvjitlink = True
         else:
-            handle = handle_return(
-                _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options)
+            handle = self._exception_manager(
+                lambda: handle_return(
+                    _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options)
+                )
             )
             use_nvjitlink = False
         self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink)
@@ -363,6 +371,27 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None):
             assert isinstance(code, ObjectCode)
             self._add_code_object(code)
 
+    def _exception_manager(self, action):
+        """
+        Helper function to improve the error message of exceptions raised by the linker backend.
+
+        Parameters
+        ----------
+        action : callable
+            The action to be performed.
+
+        Returns
+        -------
+        The return value of the action.
+        """
+        try:
+            return action()
+        except Exception as e:
+            error = self.get_error_log()
+            raise RuntimeError(
+                f"Exception raised by {'nvjitlink' if _nvjitlink else 'cuLink'}: {e}.\nLinker error log: {error}"
+            ) from e
+
     def _add_code_object(self, object_code: ObjectCode):
         data = object_code._module
         assert isinstance(data, bytes)
@@ -392,7 +421,7 @@ def link(self, target_type) -> ObjectCode:
         if target_type not in ("cubin", "ptx"):
             raise ValueError(f"Unsupported target type: {target_type}")
         if _nvjitlink:
-            _nvjitlink.complete(self._mnff.handle)
+            self._exception_manager(lambda: _nvjitlink.complete(self._mnff.handle))
             if target_type == "cubin":
                 get_size = _nvjitlink.get_linked_cubin_size
                 get_code = _nvjitlink.get_linked_cubin
@@ -400,11 +429,11 @@ def link(self, target_type) -> ObjectCode:
             get_size = _nvjitlink.get_linked_ptx_size
             get_code = _nvjitlink.get_linked_ptx
 
-        size = get_size(self._mnff.handle)
+        size = self._exception_manager(lambda: get_size(self._mnff.handle))
         code = bytearray(size)
-        get_code(self._mnff.handle, code)
+        self._exception_manager(lambda: get_code(self._mnff.handle, code))
         else:
-            addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle))
+            addr, size = self._exception_manager(lambda: handle_return(_driver.cuLinkComplete(self._mnff.handle)))
             code = (ctypes.c_char * size).from_address(addr)
 
         return ObjectCode(bytes(code), target_type)
diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py
index b4008ab6..6163d9a8 100644
--- a/cuda_core/tests/test_linker.py
+++ b/cuda_core/tests/test_linker.py
@@ -9,7 +9,7 @@
 device_function_a = """
 __device__ int B();
 __device__ int C(int a, int b);
-__device__ void A() { int result = C(B(), 1);}
+__global__ void A() { int result = C(B(), 1);}
 """
 device_function_b = "__device__ int B() { return 0; }"
 device_function_c = "__device__ int C(int a, int b) { return a + b; }"
@@ -29,9 +29,11 @@
 
 @pytest.fixture(scope="function")
 def compile_ptx_functions(init_cuda):
-    object_code_b_ptx = Program(device_function_b, "c++").compile("ptx")
-    object_code_c_ptx = Program(device_function_c, "c++").compile("ptx")
-    object_code_a_ptx = Program(device_function_a, "c++").compile("ptx")
+    # Without rdc (relocatable device code) option,
the generated ptx will not include any unreferenced
+    # device functions, causing the link to fail
+    object_code_b_ptx = Program(device_function_b, "c++").compile("ptx", options=("-rdc=true",))
+    object_code_c_ptx = Program(device_function_c, "c++").compile("ptx", options=("-rdc=true",))
+    object_code_a_ptx = Program(device_function_a, "c++").compile("ptx", options=("-rdc=true",))
 
     return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx
 
@@ -46,7 +48,7 @@ def compile_ltoir_functions(init_cuda):
 
 culink_options = [
-    LinkerOptions(arch=ARCH),
+    LinkerOptions(arch=ARCH, verbose=True),
     LinkerOptions(arch=ARCH, max_register_count=32),
     LinkerOptions(arch=ARCH, verbose=True),
     LinkerOptions(arch=ARCH, optimization_level=3),

From 761bea0b83252519d986d8111e88cad5edaebfcf Mon Sep 17 00:00:00 2001
From: ksimpson
Date: Thu, 5 Dec 2024 10:52:32 -0800
Subject: [PATCH 059/111] add note to link()
---
 cuda_core/cuda/core/experimental/_linker.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
index b6c28ba5..01c4a0e9 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -418,6 +418,24 @@ def _add_code_object(self, object_code: ObjectCode):
         )
 
     def link(self, target_type) -> ObjectCode:
+        """
+        Links the provided object codes into a single output of the specified target type.
+
+        Parameters
+        ----------
+        target_type : str
+            The type of the target output. Must be either "cubin" or "ptx".
+
+        Returns
+        -------
+        ObjectCode
+            The linked object code of the specified target type.
+
+        Note
+        ------
+        See nvrtc compiler options documentation to ensure the input ObjectCodes are
+        correctly compiled for linking.
+        """
         if target_type not in ("cubin", "ptx"):
             raise ValueError(f"Unsupported target type: {target_type}")
         if _nvjitlink:

From b6d73c8ef4efb9ceb97c52f73cc4fb0a60d910c6 Mon Sep 17 00:00:00 2001
From: Keenan Simpson
Date: Fri, 6 Dec 2024 11:13:14 -0800
Subject: [PATCH 060/111] Update cuda_core/cuda/core/experimental/_memory.py

Co-authored-by: Leo Fang
---
 cuda_core/cuda/core/experimental/_memory.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py
index 5ff00ba2..12fafb39 100644
--- a/cuda_core/cuda/core/experimental/_memory.py
+++ b/cuda_core/cuda/core/experimental/_memory.py
@@ -307,6 +307,9 @@ def allocate(self, size, stream=None) -> Buffer:
         return Buffer(ptr, size, self)
 
     def deallocate(self, ptr, size, stream=None):
+        if stream is None:
+            stream = default_stream()
+        stream.sync()
         handle_return(cuda.cuMemFree(ptr))
 
     @property

From 1c86afa7bc49a33117f6a0c35c1b9fdb951f3943 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 6 Dec 2024 16:43:23 -0500
Subject: [PATCH 061/111] Full CI support for public builds + switch to use
 cibuildwheel (#267)

* switch to cibuildwheel + some cleanups
* try setting up Python manually
* comment out un-needed code + propagate python-version
* fix: need to check out first
* only build natively; add -v; ensure targeting manylinux
* single quotes
* restore env setup to get artifact dir
* fix artifact dir
* fix artifact name
* restore & fix artifact name again
* build on all platforms!
* fix CIBW_BUILD for windows
* fix typo
* no quotes for wildcard matching
* move CIBW_BUILD logic to script
* fix win runner name
* try to find where pwsh is
* try to escape
* continue hunting..
* try to overwrite shell * try to install ps * be explicit about shell (why?) * only build for win 64 bits * try to install msvc * install msvc ourselves * fix typo * skip custom cl ver check * install to standard location * try to locate Python include path * switch to public windows runner for now * windows image does not have sudo * pwd on Windows Bash does not use Windows path format * cover all Python versions! * add quotes * align the python version installed via GHA vs used at build time * fix constraint syntax * check if setup-python is causing interference * fix typo * apply a WAR on Linux * fix unbound var * detect Python path after it's installed (by CIBW) * try CIBW_BEFORE_ALL_LINUX * try to restore the pre-py-span setup... * reduce build matrix to experiment with cuda.bindings builds * fix parenthesis * use abs path * defer CUDA_PATH setting * use CIBW_ENVIRONMENT to pass env var * fetch cuda-profiler-api headers * only rely on redist * allow wheel repair to fix the triplet tags * restore full build matrix! * fix wget on Windows; pass PARALLEL_LEVEL to CIBW * switch from wget to curl * windows needs unzip not tar * mv -> rsync * git bash has no wget or rsync... * ensure win-style path on win * convert abs path * debug * another level down * check if it is a race condition on win * clean up unused (but still relevant) code * clean up unused (but still relevant) code - cont'd * consolidate with PYTHON_VERSION_FORMATTED --- .github/actions/build/action.yml | 108 ++++++++++++----------- .github/actions/setup/action.yml | 112 +++++++++++++++++++----- .github/workflows/ci-gh.yml | 13 ++- .github/workflows/gh-build-and-test.yml | 21 +++-- .github/workflows/gh-build.yml | 13 ++- 5 files changed, 178 insertions(+), 89 deletions(-) diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index 952fb9cd..e1552ae8 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -13,73 +13,79 @@ inputs: host-platform: required: true type: string - use-container: - required: true - type: boolean - docker-image: - type: string - required: true upload-enabled: required: true type: boolean - python-version: - required: true - type: string runs: using: composite steps: + - name: Build cuda.core wheel + uses: pypa/cibuildwheel@v2.22.0 + env: + CIBW_BUILD: ${{ env.CIBW_BUILD }} + CIBW_ARCHS_LINUX: "native" + CIBW_BUILD_VERBOSITY: 1 + # # ensure Python.h & co can be found + # CIBW_BEFORE_BUILD_WINDOWS: > + # python -c "import sysconfig; print(sysconfig.get_path('include'))" >> $env:INCLUDE + with: + package-dir: ./cuda_core/ + output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - - if: ${{ inputs.use-container }} - name: Build (in container) - shell: bash --noprofile --norc -xeuo pipefail {0} - run: | - - docker run \ - -e AWS_REGION \ - -e AWS_SESSION_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e GITHUB_TOKEN \ - -e BINDINGS_ARTIFACTS_DIR="$BINDINGS_ARTIFACTS_DIR" \ - -e CORE_ARTIFACTS_DIR="$CORE_ARTIFACTS_DIR" \ - -e UPLOAD_ENABLED="$UPLOAD_ENABLED" \ - -e USE_CUDA="$USE_CUDA" \ - -e REPO_DIR="$REPO_DIR" \ - -e LEGATE_CORE_BUILD_MODE="$LEGATE_CORE_BUILD_MODE" \ - -e PYTHON_VERSION="$PYTHON_VERSION" \ - -v "${{ env.REPO_DIR }}:${{ env.REPO_DIR }}" \ - -v "${{ env.BINDINGS_ARTIFACTS_DIR }}:${{ env.BINDINGS_ARTIFACTS_DIR }}" \ - -v "${{ env.CORE_ARTIFACTS_DIR }}:${{ env.CORE_ARTIFACTS_DIR }}" \ - --rm "${{ inputs.docker-image }}" \ - /bin/bash -c "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint ${{ env.REPO_DIR 
}}/continuous_integration/scripts/build ${{ inputs.build-type}} ${{ inputs.target-device }}" - - - if: ${{ !inputs.use-container }} - name: Build (without container) - shell: bash --noprofile --norc -xeuo pipefail {0} - run: | - "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint" "${{ env.REPO_DIR }}/continuous_integration/scripts/build" "${{ inputs.build-type}}" "${{ inputs.target-device }}" - - - name: Display structure of the bindings artifacts folder (post build) + - name: List the cuda.core artifacts directory shell: bash --noprofile --norc -xeuo pipefail {0} run: | - sudo chown -R $(whoami) ${{ env.BINDINGS_ARTIFACTS_DIR }} - ls -lahR ${{ env.BINDINGS_ARTIFACTS_DIR }} + if [[ "${{ inputs.host-platform }}" == win* ]]; then + export CHOWN=chown + else + export CHOWN="sudo chown" + fi + $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - - name: Upload bindings build artifacts + - name: Upload cuda.core build artifacts uses: actions/upload-artifact@v4 with: - name: ${{ env.BINDINGS_ARTIFACT_NAME }} - path: ${{ env.BINDINGS_ARTIFACTS_DIR }} + name: ${{ env.CUDA_CORE_ARTIFACT_NAME }} + path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl + if-no-files-found: error + overwrite: 'true' + + - name: Build cuda.bindings wheel + uses: pypa/cibuildwheel@v2.22.0 + env: + CIBW_BUILD: ${{ env.CIBW_BUILD }} + CIBW_ARCHS_LINUX: "native" + CIBW_BUILD_VERBOSITY: 1 + CIBW_ENVIRONMENT_LINUX: > + CUDA_PATH="$(realpath ./cuda_toolkit)" + PARALLEL_LEVEL=${{ env.PARALLEL_LEVEL }} + CIBW_ENVIRONMENT_WINDOWS: > + CUDA_HOME="$(cygpath -w $(realpath ./cuda_toolkit))" + # PARALLEL_LEVEL=${{ env.PARALLEL_LEVEL }} + # # ensure Python.h & co can be found + # CIBW_BEFORE_BUILD_WINDOWS: > + # python -c "import sysconfig; print(sysconfig.get_path('include'))" >> $env:INCLUDE + with: + package-dir: ./cuda_bindings/ + output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} - - name: Display structure of the core artifacts folder (post build) + - name: List the cuda.bindings artifacts directory shell: bash --noprofile --norc -xeuo pipefail {0} run: | - sudo chown -R $(whoami) ${{ env.CORE_ARTIFACTS_DIR }} - ls -lahR ${{ env.CORE_ARTIFACTS_DIR }} + if [[ "${{ inputs.host-platform }}" == win* ]]; then + export CHOWN=chown + else + export CHOWN="sudo chown" + fi + $CHOWN -R $(whoami) ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} + ls -lahR ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} - - name: Upload core build artifacts + - name: Upload cuda.bindings build artifacts uses: actions/upload-artifact@v4 with: - name: ${{ env.CORE_ARTIFACT_NAME }} - path: ${{ env.CORE_ARTIFACTS_DIR }} + name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }} + path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl + if-no-files-found: error + overwrite: 'true' diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index c2a8407c..e00cf27f 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -22,34 +22,106 @@ inputs: python-version: required: true type: string + cuda-version: + required: true + type: string runs: using: composite steps: - - name: Set REPO_DIR and Dump environment + # WAR: setup-python is not relocatable... 
+ # see https://github.com/actions/setup-python/issues/871 + - name: Set up Python ${{ inputs.python-version }} + if: ${{ startsWith(inputs.host-platform, 'linux') }} + id: setup-python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Set up MSVC + if: ${{ startsWith(inputs.host-platform, 'win') }} + uses: ilammy/msvc-dev-cmd@v1 + + - name: Dump environment shell: bash --noprofile --norc -xeuo pipefail {0} run: | - echo "REPO_DIR=$(pwd)" >> $GITHUB_ENV env - - name: Set environment variables + - name: Get CUDA components shell: bash --noprofile --norc -xeuo pipefail {0} run: | + CUDA_PATH="./cuda_toolkit" + mkdir $CUDA_PATH - WITH_TESTS_STR='' - if [[ ("${{ inputs.upload-enabled }}" == "false") && ("${{ inputs.build-type }}" != "ci") ]]; then - WITH_TESTS_STR='-with_tests' + # The binary archives (redist) are guaranteed to be updated as part of the release posting. + CTK_BASE_URL="https://developer.download.nvidia.com/compute/cuda/redist/" + CTK_JSON_URL="$CTK_BASE_URL/redistrib_${{ inputs.cuda-version }}.json" + if [[ "${{ inputs.host-platform }}" == linux* ]]; then + if [[ "${{ inputs.host-platform }}" == "linux-x64" ]]; then + CTK_SUBDIR="linux-x86_64" + elif [[ "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then + CTK_SUBDIR="linux-sbsa" + fi + function extract() { + tar -xvf $1 -C $CUDA_PATH --strip-components=1 + } + elif [[ "${{ inputs.host-platform }}" == "win-x64" ]]; then + CTK_SUBDIR="windows-x86_64" + function extract() { + _TEMP_DIR_=$(mktemp -d) + unzip $1 -d $_TEMP_DIR_ + cp -r $_TEMP_DIR_/*/* $CUDA_PATH + rm -rf $_TEMP_DIR_ + } fi + function populate_cuda_path() { + # take the component name as a argument + function download() { + curl -kLSs $1 -o $2 + } + CTK_COMPONENT=$1 + CTK_COMPONENT_REL_PATH="$(curl -s $CTK_JSON_URL | + python -c "import sys, json; print(json.load(sys.stdin)['${CTK_COMPONENT}']['${CTK_SUBDIR}']['relative_path'])")" + CTK_COMPONENT_URL="${CTK_BASE_URL}/${CTK_COMPONENT_REL_PATH}" + CTK_COMPONENT_COMPONENT_FILENAME="$(basename $CTK_COMPONENT_REL_PATH)" + download $CTK_COMPONENT_URL $CTK_COMPONENT_COMPONENT_FILENAME + extract $CTK_COMPONENT_COMPONENT_FILENAME + rm $CTK_COMPONENT_COMPONENT_FILENAME + } - TARGET_PLATFORM='linux-64' - if [[ "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then + # Get headers and shared libraries in place + populate_cuda_path cuda_nvcc + populate_cuda_path cuda_cudart + populate_cuda_path cuda_nvrtc + populate_cuda_path cuda_profiler_api + ls -l $CUDA_PATH + + # Note: the headers will be copied into the cibuildwheel manylinux container, + # so setting the CUDA_PATH env var here is meaningless. + + - name: Set environment variables + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + # TODO: just align host-platform names with TARGET_PLATFORM... 
+ if [[ "${{ inputs.host-platform }}" == "linux-x64" ]]; then + TARGET_PLATFORM='linux-64' + elif [[ "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then TARGET_PLATFORM='linux-aarch64' + elif [[ "${{ inputs.host-platform }}" == "win-x64" ]]; then + TARGET_PLATFORM='win-64' fi - BUILD_MODE="${{ inputs.build-mode }}" - BUILD_MODE_STR="" - [ -n "${BUILD_MODE}" ] && BUILD_MODE_STR="-${BUILD_MODE}" + PYTHON_VERSION_FORMATTED=$(echo '${{ inputs.python-version }}' | tr -d '.') + if [[ "${{ inputs.host-platform }}" == linux* ]]; then + CIBW_BUILD="cp${PYTHON_VERSION_FORMATTED}-manylinux*" + REPO_DIR=$(pwd) + elif [[ "${{ inputs.host-platform }}" == win* ]]; then + CIBW_BUILD="cp${PYTHON_VERSION_FORMATTED}-win_amd64" + PWD=$(pwd) + REPO_DIR=$(cygpath -w $PWD) + fi + BUILD_MODE="${{ inputs.build-mode }}" if [[ ("${BUILD_MODE}" == "") || ("${BUILD_MODE}" == "release") ]]; then # We upload release versions in the default folder. PKG_DIR="${TARGET_PLATFORM}" @@ -57,16 +129,14 @@ runs: PKG_DIR="${BUILD_MODE}/${TARGET_PLATFORM}" fi - PYTHON_VERSION_FORMATTED=$(echo '${{ inputs.python-version }}' | tr -d '.') - - echo "BINDINGS_ARTIFACT_NAME=${{ inputs.host-platform }}-${{ inputs.build-type }}-cuda_bindings-python${PYTHON_VERSION_FORMATTED}-${{ inputs.target-device }}${BUILD_MODE_STR}${WITH_TESTS_STR}-${{ github.sha }}" >> $GITHUB_ENV - echo "BINDINGS_ARTIFACTS_DIR=$(realpath "$(pwd)/cuda_bindings/dist")" >> $GITHUB_ENV - echo "CORE_ARTIFACT_NAME=${{ inputs.host-platform }}-${{ inputs.build-type }}-cuda_core-python${PYTHON_VERSION_FORMATTED}-${{ inputs.target-device }}${BUILD_MODE_STR}${WITH_TESTS_STR}-${{ github.sha }}" >> $GITHUB_ENV - echo "CORE_ARTIFACTS_DIR=$(realpath "$(pwd)/cuda_core/dist")" >> $GITHUB_ENV - echo "USE_CUDA=${{ (inputs.target-device == 'cpu' && 'OFF') || 'ON' }}" >> $GITHUB_ENV + echo "PARALLEL_LEVEL=$(nproc)" >> $GITHUB_ENV + echo "REPO_DIR=$REPO_DIR" >> $GITHUB_ENV + echo "PKG_DIR=${PKG_DIR}" >> $GITHUB_ENV + echo "CUDA_CORE_ARTIFACT_NAME=cuda-core-python${PYTHON_VERSION_FORMATTED}-${{ inputs.host-platform }}-${{ inputs.build-type }}-${{ github.sha }}" >> $GITHUB_ENV + echo "CUDA_CORE_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_core/dist")" >> $GITHUB_ENV + echo "CUDA_BINDINGS_ARTIFACT_NAME=cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${{ inputs.cuda-version }}-${{ inputs.host-platform }}-${{ inputs.build-type }}-${{ github.sha }}" >> $GITHUB_ENV + echo "CUDA_BINDINGS_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_bindings/dist")" >> $GITHUB_ENV echo "UPLOAD_ENABLED=${{ (inputs.upload-enabled == 'true' && 'ON') || 'OFF' }}" >> $GITHUB_ENV - echo "LEGATE_CORE_BUILD_MODE=${BUILD_MODE}" >> $GITHUB_ENV echo "BUILD_DATE=$(date +%Y%m%d)" >> $GITHUB_ENV echo "TARGET_PLATFORM=${TARGET_PLATFORM}" >> $GITHUB_ENV - echo "PKG_DIR=${PKG_DIR}" >> $GITHUB_ENV - echo "PYTHON_VERSION=${{ inputs.python-version }}" >> $GITHUB_ENV + echo "CIBW_BUILD=${CIBW_BUILD}" >> $GITHUB_ENV diff --git a/.github/workflows/ci-gh.yml b/.github/workflows/ci-gh.yml index d38cb8e3..1975c3b5 100644 --- a/.github/workflows/ci-gh.yml +++ b/.github/workflows/ci-gh.yml @@ -18,6 +18,8 @@ jobs: matrix: host-platform: - linux-x64 + - linux-aarch64 + - win-x64 target-device: - gpu build-mode: @@ -25,8 +27,14 @@ jobs: upload-enabled: - false python-version: - #TODO cover the whole python and cuda matrix - - 3.12 + - "3.12" + - "3.11" + - "3.10" + - "3.9" + cuda-version: + # Note: this is for build-time only; the test-time matrix needs to be + # defined separately. 
+ - "12.6.2" uses: ./.github/workflows/gh-build-and-test.yml with: @@ -36,4 +44,5 @@ jobs: build-type: ci upload-enabled: ${{ matrix.upload-enabled }} python-version: ${{ matrix.python-version }} + cuda-version: ${{ matrix.cuda-version }} secrets: inherit diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index ffc6f959..a9a711d4 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -1,16 +1,16 @@ on: workflow_call: inputs: - host-platform: + target-device: type: string required: true - target-device: + build-type: type: string required: true - build-mode: + host-platform: type: string required: true - build-type: + build-mode: type: string required: true upload-enabled: @@ -19,6 +19,10 @@ on: python-version: type: string required: true + cuda-version: + type: string + required: true + jobs: build: if: ${{ github.repository_owner == 'nvidia' }} @@ -28,13 +32,14 @@ jobs: client-repo: ${{ github.event.repository.name }} target-device: ${{ inputs.target-device }} runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu8') || - (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') }} + (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || + (inputs.host-platform == 'win-x64' && 'windows-2019') }} + # (inputs.host-platform == 'win-x64' && 'windows-amd64-cpu8') }} build-type: ${{ inputs.build-type }} - use-container: ${{ inputs.host-platform == 'linux-x64' || - inputs.host-platform == 'linux-aarch64'}} host-platform: ${{ inputs.host-platform }} - dependencies-file: "" build-mode: ${{ inputs.build-mode }} upload-enabled: ${{ inputs.upload-enabled }} python-version: ${{ inputs.python-version }} + cuda-version: ${{ inputs.cuda-version }} + dependencies-file: "" secrets: inherit diff --git a/.github/workflows/gh-build.yml b/.github/workflows/gh-build.yml index c60e0c2a..7a9f03ce 100644 --- a/.github/workflows/gh-build.yml +++ b/.github/workflows/gh-build.yml @@ -16,9 +16,6 @@ on: required: true type: string description: One of ci / release - use-container: - required: true - type: boolean host-platform: required: true type: string @@ -35,10 +32,13 @@ on: python-version: required: true type: string + cuda-version: + required: true + type: string jobs: build: - name: Build (${{ inputs.host-platform }}, ${{ inputs.target-device }}, ${{ inputs.build-type }}, CMake build-mode=${{ inputs.build-mode }}, Python "${{ inputs.python-version }}", Use container=${{ inputs.use-container }} ) + name: Build (${{ inputs.host-platform }}, ${{ inputs.build-type }}, ${{ inputs.build-mode }}, Python "${{ inputs.python-version }}") permissions: id-token: write # This is required for configure-aws-credentials @@ -52,7 +52,7 @@ jobs: with: fetch-depth: 0 - - name: Setup + - name: Set up build environment uses: ./.github/actions/setup with: client-repo: ${{ inputs.client-repo }} @@ -62,6 +62,7 @@ jobs: build-mode: ${{ inputs.build-mode }} upload-enabled: ${{ inputs.upload-enabled }} python-version: ${{ inputs.python-version }} + cuda-version: ${{ inputs.cuda-version }} - name: Call build action uses: ./.github/actions/build @@ -69,6 +70,4 @@ jobs: build-type: ${{ inputs.build-type }} target-device: "${{ inputs.target-device }}" host-platform: ${{ inputs.host-platform }} - use-container: ${{ inputs.use-container }} - docker-image: "condaforge/miniforge3:latest" upload-enabled: ${{ inputs.upload-enabled }} From 8118f68bb6753e8f6faeb71350244138d04685a5 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: 
Fri, 6 Dec 2024 06:15:50 +0000 Subject: [PATCH 062/111] add PY313 build pipelines --- .github/workflows/ci-gh.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci-gh.yml b/.github/workflows/ci-gh.yml index 1975c3b5..0b965a52 100644 --- a/.github/workflows/ci-gh.yml +++ b/.github/workflows/ci-gh.yml @@ -27,6 +27,7 @@ jobs: upload-enabled: - false python-version: + - "3.13" - "3.12" - "3.11" - "3.10" From 9fdbc9fe49cd9eb6a84fdf78326f5db7ea584969 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 6 Dec 2024 15:30:04 -0800 Subject: [PATCH 063/111] remove duplicate test --- cuda_core/tests/test_linker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 6163d9a8..a9b5d1c2 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -50,7 +50,6 @@ def compile_ltoir_functions(init_cuda): culink_options = [ LinkerOptions(arch=ARCH, verbose=True), LinkerOptions(arch=ARCH, max_register_count=32), - LinkerOptions(arch=ARCH, verbose=True), LinkerOptions(arch=ARCH, optimization_level=3), LinkerOptions(arch=ARCH, debug=True), LinkerOptions(arch=ARCH, lineinfo=True), From 677bd6df015834ad5fed6d2cd075623c1935229b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 02:38:20 +0000 Subject: [PATCH 064/111] reuse backend decision logic in tests + some nitpicks --- cuda_core/cuda/core/experimental/_linker.py | 36 ++++++++++++++------- cuda_core/tests/test_linker.py | 27 +++++----------- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 01c4a0e9..8cd603d1 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -20,29 +20,43 @@ _nvjitlink_input_types = None # populated if nvJitLink cannot be used -def _lazy_init(): - global _inited - if _inited: +# Note: this function is reused in the tests +def _decide_nvjitlink_or_driver(): + """Returns True if falling back to the cuLink* driver APIs.""" + global _driver_ver, _driver, _nvjitlink + if _driver or _nvjitlink: return - global _driver, _driver_input_types, _driver_ver, _nvjitlink, _nvjitlink_input_types _driver_ver = handle_return(cuda.cuDriverGetVersion()) _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10) try: - from cuda.bindings import nvjitlink + from cuda.bindings import nvjitlink as _nvjitlink from cuda.bindings._internal import nvjitlink as inner_nvjitlink except ImportError: # binding is not available - nvjitlink = None + _nvjitlink = None else: if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: # binding is available, but nvJitLink is not installed - nvjitlink = None - elif _driver_ver > nvjitlink.version(): + _nvjitlink = None + + if _nvjitlink is None: + _driver = cuda + return True + else: + return False + + +def _lazy_init(): + global _inited, _nvjitlink_input_types, _driver_input_types + if _inited: + return + + _decide_nvjitlink_or_driver() + if _nvjitlink: + if _driver_ver > _nvjitlink.version(): # TODO: nvJitLink is not new enough, warn? 
pass - if nvjitlink: - _nvjitlink = nvjitlink _nvjitlink_input_types = { "ptx": _nvjitlink.InputType.PTX, "cubin": _nvjitlink.InputType.CUBIN, @@ -51,8 +65,6 @@ def _lazy_init(): "object": _nvjitlink.InputType.OBJECT, } else: - from cuda import cuda as _driver - _driver_input_types = { "ptx": _driver.CUjitInputType.CU_JIT_INPUT_PTX, "cubin": _driver.CUjitInputType.CU_JIT_INPUT_CUBIN, diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index a9b5d1c2..1af746f8 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -1,48 +1,37 @@ import pytest -from cuda.core.experimental import Linker, LinkerOptions, Program +from cuda.core.experimental import Linker, LinkerOptions, Program, _linker from cuda.core.experimental._module import ObjectCode ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper - -device_function_a = """ -__device__ int B(); -__device__ int C(int a, int b); +kernel_a = """ +extern __device__ int B(); +extern __device__ int C(int a, int b); __global__ void A() { int result = C(B(), 1);} """ device_function_b = "__device__ int B() { return 0; }" device_function_c = "__device__ int C(int a, int b) { return a + b; }" -culink_backend = False -try: - from cuda.bindings import nvjitlink # noqa F401 - from cuda.bindings._internal import nvjitlink as inner_nvjitlink -except ImportError: - # binding is not available - culink_backend = True -else: - if inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion") == 0: - # binding is available, but nvJitLink is not installed - culink_backend = True +culink_backend = _linker._decide_nvjitlink_or_driver() @pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): - # Without rdc (relocatable device code) option, the generated ptx will not included any unreferenced + # Without -rdc (relocatable device code) option, the generated ptx will not included any unreferenced # device functions, causing the link to fail + object_code_a_ptx = Program(kernel_a, "c++").compile("ptx", options=("-rdc=true",)) object_code_b_ptx = Program(device_function_b, "c++").compile("ptx", options=("-rdc=true",)) object_code_c_ptx = Program(device_function_c, "c++").compile("ptx", options=("-rdc=true",)) - object_code_a_ptx = Program(device_function_a, "c++").compile("ptx", options=("-rdc=true",)) return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx @pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): + object_code_a_ltoir = Program(kernel_a, "c++").compile("ltoir", options=("-dlto",)) object_code_b_ltoir = Program(device_function_b, "c++").compile("ltoir", options=("-dlto",)) object_code_c_ltoir = Program(device_function_c, "c++").compile("ltoir", options=("-dlto",)) - object_code_a_ltoir = Program(device_function_a, "c++").compile("ltoir", options=("-dlto",)) return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir From 758ae01a7855775b88b91ab7250c34c33225eca8 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 03:24:23 +0000 Subject: [PATCH 065/111] make _exception_manager a ctx mgr --- cuda_core/cuda/core/experimental/_linker.py | 88 ++++++++++----------- 1 file changed, 42 insertions(+), 46 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 8cd603d1..c27ea94d 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -4,6 +4,7 @@ import ctypes import weakref +from contextlib import contextmanager from dataclasses 
import dataclass from typing import List, Optional @@ -329,6 +330,26 @@ def _init_driver(self): self.option_keys.append(_driver.CUjit_option.CU_JIT_CACHE_MODE) +# This needs to be a free function not a method, as it's disallowed by contextmanager. +@contextmanager +def _exception_manager(self): + """ + A helper function to improve the error message of exceptions raised by the linker backend. + """ + try: + yield + except Exception as e: + error_log = "" + if hasattr(self, "_mnff"): + # our constructor could raise, in which case there's no handle available + error_log = self.get_error_log() + # Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but + # unfortunately we are still supporting Python 3.9/3.10... + # Here we rely on both CUDAError and nvJitLinkError have the error string placed in .args[0]. + e.args = (e.args[0] + (f"\nLinker error log: {error_log}" if error_log else ""), *e.args[1:]) + raise e + + class Linker: """ Linker class for managing the linking of object codes with specified options. @@ -364,46 +385,21 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): raise ValueError("At least one ObjectCode object must be provided") self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") - if _nvjitlink: - handle = self._exception_manager( - lambda: _nvjitlink.create(len(options.formatted_options), options.formatted_options) - ) - - use_nvjitlink = True - else: - handle = self._exception_manager( - lambda: handle_return( + with _exception_manager(self): + if _nvjitlink: + handle = _nvjitlink.create(len(options.formatted_options), options.formatted_options) + use_nvjitlink = True + else: + handle = handle_return( _driver.cuLinkCreate(len(options.formatted_options), options.option_keys, options.formatted_options) ) - ) - use_nvjitlink = False + use_nvjitlink = False self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink) for code in object_codes: assert isinstance(code, ObjectCode) self._add_code_object(code) - def _exception_manager(self, action): - """ - Helper function to improve the error message of excepotions raised by the linker backend. - - Parameters - ---------- - action : callable - The action to be performed. - - Returns - ------- - The return value of the action. 
- """ - try: - return action() - except Exception as e: - error = self.get_error_log() - raise RuntimeError( - f"Exception raised by {"nvjitlink" if _nvjitlink else "cuLink"}: {e}.\nLinker error log: {error}" - ) from e - def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) @@ -450,21 +446,21 @@ def link(self, target_type) -> ObjectCode: """ if target_type not in ("cubin", "ptx"): raise ValueError(f"Unsupported target type: {target_type}") - if _nvjitlink: - self._exception_manager(lambda: _nvjitlink.complete(self._mnff.handle)) - if target_type == "cubin": - get_size = _nvjitlink.get_linked_cubin_size - get_code = _nvjitlink.get_linked_cubin + with _exception_manager(self): + if _nvjitlink: + _nvjitlink.complete(self._mnff.handle) + if target_type == "cubin": + get_size = _nvjitlink.get_linked_cubin_size + get_code = _nvjitlink.get_linked_cubin + else: + get_size = _nvjitlink.get_linked_ptx_size + get_code = _nvjitlink.get_linked_ptx + size = get_size(self._mnff.handle) + code = bytearray(size) + get_code(self._mnff.handle, code) else: - get_size = _nvjitlink.get_linked_ptx_size - get_code = _nvjitlink.get_linked_ptx - - size = self._exception_manager(lambda: get_size(self._mnff.handle)) - code = bytearray(size) - self._exception_manager(lambda: get_code(self._mnff.handle, code)) - else: - addr, size = self._exception_manager(lambda: handle_return(_driver.cuLinkComplete(self._mnff.handle))) - code = (ctypes.c_char * size).from_address(addr) + addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) + code = (ctypes.c_char * size).from_address(addr) return ObjectCode(bytes(code), target_type) From 06ee1e28e875e4eab869ce530fd00560ec010a8f Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 03:33:48 +0000 Subject: [PATCH 066/111] also guard the add_data calls with _exception_manager + add missing docstrings --- cuda_core/cuda/core/experimental/_linker.py | 49 ++++++++++++++------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index c27ea94d..a1f93e18 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -403,27 +403,28 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): def _add_code_object(self, object_code: ObjectCode): data = object_code._module assert isinstance(data, bytes) - if _nvjitlink: - _nvjitlink.add_data( - self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), - data, - len(data), - f"{object_code._handle}_{object_code._code_type}", - ) - else: - handle_return( - _driver.cuLinkAddData( + with _exception_manager(self): + if _nvjitlink: + _nvjitlink.add_data( self._mnff.handle, self._input_type_from_code_type(object_code._code_type), data, len(data), - f"{object_code._handle}_{object_code._code_type}".encode(), - 0, - None, - None, + f"{object_code._handle}_{object_code._code_type}", + ) + else: + handle_return( + _driver.cuLinkAddData( + self._mnff.handle, + self._input_type_from_code_type(object_code._code_type), + data, + len(data), + f"{object_code._handle}_{object_code._code_type}".encode(), + 0, + None, + None, + ) ) - ) def link(self, target_type) -> ObjectCode: """ @@ -465,6 +466,12 @@ def link(self, target_type) -> ObjectCode: return ObjectCode(bytes(code), target_type) def get_error_log(self) -> str: + """ Get the error log generated by the linker. 
+ + Returns + ------- + The error log. + """ if _nvjitlink: log_size = _nvjitlink.get_error_log_size(self._mnff.handle) log = bytearray(log_size) @@ -474,6 +481,12 @@ def get_error_log(self) -> str: return log.decode() def get_info_log(self) -> str: + """Get the info log generated by the linker. + + Returns + ------- + The info log. + """ if _nvjitlink: log_size = _nvjitlink.get_info_log_size(self._mnff.handle) log = bytearray(log_size) @@ -492,8 +505,10 @@ def _input_type_from_code_type(self, code_type: str): return input_type @property - def handle(self) -> int: + def handle(self): + """Return the linker handle object.""" return self._mnff.handle def close(self): + """Destroy this linker.""" self._mnff.close() From faf4855b46d363715ae75921364464b5117cd9e4 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 03:36:11 +0000 Subject: [PATCH 067/111] add missing license header --- cuda_core/tests/test_linker.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 1af746f8..54cd8cf4 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import pytest from cuda.core.experimental import Linker, LinkerOptions, Program, _linker From 1c9dea6bfc3cb3112ab065366d1d7832f99478a4 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 04:17:40 +0000 Subject: [PATCH 068/111] improve docs --- cuda_core/cuda/core/experimental/_linker.py | 127 ++++++++----------- cuda_core/docs/source/release/0.1.1-notes.md | 9 +- 2 files changed, 58 insertions(+), 78 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index a1f93e18..09a237a4 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -77,118 +77,92 @@ def _lazy_init(): @dataclass class LinkerOptions: - """Customizable :obj:`LinkerOptions` for nvJitLink or driver API. Some options are only available - whenusing the cuda.bindings.nvjitlink backend. Some options are only available when using newer - or older versions of cuda. + """Customizable :obj:`Linker` options. + Since the linker would choose to use nvJitLink or the driver APIs as the linking backed, + not all options are applicable. Attributes ---------- arch : str - Pass SM architecture value. Can use compute_ value instead if only generating PTX. + Pass the SM architecture value, such as ``-arch=sm_`` (for generating CUBIN) or + ``compute_`` (for generating PTX). This is a required option. - Acceptable value type: str - Maps to: -arch=sm_ max_register_count : int, optional Maximum register count. - Default: None - Acceptable value type: int - Maps to: -maxrregcount= + Maps to: ``-maxrregcount=``. time : bool, optional - Print timing information to InfoLog. - Default: False - Acceptable value type: bool - Maps to: -time + Print timing information to the info log. + Maps to ``-time``. + Default: False. verbose : bool, optional - Print verbose messages to InfoLog. - Default: False - Acceptable value type: bool - Maps to: -verbose + Print verbose messages to the info log. + Maps to ``-verbose``. + Default: False. link_time_optimization : bool, optional Perform link time optimization. - Default: False - Acceptable value type: bool - Maps to: -lto + Maps to: ``-lto``. + Default: False. 
ptx : bool, optional - Emit PTX after linking instead of CUBIN; only supported with -lto. - Default: False - Acceptable value type: bool - Maps to: -ptx + Emit PTX after linking instead of CUBIN; only supported with ``-lto``. + Maps to ``-ptx``. + Default: False. optimization_level : int, optional Set optimization level. Only 0 and 3 are accepted. - Default: None - Acceptable value type: int - Maps to: -O + Maps to ``-O``. debug : bool, optional Generate debug information. - Default: False - Acceptable value type: bool - Maps to: -g + Maps to ``-g`` + Default: False. lineinfo : bool, optional Generate line information. - Default: False - Acceptable value type: bool - Maps to: -lineinfo + Maps to ``-lineinfo``. + Default: False. ftz : bool, optional Flush denormal values to zero. - Default: False - Acceptable value type: bool - Maps to: -ftz= + Maps to ``-ftz=``. + Default: False. prec_div : bool, optional Use precise division. - Default: True - Acceptable value type: bool - Maps to: -prec-div= + Maps to ``-prec-div=``. + Default: True. prec_sqrt : bool, optional Use precise square root. - Default: True - Acceptable value type: bool - Maps to: -prec-sqrt= + Maps to ``-prec-sqrt=``. + Default: True. fma : bool, optional Use fast multiply-add. - Default: True - Acceptable value type: bool - Maps to: -fma= + Maps to ``-fma=``. + Default: True. kernels_used : List[str], optional Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple times. - Default: None - Acceptable value type: list of str - Maps to: -kernels-used= + Maps to ``-kernels-used=``. variables_used : List[str], optional - Pass list of variables that are used; any not in the list can be removed. This option can be specified multiple - times. - Default: None - Acceptable value type: list of str - Maps to: -variables-used= + Pass a list of variables that are used; any not in the list can be removed. + Maps to ``-variables-used=`` optimize_unused_variables : bool, optional Assume that if a variable is not referenced in device code, it can be removed. - Default: False - Acceptable value type: bool - Maps to: -optimize-unused-variables + Maps to: ``-optimize-unused-variables`` + Default: False. xptxas : List[str], optional - Pass options to PTXAS. This option can be called multiple times. - Default: None - Acceptable value type: list of str - Maps to: -Xptxas= + Pass options to PTXAS. + Maps to: ``-Xptxas=``. split_compile : int, optional Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split compilation (default). - Default: 1 - Acceptable value type: int - Maps to: -split-compile= + Maps to ``-split-compile=``. + Default: 1. split_compile_extended : int, optional A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This option can potentially impact performance of the compiled binary. - Default: 1 - Acceptable value type: int - Maps to: -split-compile-extended= + Maps to ``-split-compile-extended=``. + Default: 1. no_cache : bool, optional Do not cache the intermediate steps of nvJitLink. - Default: False - Acceptable value type: bool - Maps to: -no-cache + Maps to ``-no-cache``. + Default: False. """ arch: str @@ -351,8 +325,11 @@ def _exception_manager(self): class Linker: - """ - Linker class for managing the linking of object codes with specified options. 
+ """Represent a linking machinery to link one or multiple object codes into + :obj:`~cuda.core.experimental._module.ObjectCode` with the specified options. + + This object provides a unified interface to multiple underlying + linker libraries (such as nvJitLink or cuLink* from CUDA driver). Parameters ---------- @@ -442,7 +419,7 @@ def link(self, target_type) -> ObjectCode: Note ------ - See nvrtc compiler options documnetation to ensure the input ObjectCodes are + See nvrtc compiler options documnetation to ensure the input object codes are correctly compiled for linking. """ if target_type not in ("cubin", "ptx"): @@ -470,7 +447,8 @@ def get_error_log(self) -> str: Returns ------- - The error log. + str + The error log. """ if _nvjitlink: log_size = _nvjitlink.get_error_log_size(self._mnff.handle) @@ -485,7 +463,8 @@ def get_info_log(self) -> str: Returns ------- - The info log. + str + The info log. """ if _nvjitlink: log_size = _nvjitlink.get_info_log_size(self._mnff.handle) diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md index cd3530b9..34cad7d1 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.md +++ b/cuda_core/docs/source/release/0.1.1-notes.md @@ -1,13 +1,14 @@ # `cuda.core` Release notes -Released on Nov , 2024 +Released on Dec XX, 2024 ## Hightlights - Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete implementation of DLPack & CUDA Array Interface supports. -- Addition of the Linker class which gives object oriented and pythonic access to the nvJitLink or cuLink API - depending on your CUDA version. +- Add `Linker` that can link one or multiple `ObjectCode` instances generated by `Program`s. Under + the hood, it uses either the nvJitLink or cuLink APIs depending on the CUDA version detected + in the current environment. - Support TCC devices with a default synchronous memory resource to avoid the use of memory pools @@ -15,6 +16,6 @@ Released on Nov , 2024 - All APIs are currently *experimental* and subject to change without deprecation notice. Please kindly share your feedbacks with us so that we can make `cuda.core` better! -- Some LinkerOptions are only available when using a modern version of CUDA. When using CUDA <12, +- Some `LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12, the backend is the cuLink api which supports only a subset of the options that nvjitlink does. 
Further, some options aren't available on CUDA versions <12.6 From 1a3f1e64f55b2f0f4840ed547c31efa57c775c97 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 06:22:27 +0000 Subject: [PATCH 069/111] fix docs --- cuda_core/cuda/core/experimental/__init__.py | 6 +- cuda_core/cuda/core/experimental/_linker.py | 2 +- cuda_core/cuda/core/experimental/_system.py | 139 +++++++++---------- cuda_core/docs/source/api.rst | 12 +- cuda_core/docs/source/conf.py | 17 +++ cuda_core/docs/source/release.md | 1 - cuda_core/tests/test_system.py | 71 +++++----- 7 files changed, 138 insertions(+), 110 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 982226c7..15df70bb 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -9,4 +9,8 @@ from cuda.core.experimental._linker import Linker, LinkerOptions from cuda.core.experimental._program import Program from cuda.core.experimental._stream import Stream, StreamOptions -from cuda.core.experimental._system import system +from cuda.core.experimental._system import System + +system = System() +__import__("sys").modules[__spec__.name + ".system"] = system +del System diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 09a237a4..2beeb168 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -443,7 +443,7 @@ def link(self, target_type) -> ObjectCode: return ObjectCode(bytes(code), target_type) def get_error_log(self) -> str: - """ Get the error log generated by the linker. + """Get the error log generated by the linker. Returns ------- diff --git a/cuda_core/cuda/core/experimental/_system.py b/cuda_core/cuda/core/experimental/_system.py index 258f9bcd..31c7af6f 100644 --- a/cuda_core/cuda/core/experimental/_system.py +++ b/cuda_core/cuda/core/experimental/_system.py @@ -1,72 +1,67 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. -# -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -from typing import Tuple - -from cuda import cuda, cudart -from cuda.core.experimental._device import Device -from cuda.core.experimental._utils import handle_return - - -class System: - """ Provide information about the cuda system. - This class is a singleton and should not be instantiated directly. - """ - - _instance = None - - def __new__(cls): - if cls._instance is None: - cls._instance = super().__new__(cls) - return cls._instance - - def __init__(self): - if hasattr(self, '_initialized') and self._initialized: - return - self._initialized = True - - @property - def driver_version(self) -> Tuple[int, int]: - """ - Query the CUDA driver version. - - Returns - ------- - tuple of int - A 2-tuple of (major, minor) version numbers. - """ - version = handle_return(cuda.cuDriverGetVersion()) - major = version // 1000 - minor = (version % 1000) // 10 - return (major, minor) - - @property - def num_devices(self) -> int: - """ - Query the number of available GPUs. - - Returns - ------- - int - The number of available GPU devices. - """ - return handle_return(cudart.cudaGetDeviceCount()) - - @property - def devices(self) -> tuple: - """ - Query the available device instances. - - Returns - ------- - tuple of Device - A tuple containing instances of available devices. 
- """ - total = self.num_devices - return tuple(Device(device_id) for device_id in range(total)) - -system = System() -system.__doc__ = """ -Singleton instance of the :obj:`_system.System` class. -""" +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +from typing import Tuple + +from cuda import cuda, cudart +from cuda.core.experimental._device import Device +from cuda.core.experimental._utils import handle_return + + +class System: + """Provide information about the cuda system. + This class is a singleton and should not be instantiated directly. + """ + + _instance = None + + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __init__(self): + if hasattr(self, "_initialized") and self._initialized: + return + self._initialized = True + + @property + def driver_version(self) -> Tuple[int, int]: + """ + Query the CUDA driver version. + + Returns + ------- + tuple of int + A 2-tuple of (major, minor) version numbers. + """ + version = handle_return(cuda.cuDriverGetVersion()) + major = version // 1000 + minor = (version % 1000) // 10 + return (major, minor) + + @property + def num_devices(self) -> int: + """ + Query the number of available GPUs. + + Returns + ------- + int + The number of available GPU devices. + """ + return handle_return(cudart.cudaGetDeviceCount()) + + @property + def devices(self) -> tuple: + """ + Query the available device instances. + + Returns + ------- + tuple of Device + A tuple containing instances of available devices. + """ + total = self.num_devices + return tuple(Device(device_id) for device_id in range(total)) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index bd63f0f0..4b30c6ef 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -16,7 +16,6 @@ CUDA runtime Device launch - system :template: dataclass.rst @@ -39,6 +38,17 @@ CUDA compilation toolchain LinkerOptions +CUDA system information +----------------------- + +.. autodata:: cuda.core.experimental.system.driver_version + :no-value: +.. autodata:: cuda.core.experimental.system.num_devices + :no-value: +.. autodata:: cuda.core.experimental.system.devices + :no-value: + + .. 
module:: cuda.core.experimental.utils Utility functions diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index 4621e887..3a7afc09 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -91,3 +91,20 @@ napoleon_google_docstring = False napoleon_numpy_docstring = True + + +def autodoc_process_docstring(app, what, name, obj, options, lines): + if name.startswith("cuda.core.experimental.system"): + # patch the docstring (in lines) *in-place* + attr = name.split(".")[-1] + from cuda.core.experimental._system import System + + lines_new = getattr(System, attr).__doc__.split("\n") + n_pops = len(lines) + lines.extend(lines_new) + for _ in range(n_pops): + lines.pop(0) + + +def setup(app): + app.connect("autodoc-process-docstring", autodoc_process_docstring) diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 11accb59..a9e16d6e 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -7,6 +7,5 @@ maxdepth: 3 0.1.1 0.1.0 - 0.1.1 ``` diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py index 893d1206..7a39388f 100644 --- a/cuda_core/tests/test_system.py +++ b/cuda_core/tests/test_system.py @@ -1,34 +1,37 @@ -try: - from cuda.bindings import driver, runtime -except ImportError: - from cuda import cuda as driver - from cuda import cudart as runtime - -from cuda.core.experimental import Device, system -from cuda.core.experimental._utils import handle_return - - -def test_system_singleton(): - system1 = system - system2 = system - assert id(system1) == id(system2), "system is not a singleton" - -def test_driver_version(): - driver_version = system.driver_version - print(driver_version) - version = handle_return(driver.cuDriverGetVersion()) - expected_driver_version = (version // 1000, (version % 1000) // 10) - assert driver_version == expected_driver_version, "Driver version does not match expected value" - -def test_num_devices(): - num_devices = system.num_devices - expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) - assert num_devices == expected_num_devices, "Number of devices does not match expected value" - -def test_devices(): - devices = system.devices - expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) - expected_devices = tuple(Device(device_id) for device_id in range(expected_num_devices)) - assert len(devices) == len(expected_devices), "Number of devices does not match expected value" - for device, expected_device in zip(devices, expected_devices): - assert device.device_id == expected_device.device_id, "Device ID does not match expected value" +try: + from cuda.bindings import driver, runtime +except ImportError: + from cuda import cuda as driver + from cuda import cudart as runtime + +from cuda.core.experimental import Device, system +from cuda.core.experimental._utils import handle_return + + +def test_system_singleton(): + system1 = system + system2 = system + assert id(system1) == id(system2), "system is not a singleton" + + +def test_driver_version(): + driver_version = system.driver_version + print(driver_version) + version = handle_return(driver.cuDriverGetVersion()) + expected_driver_version = (version // 1000, (version % 1000) // 10) + assert driver_version == expected_driver_version, "Driver version does not match expected value" + + +def test_num_devices(): + num_devices = system.num_devices + expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) + assert num_devices == expected_num_devices, 
"Number of devices does not match expected value" + + +def test_devices(): + devices = system.devices + expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) + expected_devices = tuple(Device(device_id) for device_id in range(expected_num_devices)) + assert len(devices) == len(expected_devices), "Number of devices does not match expected value" + for device, expected_device in zip(devices, expected_devices): + assert device.device_id == expected_device.device_id, "Device ID does not match expected value" From aeebaf757808cad15ed9c241e011457a9bfa97e4 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 15:11:53 -0500 Subject: [PATCH 070/111] skip testing on win; remove mac --- .github/workflows/gh-build-and-test.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 913b17fd..2bd7ec97 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -45,7 +45,8 @@ jobs: secrets: inherit test: - if: ${{ github.repository_owner == 'nvidia' }} + if: ${{ (github.repository_owner == 'nvidia') && + !startsWith(inputs.host-platform, 'win) }} needs: - build uses: @@ -54,7 +55,8 @@ jobs: client-repo: ${{ github.event.repository.name }} target-device: ${{ inputs.target-device }} test-options: ${{ inputs.build-type }} - runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') || (inputs.host-platform == 'mac' && 'macos-latest') }} + runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || + (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') }} runner-has-gpu: ${{ inputs.host-platform == 'linux-x64' }} build-type: ${{ inputs.build-type }} host-platform: ${{ inputs.host-platform }} From 76a8822fb5df845bb3a0e15c8407c651dc9a2e89 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 15:15:06 -0500 Subject: [PATCH 071/111] fix typo --- .github/workflows/gh-build-and-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 2bd7ec97..ab32a62a 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -46,7 +46,7 @@ jobs: test: if: ${{ (github.repository_owner == 'nvidia') && - !startsWith(inputs.host-platform, 'win) }} + !startsWith(inputs.host-platform, 'win') }} needs: - build uses: From c23467f24badb0424648b6c9ee89c5913bdd570a Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 15:52:45 -0500 Subject: [PATCH 072/111] skip setup if build stage was called --- .github/workflows/gh-test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/gh-test.yml b/.github/workflows/gh-test.yml index 74f1c520..01eae085 100644 --- a/.github/workflows/gh-test.yml +++ b/.github/workflows/gh-test.yml @@ -39,6 +39,10 @@ on: python-version: required: false type: string + has-built: + required: false + type: boolean + description: whether the built stage was launched (and passed) jobs: build: @@ -63,6 +67,7 @@ jobs: fetch-depth: 0 - name: Setup + if: ${{ !inputs.has-built }} uses: ./.github/actions/setup with: client-repo: ${{ inputs.client-repo }} From 3d892ad9546187edf8cbe23386e1cdf1ac16ac48 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 16:05:25 -0500 Subject: [PATCH 073/111] set build output --- 
.github/actions/build/action.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index e1552ae8..48c4e50a 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -16,6 +16,10 @@ inputs: upload-enabled: required: true type: boolean +outputs: + has-built: + value: true # TODO: we might need to check the job success here + description: whether the built stage was launched (and passed) runs: using: composite From ae0a994416edb6d63f58fce1c2e41ac32a6c44ea Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 16:09:35 -0500 Subject: [PATCH 074/111] pass output from build to test --- .github/workflows/gh-build-and-test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index ab32a62a..f9606ba7 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -64,4 +64,5 @@ jobs: build-mode: ${{ inputs.build-mode }} upload-enabled: ${{ inputs.upload-enabled }} python-version: ${{ inputs.python-version }} + has-built: ${{ needs.build.outputs.has-built }} secrets: inherit From c3fe6a14881d4bc1b968e1394ca6e05ca4009380 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 16:25:03 -0500 Subject: [PATCH 075/111] wrong place --- .github/actions/build/action.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index 48c4e50a..e1552ae8 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -16,10 +16,6 @@ inputs: upload-enabled: required: true type: boolean -outputs: - has-built: - value: true # TODO: we might need to check the job success here - description: whether the built stage was launched (and passed) runs: using: composite From 0bbc706c5fe61e59bda8bfd0091fd29484203b38 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 16:26:18 -0500 Subject: [PATCH 076/111] it's the build workflow, not action, that should have outputs --- .github/workflows/gh-build.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/gh-build.yml b/.github/workflows/gh-build.yml index 7a9f03ce..c83fb00a 100644 --- a/.github/workflows/gh-build.yml +++ b/.github/workflows/gh-build.yml @@ -35,6 +35,10 @@ on: cuda-version: required: true type: string + outputs: + has-built: + value: true # TODO: we might need to check the job success here + description: whether the built stage was launched (and passed) jobs: build: From ba0bbdedcdaed83b182af9f62489cd84226e1f5c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 16:28:47 -0500 Subject: [PATCH 077/111] fix indentation --- .github/workflows/gh-build.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gh-build.yml b/.github/workflows/gh-build.yml index c83fb00a..86b77516 100644 --- a/.github/workflows/gh-build.yml +++ b/.github/workflows/gh-build.yml @@ -35,10 +35,10 @@ on: cuda-version: required: true type: string - outputs: - has-built: - value: true # TODO: we might need to check the job success here - description: whether the built stage was launched (and passed) + outputs: + has-built: + value: true # TODO: we might need to check the job success here + description: whether the built stage was launched (and passed) jobs: build: From aed5bb6b1aa04c15633d6f68c841ee98b0294162 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 18:02:09 -0500 Subject: 
[PATCH 078/111] try to take output as a string --- .github/workflows/gh-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gh-test.yml b/.github/workflows/gh-test.yml index 01eae085..b0b94514 100644 --- a/.github/workflows/gh-test.yml +++ b/.github/workflows/gh-test.yml @@ -41,7 +41,7 @@ on: type: string has-built: required: false - type: boolean + type: string description: whether the built stage was launched (and passed) jobs: @@ -67,7 +67,7 @@ jobs: fetch-depth: 0 - name: Setup - if: ${{ !inputs.has-built }} + if: ${{ inputs.has-built == 'true' }} uses: ./.github/actions/setup with: client-repo: ${{ inputs.client-repo }} From 330251d90c003df945ac4725df1cedb6b3f257e2 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 18:12:52 -0500 Subject: [PATCH 079/111] fix logic --- .github/workflows/gh-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-test.yml b/.github/workflows/gh-test.yml index b0b94514..575ccada 100644 --- a/.github/workflows/gh-test.yml +++ b/.github/workflows/gh-test.yml @@ -67,7 +67,7 @@ jobs: fetch-depth: 0 - name: Setup - if: ${{ inputs.has-built == 'true' }} + if: ${{ inputs.has-built != 'true' }} uses: ./.github/actions/setup with: client-repo: ${{ inputs.client-repo }} From 076104bbb52e6c3b06c5eb3ac00cfb8ecf6235c7 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 23:36:56 +0000 Subject: [PATCH 080/111] multiple fixes - fix artifact env vars - runner must have GPUs for testing - shorten workflow names --- .github/actions/test/action.yml | 19 +++++++------------ .github/workflows/ci-gh.yml | 3 +-- .github/workflows/gh-build-and-test.yml | 9 +++++---- .github/workflows/gh-build.yml | 2 +- .github/workflows/gh-test.yml | 7 +------ 5 files changed, 15 insertions(+), 25 deletions(-) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 018db9aa..675263fb 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -6,42 +6,37 @@ inputs: test-options: required: true type: string - runner-has-gpu: - required: true - type: boolean - description: "The runner has GPU(s)." 
runs: using: composite steps: - - if: ${{ inputs.runner-has-gpu == true }} - name: Run nvidia-smi to make sure GPU is working + - name: Run nvidia-smi to make sure GPU is working shell: bash --noprofile --norc -xeuo pipefail {0} run: nvidia-smi - name: Download bindings build artifacts uses: actions/download-artifact@v4 with: - name: ${{ env.BINDINGS_ARTIFACT_NAME }} - path: ${{ env.BINDINGS_ARTIFACTS_DIR }} + name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }} + path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} - name: Display structure of downloaded bindings artifacts shell: bash --noprofile --norc -xeuo pipefail {0} run: | pwd - ls -lahR $BINDINGS_ARTIFACTS_DIR + ls -lahR $CUDA_BINDINGS_ARTIFACTS_DIR - name: Download core build artifacts uses: actions/download-artifact@v4 with: - name: ${{ env.CORE_ARTIFACT_NAME }} - path: ${{ env.CORE_ARTIFACTS_DIR }} + name: ${{ env.CUDA_CORE_ARTIFACT_NAME }} + path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - name: Display structure of downloaded core build artifacts shell: bash --noprofile --norc -xeuo pipefail {0} run: | pwd - ls -lahR $CORE_ARTIFACTS_DIR + ls -lahR $CUDA_CORE_ARTIFACTS_DIR - name: Run test / analysis shell: bash --noprofile --norc -xeuo pipefail {0} diff --git a/.github/workflows/ci-gh.yml b/.github/workflows/ci-gh.yml index 1975c3b5..7c493505 100644 --- a/.github/workflows/ci-gh.yml +++ b/.github/workflows/ci-gh.yml @@ -11,8 +11,7 @@ on: - "main" jobs: - build-and-test: - name: Build and test (${{ matrix.host-platform }}, ${{ matrix.target-device }}, ${{ matrix.build-mode }}) + ci: strategy: fail-fast: false matrix: diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index f9606ba7..0f10fcff 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -25,6 +25,7 @@ on: jobs: build: + name: Build wheels if: ${{ github.repository_owner == 'nvidia' }} uses: ./.github/workflows/gh-build.yml @@ -45,8 +46,10 @@ jobs: secrets: inherit test: + name: Test against wheels + # TODO: enable testing once linux-aarch64 & win-64 GPU runners are up if: ${{ (github.repository_owner == 'nvidia') && - !startsWith(inputs.host-platform, 'win') }} + startsWith(inputs.host-platform, 'linux-x64') }} needs: - build uses: @@ -55,9 +58,7 @@ jobs: client-repo: ${{ github.event.repository.name }} target-device: ${{ inputs.target-device }} test-options: ${{ inputs.build-type }} - runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') || - (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu16') }} - runner-has-gpu: ${{ inputs.host-platform == 'linux-x64' }} + runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') }} build-type: ${{ inputs.build-type }} host-platform: ${{ inputs.host-platform }} dependencies-file: "" diff --git a/.github/workflows/gh-build.yml b/.github/workflows/gh-build.yml index 86b77516..46026ba9 100644 --- a/.github/workflows/gh-build.yml +++ b/.github/workflows/gh-build.yml @@ -42,7 +42,7 @@ on: jobs: build: - name: Build (${{ inputs.host-platform }}, ${{ inputs.build-type }}, ${{ inputs.build-mode }}, Python "${{ inputs.python-version }}") + name: Build (${{ inputs.host-platform }}, Python "${{ inputs.python-version }}") permissions: id-token: write # This is required for configure-aws-credentials diff --git a/.github/workflows/gh-test.yml b/.github/workflows/gh-test.yml index 575ccada..8216ce10 100644 --- a/.github/workflows/gh-test.yml +++ b/.github/workflows/gh-test.yml @@ -15,10 +15,6 @@ on: runs-on: 
required: true type: string - runner-has-gpu: - required: true - type: boolean - description: "The runner has GPU(s)." build-type: required: true type: string @@ -46,7 +42,7 @@ on: jobs: build: - name: Test (${{ inputs.host-platform }}, ${{ inputs.target-device }}, ${{ inputs.build-type }}, CMake build-mode=${{ inputs.build-mode }}, Python "${{ inputs.python-version }}", Use container=${{ inputs.use-container }} ) + name: Test (${{ inputs.host-platform }}, Python "${{ inputs.python-version }}", Use container=${{ inputs.use-container }} ) permissions: id-token: write # This is required for configure-aws-credentials @@ -82,4 +78,3 @@ jobs: uses: ./.github/actions/test with: test-options: ${{ inputs.test-options }} - runner-has-gpu: ${{ inputs.runner-has-gpu }} From 7c6fba04dc68313cca9f84cece8b588db166200a Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 00:41:48 +0000 Subject: [PATCH 081/111] merge build & test workflows to allow passing env vars; further simplify job names --- .github/workflows/ci-gh.yml | 3 +- .github/workflows/gh-build-and-test.yml | 101 ++++++++++++++++-------- .github/workflows/gh-build.yml | 77 ------------------ .github/workflows/gh-test.yml | 80 ------------------- 4 files changed, 69 insertions(+), 192 deletions(-) delete mode 100644 .github/workflows/gh-build.yml delete mode 100644 .github/workflows/gh-test.yml diff --git a/.github/workflows/ci-gh.yml b/.github/workflows/ci-gh.yml index 7c493505..31446beb 100644 --- a/.github/workflows/ci-gh.yml +++ b/.github/workflows/ci-gh.yml @@ -1,5 +1,3 @@ -name: Build and test - concurrency: group: ${{ startsWith(github.ref_name, 'main') && format('unique-{0}', github.run_id) || format('ci-build-and-test-on-{0}-from-{1}', github.event_name, github.ref_name) }} cancel-in-progress: true @@ -12,6 +10,7 @@ on: jobs: ci: + name: "CI" strategy: fail-fast: false matrix: diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 0f10fcff..f7296823 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -25,45 +25,80 @@ on: jobs: build: - name: Build wheels + name: Build (${{ inputs.host-platform }}, Python "${{ inputs.python-version }}") if: ${{ github.repository_owner == 'nvidia' }} - uses: - ./.github/workflows/gh-build.yml - with: - client-repo: ${{ github.event.repository.name }} - target-device: ${{ inputs.target-device }} - runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu8') || - (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || - (inputs.host-platform == 'win-x64' && 'windows-2019') }} - # (inputs.host-platform == 'win-x64' && 'windows-amd64-cpu8') }} - build-type: ${{ inputs.build-type }} - host-platform: ${{ inputs.host-platform }} - build-mode: ${{ inputs.build-mode }} - upload-enabled: ${{ inputs.upload-enabled }} - python-version: ${{ inputs.python-version }} - cuda-version: ${{ inputs.cuda-version }} - dependencies-file: "" + permissions: + id-token: write # This is required for configure-aws-credentials + contents: read # This is required for actions/checkout + runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-cpu8') || + (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || + (inputs.host-platform == 'win-x64' && 'windows-2019') }} + # (inputs.host-platform == 'win-x64' && 'windows-amd64-cpu8') }} secrets: inherit + steps: + - name: Checkout ${{ github.event.repository.name }} + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up build 
environment + uses: ./.github/actions/setup + with: + client-repo: ${{ github.event.repository.name }} + build-type: ${{ inputs.build-type }} + target-device: "${{ inputs.target-device }}" + host-platform: ${{ inputs.host-platform }} + build-mode: ${{ inputs.build-mode }} + upload-enabled: ${{ inputs.upload-enabled }} + python-version: ${{ inputs.python-version }} + cuda-version: ${{ inputs.cuda-version }} + + - name: Call build action + uses: ./.github/actions/build + with: + build-type: ${{ inputs.build-type }} + target-device: "${{ inputs.target-device }}" + host-platform: ${{ inputs.host-platform }} + upload-enabled: ${{ inputs.upload-enabled }} test: - name: Test against wheels + # TODO: improve the name once a separate test matrix is defined + name: Test (CUDA ${{ inputs.cuda-version }}, Use container=${{ inputs.use-container }}) # TODO: enable testing once linux-aarch64 & win-64 GPU runners are up if: ${{ (github.repository_owner == 'nvidia') && startsWith(inputs.host-platform, 'linux-x64') }} + permissions: + id-token: write # This is required for configure-aws-credentials + contents: read # This is required for actions/checkout + runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') }} + secrets: inherit + container: + options: -u root --security-opt seccomp=unconfined --privileged --shm-size 16g + image: condaforge/miniforge3:latest + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} needs: - build - uses: - ./.github/workflows/gh-test.yml - with: - client-repo: ${{ github.event.repository.name }} - target-device: ${{ inputs.target-device }} - test-options: ${{ inputs.build-type }} - runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') }} - build-type: ${{ inputs.build-type }} - host-platform: ${{ inputs.host-platform }} - dependencies-file: "" - build-mode: ${{ inputs.build-mode }} - upload-enabled: ${{ inputs.upload-enabled }} - python-version: ${{ inputs.python-version }} - has-built: ${{ needs.build.outputs.has-built }} - secrets: inherit + steps: + - name: Checkout ${{ github.event.repository.name }} + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + # TODO: we probably don't need this? 
+ # - name: Setup + # if: ${{ inputs.has-built != 'true' }} + # uses: ./.github/actions/setup + # with: + # client-repo: ${{ github.event.repository.name }} + # build-type: ${{ inputs.build-type }} + # target-device: "${{ inputs.target-device }}" + # host-platform: ${{ inputs.host-platform }} + # build-mode: ${{ inputs.build-mode }} + # upload-enabled: ${{ inputs.upload-enabled }} + # python-version: ${{ inputs.python-version }} + + - name: Call test action + uses: ./.github/actions/test + with: + test-options: ${{ inputs.build-type }} diff --git a/.github/workflows/gh-build.yml b/.github/workflows/gh-build.yml deleted file mode 100644 index 46026ba9..00000000 --- a/.github/workflows/gh-build.yml +++ /dev/null @@ -1,77 +0,0 @@ -name: Build - -on: - workflow_call: - inputs: - client-repo: - required: true - type: string - target-device: - required: true - type: string - runs-on: - required: true - type: string - build-type: - required: true - type: string - description: One of ci / release - host-platform: - required: true - type: string - dependencies-file: - required: true - type: string - description: path to versions.json relative to the target repo dir - build-mode: - required: true - type: string - upload-enabled: - required: true - type: boolean - python-version: - required: true - type: string - cuda-version: - required: true - type: string - outputs: - has-built: - value: true # TODO: we might need to check the job success here - description: whether the built stage was launched (and passed) - -jobs: - build: - name: Build (${{ inputs.host-platform }}, Python "${{ inputs.python-version }}") - - permissions: - id-token: write # This is required for configure-aws-credentials - contents: read # This is required for actions/checkout - - runs-on: ${{ inputs.runs-on }} - - steps: - - name: Checkout ${{ inputs.client-repo }} - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Set up build environment - uses: ./.github/actions/setup - with: - client-repo: ${{ inputs.client-repo }} - build-type: ${{ inputs.build-type }} - target-device: "${{ inputs.target-device }}" - host-platform: ${{ inputs.host-platform }} - build-mode: ${{ inputs.build-mode }} - upload-enabled: ${{ inputs.upload-enabled }} - python-version: ${{ inputs.python-version }} - cuda-version: ${{ inputs.cuda-version }} - - - name: Call build action - uses: ./.github/actions/build - with: - build-type: ${{ inputs.build-type }} - target-device: "${{ inputs.target-device }}" - host-platform: ${{ inputs.host-platform }} - upload-enabled: ${{ inputs.upload-enabled }} diff --git a/.github/workflows/gh-test.yml b/.github/workflows/gh-test.yml deleted file mode 100644 index 8216ce10..00000000 --- a/.github/workflows/gh-test.yml +++ /dev/null @@ -1,80 +0,0 @@ -name: Test - -on: - workflow_call: - inputs: - client-repo: - required: true - type: string - target-device: - required: true - type: string - test-options: - required: true - type: string - runs-on: - required: true - type: string - build-type: - required: true - type: string - description: One of ci / release - host-platform: - required: true - type: string - dependencies-file: - required: true - type: string - description: path to versions.json relative to the target repo dir - build-mode: - required: true - type: string - upload-enabled: - required: true - type: boolean - python-version: - required: false - type: string - has-built: - required: false - type: string - description: whether the built stage was launched (and passed) - -jobs: - build: - name: Test 
(${{ inputs.host-platform }}, Python "${{ inputs.python-version }}", Use container=${{ inputs.use-container }} ) - - permissions: - id-token: write # This is required for configure-aws-credentials - contents: read # This is required for actions/checkout - - runs-on: ${{ inputs.runs-on }} - - container: - options: -u root --security-opt seccomp=unconfined --privileged --shm-size 16g - image: condaforge/miniforge3:latest - env: - NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} - - steps: - - name: Checkout ${{ inputs.client-repo }} - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Setup - if: ${{ inputs.has-built != 'true' }} - uses: ./.github/actions/setup - with: - client-repo: ${{ inputs.client-repo }} - build-type: ${{ inputs.build-type }} - target-device: "${{ inputs.target-device }}" - host-platform: ${{ inputs.host-platform }} - build-mode: ${{ inputs.build-mode }} - upload-enabled: ${{ inputs.upload-enabled }} - python-version: ${{ inputs.python-version }} - - - name: Call test action - uses: ./.github/actions/test - with: - test-options: ${{ inputs.test-options }} From ca7b437189395454d8bc09ca5bae79b7862dff5c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 00:49:41 +0000 Subject: [PATCH 082/111] no need to pass secrets as we don't have reusable workflows anymore --- .github/workflows/gh-build-and-test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index f7296823..c9247f4e 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -34,7 +34,6 @@ jobs: (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || (inputs.host-platform == 'win-x64' && 'windows-2019') }} # (inputs.host-platform == 'win-x64' && 'windows-amd64-cpu8') }} - secrets: inherit steps: - name: Checkout ${{ github.event.repository.name }} uses: actions/checkout@v4 @@ -71,7 +70,6 @@ jobs: id-token: write # This is required for configure-aws-credentials contents: read # This is required for actions/checkout runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') }} - secrets: inherit container: options: -u root --security-opt seccomp=unconfined --privileged --shm-size 16g image: condaforge/miniforge3:latest From 5d0b014250d5313d2dffc40a5aba0ece12e705d2 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 01:13:24 +0000 Subject: [PATCH 083/111] pass job outputs explicitly... 
--- .github/workflows/ci-gh.yml | 3 ++- .github/workflows/gh-build-and-test.yml | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-gh.yml b/.github/workflows/ci-gh.yml index 31446beb..189258eb 100644 --- a/.github/workflows/ci-gh.yml +++ b/.github/workflows/ci-gh.yml @@ -1,3 +1,5 @@ +name: "CI" + concurrency: group: ${{ startsWith(github.ref_name, 'main') && format('unique-{0}', github.run_id) || format('ci-build-and-test-on-{0}-from-{1}', github.event_name, github.ref_name) }} cancel-in-progress: true @@ -10,7 +12,6 @@ on: jobs: ci: - name: "CI" strategy: fail-fast: false matrix: diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index c9247f4e..a7d2919a 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -60,6 +60,19 @@ jobs: host-platform: ${{ inputs.host-platform }} upload-enabled: ${{ inputs.upload-enabled }} + - name: Pass environment variables + id: pass_env + run: | + echo "CUDA_CORE_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_NAME}" >> $GITHUB_OUTPUT + echo "CUDA_CORE_ARTIFACTS_DIR=${CUDA_CORE_ARTIFACTS_DIR}" >> $GITHUB_OUTPUT + echo "CUDA_BINDINGS_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_NAME}" >> $GITHUB_OUTPUT + echo "CUDA_BINDINGS_ARTIFACTS_DIR=${CUDA_CORE_ARTIFACTS_DIR}" >> $GITHUB_OUTPUT + outputs: + CUDA_CORE_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACT_NAME }} + CUDA_CORE_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACTS_DIR }} + CUDA_BINDINGS_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} + CUDA_BINDINGS_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} + test: # TODO: improve the name once a separate test matrix is defined name: Test (CUDA ${{ inputs.cuda-version }}, Use container=${{ inputs.use-container }}) @@ -100,3 +113,8 @@ jobs: uses: ./.github/actions/test with: test-options: ${{ inputs.build-type }} + env: + CUDA_CORE_ARTIFACT_NAME: ${{ needs.build.outputs.CUDA_CORE_ARTIFACT_NAME }} + CUDA_CORE_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_CORE_ARTIFACTS_DIR }} + CUDA_BINDINGS_ARTIFACT_NAME: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} + CUDA_BINDINGS_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} From 7350965130c2172d59657515c7a1e834e859198c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 01:16:35 +0000 Subject: [PATCH 084/111] try changing the order --- .github/workflows/gh-build-and-test.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index a7d2919a..5c1f1f29 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -34,6 +34,11 @@ jobs: (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || (inputs.host-platform == 'win-x64' && 'windows-2019') }} # (inputs.host-platform == 'win-x64' && 'windows-amd64-cpu8') }} + outputs: + CUDA_CORE_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACT_NAME }} + CUDA_CORE_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACTS_DIR }} + CUDA_BINDINGS_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} + CUDA_BINDINGS_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} steps: - name: Checkout ${{ github.event.repository.name }} uses: actions/checkout@v4 @@ -65,13 +70,8 @@ jobs: run: | echo "CUDA_CORE_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_NAME}" >> 
$GITHUB_OUTPUT echo "CUDA_CORE_ARTIFACTS_DIR=${CUDA_CORE_ARTIFACTS_DIR}" >> $GITHUB_OUTPUT - echo "CUDA_BINDINGS_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_NAME}" >> $GITHUB_OUTPUT - echo "CUDA_BINDINGS_ARTIFACTS_DIR=${CUDA_CORE_ARTIFACTS_DIR}" >> $GITHUB_OUTPUT - outputs: - CUDA_CORE_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACT_NAME }} - CUDA_CORE_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACTS_DIR }} - CUDA_BINDINGS_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} - CUDA_BINDINGS_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} + echo "CUDA_BINDINGS_ARTIFACT_NAME=${CUDA_BINDINGS_ARTIFACT_NAME}" >> $GITHUB_OUTPUT + echo "CUDA_BINDINGS_ARTIFACTS_DIR=${CUDA_BINDINGS_ARTIFACTS_DIR}" >> $GITHUB_OUTPUT test: # TODO: improve the name once a separate test matrix is defined From eedffd9776bfe63c3967f089803e480d3017c37d Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 01:22:14 +0000 Subject: [PATCH 085/111] fix syntax --- .github/workflows/gh-build-and-test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 5c1f1f29..e7659a6f 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -35,10 +35,10 @@ jobs: (inputs.host-platform == 'win-x64' && 'windows-2019') }} # (inputs.host-platform == 'win-x64' && 'windows-amd64-cpu8') }} outputs: - CUDA_CORE_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACT_NAME }} - CUDA_CORE_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACTS_DIR }} - CUDA_BINDINGS_ARTIFACT_NAME=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} - CUDA_BINDINGS_ARTIFACTS_DIR=${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} + CUDA_CORE_ARTIFACT_NAME: ${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACT_NAME }} + CUDA_CORE_ARTIFACTS_DIR: ${{ steps.pass_env.outputs.CUDA_CORE_ARTIFACTS_DIR }} + CUDA_BINDINGS_ARTIFACT_NAME: ${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} + CUDA_BINDINGS_ARTIFACTS_DIR: ${{ steps.pass_env.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} steps: - name: Checkout ${{ github.event.repository.name }} uses: actions/checkout@v4 From c7d3e0322c2c9a077d8a657de2cf946345b0cd4e Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 01:37:29 +0000 Subject: [PATCH 086/111] fix workflow merge error --- .github/actions/test/action.yml | 2 +- .github/workflows/ci-gh.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 675263fb..edfcee3b 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -41,4 +41,4 @@ runs: - name: Run test / analysis shell: bash --noprofile --norc -xeuo pipefail {0} run: | - "${{ env.REPO_DIR }}/continuous_integration/scripts/entrypoint" "${{ env.REPO_DIR }}/continuous_integration/scripts/test" ${{ inputs.test-options }} + "./continuous_integration/scripts/entrypoint" "./continuous_integration/scripts/test" ${{ inputs.test-options }} diff --git a/.github/workflows/ci-gh.yml b/.github/workflows/ci-gh.yml index 189258eb..cb27c879 100644 --- a/.github/workflows/ci-gh.yml +++ b/.github/workflows/ci-gh.yml @@ -34,6 +34,7 @@ jobs: # Note: this is for build-time only; the test-time matrix needs to be # defined separately. 
- "12.6.2" + name: "CI" uses: ./.github/workflows/gh-build-and-test.yml with: From 35c244f3833d9e749d414e22710e2c091330832c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 03:00:52 +0000 Subject: [PATCH 087/111] allow mini-ctk to be cached & reused in tests --- .github/actions/setup/action.yml | 40 ++++++++++++++++++++ .github/actions/test/action.yml | 49 ++++++++++++++++++++++++- .github/workflows/gh-build-and-test.yml | 2 + 3 files changed, 90 insertions(+), 1 deletion(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index e00cf27f..bed73c57 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -47,7 +47,21 @@ runs: run: | env + - name: Set up CTK cache variable + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.xz" >> $GITHUB_ENV + + - name: Download CTK cache + id: ctk-get-cache + uses: actions/download-artifact@v4 + continue-on-error: true + with: + name: ${{ env.CTK_CACHE_FILENAME }} + path: . + - name: Get CUDA components + if: ${{ steps.ctk-get-cache.outcome == 'failure' }} shell: bash --noprofile --norc -xeuo pipefail {0} run: | CUDA_PATH="./cuda_toolkit" @@ -90,15 +104,41 @@ runs: } # Get headers and shared libraries in place + # Note: the existing artifact would need to be manually deleted (ex: through web UI) + # if this list is changed, as the artifact actions do not offer any option for us to + # invalidate the artifact. populate_cuda_path cuda_nvcc populate_cuda_path cuda_cudart populate_cuda_path cuda_nvrtc populate_cuda_path cuda_profiler_api ls -l $CUDA_PATH + # Prepare the cache + tar cf - $CUDA_PATH | xz -z -T0 - > $CTK_CACHE_FILENAME + # Note: the headers will be copied into the cibuildwheel manylinux container, # so setting the CUDA_PATH env var here is meaningless. + - name: Upload CTK cache + if: ${{ steps.ctk-get-cache.outcome == 'failure' }} + uses: actions/upload-artifact@v4 + with: + pattern: ${{ env.CTK_CACHE_FILENAME }} + path: . + if-no-files-found: error + + - name: Restore CTK cache + if: ${{ steps.ctk-get-cache.outcome == 'success' }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + CUDA_PATH="./cuda_toolkit" + mkdir $CUDA_PATH + tar -xvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 + ls -l $CUDA_PATH + if [ ! -d "$CUDA_PATH/include" ]; then + exit 1 + fi + - name: Set environment variables shell: bash --noprofile --norc -xeuo pipefail {0} run: | diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index edfcee3b..0881b645 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -38,7 +38,54 @@ runs: pwd ls -lahR $CUDA_CORE_ARTIFACTS_DIR + - name: Set up Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Set up CTK cache variable + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.xz" >> $GITHUB_ENV + + - name: Download CTK cache + id: ctk-get-cache + uses: actions/download-artifact@v4 + continue-on-error: true + with: + name: ${{ env.CTK_CACHE_FILENAME }} + path: . + + - name: Restore CTK cache + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + CUDA_PATH="./cuda_toolkit" + mkdir $CUDA_PATH + tar -xvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 + ls -l $CUDA_PATH + if [ ! 
-d "$CUDA_PATH/include" ]; then + exit 1 + fi + + # TODO: check if we really need these for tests? + echo "CUDA_PATH=$CUDA_PATH" >> $GITHUB_ENV + echo "PATH=$PATH:$CUDA_PATH/bin" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_PATH/lib" >> $GITHUB_ENV + - name: Run test / analysis shell: bash --noprofile --norc -xeuo pipefail {0} run: | - "./continuous_integration/scripts/entrypoint" "./continuous_integration/scripts/test" ${{ inputs.test-options }} + REPO_DIR=$(pwd) + + cd "${CUDA_BINDINGS_ARTIFACTS_DIR}" + pip install *.whl + + cd "${CUDA_CORE_ARTIFACTS_DIR}" + pip install *.whl + + cd "${REPO_DIR}/cuda_bindings" + pytest tests/ + #pytest tests/cython + + cd "${REPO_DIR}/cuda_core" + pytest tests/ diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index e7659a6f..23663aad 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -83,6 +83,7 @@ jobs: id-token: write # This is required for configure-aws-credentials contents: read # This is required for actions/checkout runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') }} + # TODO: use a different (nvidia?) container, or just run on bare image container: options: -u root --security-opt seccomp=unconfined --privileged --shm-size 16g image: condaforge/miniforge3:latest @@ -118,3 +119,4 @@ jobs: CUDA_CORE_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_CORE_ARTIFACTS_DIR }} CUDA_BINDINGS_ARTIFACT_NAME: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} CUDA_BINDINGS_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} + PYTHON_VERSION: ${{ inputs-python-version }} From 5615d5429ddc0f1ba752260c432d12dddd8babb6 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 03:02:39 +0000 Subject: [PATCH 088/111] fix typo --- .github/workflows/gh-build-and-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 23663aad..d8dce525 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -119,4 +119,4 @@ jobs: CUDA_CORE_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_CORE_ARTIFACTS_DIR }} CUDA_BINDINGS_ARTIFACT_NAME: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} CUDA_BINDINGS_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} - PYTHON_VERSION: ${{ inputs-python-version }} + PYTHON_VERSION: ${{ inputs.python-version }} From bb5fed32339d2e1679c97728ecc774af181bb6ff Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 03:17:22 +0000 Subject: [PATCH 089/111] try to escape | and > ... --- .github/actions/setup/action.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index bed73c57..567b2d25 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -114,7 +114,8 @@ runs: ls -l $CUDA_PATH # Prepare the cache - tar cf - $CUDA_PATH | xz -z -T0 - > $CTK_CACHE_FILENAME + # Note: try to escape | and > ... + echo "$(tar cf - ${CUDA_PATH} | xz -z -T0 - > ${CTK_CACHE_FILENAME})" # Note: the headers will be copied into the cibuildwheel manylinux container, # so setting the CUDA_PATH env var here is meaningless. 
From 9d6e69bae192998abe56b52f43511c87a42fdce0 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 03:36:25 +0000 Subject: [PATCH 090/111] switch to gz for simplicity --- .github/actions/setup/action.yml | 6 +++--- .github/actions/test/action.yml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index 567b2d25..2908399f 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -50,7 +50,7 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | - echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.xz" >> $GITHUB_ENV + echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache id: ctk-get-cache @@ -115,7 +115,7 @@ runs: # Prepare the cache # Note: try to escape | and > ... - echo "$(tar cf - ${CUDA_PATH} | xz -z -T0 - > ${CTK_CACHE_FILENAME})" + tar -czvf ${CTK_CACHE_FILENAME} ${CUDA_PATH} # Note: the headers will be copied into the cibuildwheel manylinux container, # so setting the CUDA_PATH env var here is meaningless. @@ -134,7 +134,7 @@ runs: run: | CUDA_PATH="./cuda_toolkit" mkdir $CUDA_PATH - tar -xvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 + tar -xzvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 ls -l $CUDA_PATH if [ ! -d "$CUDA_PATH/include" ]; then exit 1 diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 0881b645..559d05fe 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -46,7 +46,7 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | - echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.xz" >> $GITHUB_ENV + echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache id: ctk-get-cache @@ -61,7 +61,7 @@ runs: run: | CUDA_PATH="./cuda_toolkit" mkdir $CUDA_PATH - tar -xvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 + tar -xzvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 ls -l $CUDA_PATH if [ ! -d "$CUDA_PATH/include" ]; then exit 1 From 8ec609063e8c69966c9f06410f1b06e4692091d5 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 03:47:36 +0000 Subject: [PATCH 091/111] fix artifact parallel upload & lack of cache key --- .github/actions/setup/action.yml | 10 ++++++---- .github/actions/test/action.yml | 5 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index 2908399f..6c84d4c7 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -50,6 +50,7 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | + echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}" >> $GITHUB_ENV echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache @@ -57,8 +58,8 @@ runs: uses: actions/download-artifact@v4 continue-on-error: true with: - name: ${{ env.CTK_CACHE_FILENAME }} - path: . 
+ name: ${{ env.CTK_CACHE_KEY }} + path: ./${{ env.CTK_CACHE_FILENAME }} - name: Get CUDA components if: ${{ steps.ctk-get-cache.outcome == 'failure' }} @@ -123,9 +124,10 @@ runs: - name: Upload CTK cache if: ${{ steps.ctk-get-cache.outcome == 'failure' }} uses: actions/upload-artifact@v4 + continue-on-error: true with: - pattern: ${{ env.CTK_CACHE_FILENAME }} - path: . + name: ${{ env.CTK_CACHE_KEY }} + path: ./${{ env.CTK_CACHE_FILENAME }} if-no-files-found: error - name: Restore CTK cache diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 559d05fe..4fab178f 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -46,6 +46,7 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | + echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}" >> $GITHUB_ENV echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache @@ -53,8 +54,8 @@ runs: uses: actions/download-artifact@v4 continue-on-error: true with: - name: ${{ env.CTK_CACHE_FILENAME }} - path: . + name: ${{ env.CTK_CACHE_KEY }} + path: ./${{ env.CTK_CACHE_FILENAME }} - name: Restore CTK cache shell: bash --noprofile --norc -xeuo pipefail {0} From 94561772a8ab10df6f8daac424eff8b085ae6b08 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 03:51:50 +0000 Subject: [PATCH 092/111] fix download path --- .github/actions/setup/action.yml | 3 ++- .github/actions/test/action.yml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index 6c84d4c7..2c27ce94 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -59,7 +59,7 @@ runs: continue-on-error: true with: name: ${{ env.CTK_CACHE_KEY }} - path: ./${{ env.CTK_CACHE_FILENAME }} + path: ./ - name: Get CUDA components if: ${{ steps.ctk-get-cache.outcome == 'failure' }} @@ -136,6 +136,7 @@ runs: run: | CUDA_PATH="./cuda_toolkit" mkdir $CUDA_PATH + ls -l tar -xzvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 ls -l $CUDA_PATH if [ ! -d "$CUDA_PATH/include" ]; then diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 4fab178f..a1480810 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -55,13 +55,14 @@ runs: continue-on-error: true with: name: ${{ env.CTK_CACHE_KEY }} - path: ./${{ env.CTK_CACHE_FILENAME }} + path: ./ - name: Restore CTK cache shell: bash --noprofile --norc -xeuo pipefail {0} run: | CUDA_PATH="./cuda_toolkit" mkdir $CUDA_PATH + ls -l tar -xzvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 ls -l $CUDA_PATH if [ ! 
-d "$CUDA_PATH/include" ]; then From 2f2046d1e5f7f149fa756e7dbcc6468c81cfe98f Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 04:02:19 +0000 Subject: [PATCH 093/111] fix extract --- .github/actions/setup/action.yml | 5 ++--- .github/actions/test/action.yml | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index 2c27ce94..22cd8121 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -134,10 +134,9 @@ runs: if: ${{ steps.ctk-get-cache.outcome == 'success' }} shell: bash --noprofile --norc -xeuo pipefail {0} run: | - CUDA_PATH="./cuda_toolkit" - mkdir $CUDA_PATH ls -l - tar -xzvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 + CUDA_PATH="./cuda_toolkit" + tar -xzvf $CTK_CACHE_FILENAME ls -l $CUDA_PATH if [ ! -d "$CUDA_PATH/include" ]; then exit 1 diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index a1480810..e013b1d2 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -60,10 +60,9 @@ runs: - name: Restore CTK cache shell: bash --noprofile --norc -xeuo pipefail {0} run: | - CUDA_PATH="./cuda_toolkit" - mkdir $CUDA_PATH ls -l - tar -xzvf $CTK_CACHE_FILENAME -C $CUDA_PATH --strip-components=1 + CUDA_PATH="./cuda_toolkit" + tar -xzvf $CTK_CACHE_FILENAME ls -l $CUDA_PATH if [ ! -d "$CUDA_PATH/include" ]; then exit 1 From c361ad8f752af8a6e0a30b3299937b7a86c3244a Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 04:13:46 +0000 Subject: [PATCH 094/111] propagate cuda-version --- .github/actions/test/action.yml | 4 ++-- .github/workflows/gh-build-and-test.yml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index e013b1d2..2703af16 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -46,8 +46,8 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | - echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}" >> $GITHUB_ENV - echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.gz" >> $GITHUB_ENV + echo "CTK_CACHE_KEY=mini-ctk-${CTK_BUILD_VER}" >> $GITHUB_ENV + echo "CTK_CACHE_FILENAME=mini-ctk-${CTK_BUILD_VER}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache id: ctk-get-cache diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index d8dce525..556c47f2 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -120,3 +120,4 @@ jobs: CUDA_BINDINGS_ARTIFACT_NAME: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACT_NAME }} CUDA_BINDINGS_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} PYTHON_VERSION: ${{ inputs.python-version }} + CTK_BUILD_VER: ${{ inputs.cuda-version }} From 37a3bb15f92adc6365545a72e429a4388caaba79 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 23:26:36 -0500 Subject: [PATCH 095/111] install binding test deps --- .github/actions/test/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 2703af16..1328cbf2 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -85,6 +85,7 @@ runs: pip install *.whl cd "${REPO_DIR}/cuda_bindings" + pip install -r requirements.txt pytest tests/ #pytest tests/cython From 54707dcadbd55d263f2f9300f48d642989470434 Mon Sep 17 00:00:00 2001 From: 
Leo Fang Date: Sat, 7 Dec 2024 23:41:12 -0500 Subject: [PATCH 096/111] fix paths --- .github/actions/test/action.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 1328cbf2..a04e9ccf 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -61,14 +61,13 @@ runs: shell: bash --noprofile --norc -xeuo pipefail {0} run: | ls -l - CUDA_PATH="./cuda_toolkit" + CUDA_PATH="$(pwd)/cuda_toolkit" tar -xzvf $CTK_CACHE_FILENAME ls -l $CUDA_PATH if [ ! -d "$CUDA_PATH/include" ]; then exit 1 fi - # TODO: check if we really need these for tests? echo "CUDA_PATH=$CUDA_PATH" >> $GITHUB_ENV echo "PATH=$PATH:$CUDA_PATH/bin" >> $GITHUB_ENV echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_PATH/lib" >> $GITHUB_ENV From 3faf8a3da359d8af3483226b5dee33eb330f0afd Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Dec 2024 23:56:05 -0500 Subject: [PATCH 097/111] include nvjitlink to mini CTK for testing --- .github/actions/setup/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index 22cd8121..bed9a63d 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -112,6 +112,7 @@ runs: populate_cuda_path cuda_cudart populate_cuda_path cuda_nvrtc populate_cuda_path cuda_profiler_api + populate_cuda_path libnvjitlink ls -l $CUDA_PATH # Prepare the cache From e35706aa13c699c91e2b6076310ba75e4dc2228b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 05:19:14 +0000 Subject: [PATCH 098/111] ensure cupy is an optional test dependency --- cuda_core/tests/example_tests/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cuda_core/tests/example_tests/utils.py b/cuda_core/tests/example_tests/utils.py index 3d218a91..81479903 100644 --- a/cuda_core/tests/example_tests/utils.py +++ b/cuda_core/tests/example_tests/utils.py @@ -10,7 +10,6 @@ import os import sys -import cupy as cp import pytest @@ -53,4 +52,3 @@ def run_example(samples_path, filename, env=None): sys.argv = old_argv # further reduce the memory watermark gc.collect() - cp.get_default_memory_pool().free_all_blocks() From e0c610e08ec20fa94f3d6c2fa39964c5597fa15a Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 05:31:07 +0000 Subject: [PATCH 099/111] per arch ctk --- .github/actions/setup/action.yml | 4 ++-- .github/actions/test/action.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index bed9a63d..2416fcf4 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -50,8 +50,8 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | - echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}" >> $GITHUB_ENV - echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}.tar.gz" >> $GITHUB_ENV + echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}-${{ inputs.host-platform }}" >> $GITHUB_ENV + echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}-${{ inputs.host-platform }}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache id: ctk-get-cache diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index a04e9ccf..0a1e621a 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -46,8 +46,8 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | - echo 
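PATCH 098 removes the hard cupy import so the example tests can run on machines without it. If an example still wants to release CuPy's memory pool when CuPy happens to be installed, one hedged way to keep the dependency optional is a guarded import, sketched below (the removed lines used exactly this memory-pool call):

    # Sketch: treat cupy as an optional test dependency. Assumption: only the
    # memory-pool cleanup needs it; everything else in the utils is cupy-free.
    try:
        import cupy as cp
    except ImportError:
        cp = None

    def free_gpu_pool():
        # No-op when cupy is absent; otherwise return cached GPU blocks.
        if cp is not None:
            cp.get_default_memory_pool().free_all_blocks()
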
"CTK_CACHE_KEY=mini-ctk-${CTK_BUILD_VER}" >> $GITHUB_ENV - echo "CTK_CACHE_FILENAME=mini-ctk-${CTK_BUILD_VER}.tar.gz" >> $GITHUB_ENV + echo "CTK_CACHE_KEY=mini-ctk-${CTK_BUILD_VER}-${{ inputs.host-platform }}" >> $GITHUB_ENV + echo "CTK_CACHE_FILENAME=mini-ctk-${CTK_BUILD_VER}-${{ inputs.host-platform }}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache id: ctk-get-cache From fb487d0fedb9b91ef2d73024d407a9384aa657d3 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 05:41:57 +0000 Subject: [PATCH 100/111] fix arg passing --- .github/actions/test/action.yml | 4 ++-- .github/workflows/gh-build-and-test.yml | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 0a1e621a..c82a8450 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -46,8 +46,8 @@ runs: - name: Set up CTK cache variable shell: bash --noprofile --norc -xeuo pipefail {0} run: | - echo "CTK_CACHE_KEY=mini-ctk-${CTK_BUILD_VER}-${{ inputs.host-platform }}" >> $GITHUB_ENV - echo "CTK_CACHE_FILENAME=mini-ctk-${CTK_BUILD_VER}-${{ inputs.host-platform }}.tar.gz" >> $GITHUB_ENV + echo "CTK_CACHE_KEY=mini-ctk-${CTK_BUILD_VER}-${HOST_PLATFORM}" >> $GITHUB_ENV + echo "CTK_CACHE_FILENAME=mini-ctk-${CTK_BUILD_VER}-${HOST_PLATFORM}.tar.gz" >> $GITHUB_ENV - name: Download CTK cache id: ctk-get-cache diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index 556c47f2..06f6a168 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -75,7 +75,7 @@ jobs: test: # TODO: improve the name once a separate test matrix is defined - name: Test (CUDA ${{ inputs.cuda-version }}, Use container=${{ inputs.use-container }}) + name: Test (CUDA ${{ inputs.cuda-version }}) # TODO: enable testing once linux-aarch64 & win-64 GPU runners are up if: ${{ (github.repository_owner == 'nvidia') && startsWith(inputs.host-platform, 'linux-x64') }} @@ -121,3 +121,4 @@ jobs: CUDA_BINDINGS_ARTIFACTS_DIR: ${{ needs.build.outputs.CUDA_BINDINGS_ARTIFACTS_DIR }} PYTHON_VERSION: ${{ inputs.python-version }} CTK_BUILD_VER: ${{ inputs.cuda-version }} + HOST_PLATFORM: ${{ inputs.host-platform }} From c01e015c26d8d7f0e775865559d69e8aa3d7e823 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 05:55:48 +0000 Subject: [PATCH 101/111] fix invalid context during test teardown --- cuda_core/tests/conftest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 59e5883f..b67eeec2 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -30,6 +30,10 @@ def init_cuda(): def _device_unset_current(): + ctx = handle_return(driver.cuCtxGetCurrent()) + if int(ctx) == 0: + # no active context, do nothing + return handle_return(driver.cuCtxPopCurrent()) with _device._tls_lock: del _device._tls.devices From f1d0e4027231adfbd44a6e70070d939e5e129d97 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 06:09:32 +0000 Subject: [PATCH 102/111] WAR: mark PTX test xfail due to CI condition --- cuda_core/tests/test_program.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 95c4d377..562f89de 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -8,10 +8,20 @@ import pytest +from cuda import cuda, nvrtc from cuda.core.experimental import Program from 
cuda.core.experimental._module import Kernel, ObjectCode +@pytest.fixture +def can_load_generated_ptx(): + _, driver_ver = cuda.cuDriverGetVersion() + _, nvrtc_major, nvrtc_minor = nvrtc.nvrtcVersion() + if nvrtc_major * 1000 + nvrtc_minor * 10 > driver_ver: + return False + return True + + def test_program_init_valid_code_type(): code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") @@ -31,6 +41,8 @@ def test_program_init_invalid_code_format(): Program(code, "c++") +# TODO: incorporate this check in Program +@pytest.mark.xfail(not can_load_generated_ptx, reason="PTX version too new") def test_program_compile_valid_target_type(): code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") From a08fbc94bb811298a3d9a190bd7d8cc6c72c1fbc Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 01:34:16 -0500 Subject: [PATCH 103/111] debug --- .github/actions/test/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index c82a8450..78572917 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -89,4 +89,4 @@ runs: #pytest tests/cython cd "${REPO_DIR}/cuda_core" - pytest tests/ + pytest -rxXs tests/ From f36393e65c80d667f9eeed311d52038f0026da7c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 15:15:13 +0000 Subject: [PATCH 104/111] also detect if CUDA is ever initialized --- cuda_core/tests/conftest.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index b67eeec2..9c8ed52b 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -18,7 +18,7 @@ import pytest from cuda.core.experimental import Device, _device -from cuda.core.experimental._utils import handle_return +from cuda.core.experimental._utils import CUDAError, handle_return @pytest.fixture(scope="function") @@ -30,10 +30,15 @@ def init_cuda(): def _device_unset_current(): - ctx = handle_return(driver.cuCtxGetCurrent()) - if int(ctx) == 0: - # no active context, do nothing - return + try: + ctx = handle_return(driver.cuCtxGetCurrent()) + except CUDAError as e: + if "CUDA_ERROR_NOT_INITIALIZED" in str(e): + return + else: + if int(ctx) == 0: + # no active context, do nothing + return handle_return(driver.cuCtxPopCurrent()) with _device._tls_lock: del _device._tls.devices From f3cc6bde2b575bb9f1a4bfe785e345cf57a575a1 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 16:26:34 +0000 Subject: [PATCH 105/111] ensure CUDA is init'd at test start time --- cuda_core/tests/conftest.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 9c8ed52b..58cc7cef 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -21,8 +21,14 @@ from cuda.core.experimental._utils import CUDAError, handle_return +@pytest.fixture(scope="session", autouse=True) +def always_init_cuda(): + handle_return(driver.cuInit(0)) + + @pytest.fixture(scope="function") def init_cuda(): + # TODO: rename this to e.g. 
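The xfail in PATCH 102 encodes a real constraint: a driver can only load PTX whose ISA version it knows about, so PTX emitted by an NVRTC newer than the installed driver fails to load (typically surfacing as CUDA_ERROR_UNSUPPORTED_PTX_VERSION). The comparison works because both sides use the same integer encoding, major * 1000 + minor * 10:

    from cuda import cuda, nvrtc

    _, driver_ver = cuda.cuDriverGetVersion()   # e.g. 12040 for a CUDA 12.4 driver
    _, major, minor = nvrtc.nvrtcVersion()      # e.g. (12, 6)
    nvrtc_ver = major * 1000 + minor * 10       # 12060, same encoding as driver_ver
    print("PTX loadable by this driver:", nvrtc_ver <= driver_ver)
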
init_context device = Device() device.set_current() yield @@ -30,15 +36,10 @@ def init_cuda(): def _device_unset_current(): - try: - ctx = handle_return(driver.cuCtxGetCurrent()) - except CUDAError as e: - if "CUDA_ERROR_NOT_INITIALIZED" in str(e): - return - else: - if int(ctx) == 0: - # no active context, do nothing - return + ctx = handle_return(driver.cuCtxGetCurrent()) + if int(ctx) == 0: + # no active context, do nothing + return handle_return(driver.cuCtxPopCurrent()) with _device._tls_lock: del _device._tls.devices @@ -46,6 +47,7 @@ def _device_unset_current(): @pytest.fixture(scope="function") def deinit_cuda(): + # TODO: rename this to e.g. deinit_context yield _device_unset_current() From b1f07a38f60c5aa70cd71048b79b7efe6b934094 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 16:41:15 +0000 Subject: [PATCH 106/111] enforce the right CC is passed to NVRTC --- cuda_core/tests/conftest.py | 2 +- cuda_core/tests/test_program.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 58cc7cef..fe755738 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -18,7 +18,7 @@ import pytest from cuda.core.experimental import Device, _device -from cuda.core.experimental._utils import CUDAError, handle_return +from cuda.core.experimental._utils import handle_return @pytest.fixture(scope="session", autouse=True) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 562f89de..10789856 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -9,7 +9,7 @@ import pytest from cuda import cuda, nvrtc -from cuda.core.experimental import Program +from cuda.core.experimental import Device, Program from cuda.core.experimental._module import Kernel, ObjectCode @@ -46,7 +46,9 @@ def test_program_init_invalid_code_format(): def test_program_compile_valid_target_type(): code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") - object_code = program.compile("ptx") + arch = "".join(str(i) for i in Device().compute_capability) + object_code = program.compile("ptx", options=(f"-arch=compute_{arch}",)) + print(object_code._module.decode()) kernel = object_code.get_kernel("my_kernel") assert isinstance(object_code, ObjectCode) assert isinstance(kernel, Kernel) From 8a6738b3aa3d265933a68f58b3b891ee8b7c196a Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 12:10:57 -0500 Subject: [PATCH 107/111] fix xfail mark --- cuda_core/tests/test_program.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 10789856..f1c24b3e 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -13,7 +13,6 @@ from cuda.core.experimental._module import Kernel, ObjectCode -@pytest.fixture def can_load_generated_ptx(): _, driver_ver = cuda.cuDriverGetVersion() _, nvrtc_major, nvrtc_minor = nvrtc.nvrtcVersion() @@ -42,7 +41,7 @@ def test_program_init_invalid_code_format(): # TODO: incorporate this check in Program -@pytest.mark.xfail(not can_load_generated_ptx, reason="PTX version too new") +@pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new") def test_program_compile_valid_target_type(): code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") From ed0386a33d9273e766063813c85828b0b5ffda54 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 22:42:25 +0000 
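PATCH 106 pins the NVRTC target to the GPU actually present rather than letting NVRTC fall back to its default architecture, which is what made the generated PTX unloadable on some CI runners. The same pattern outside the test suite (assumes a device is current; compute_capability yields the (major, minor) pair that forms the compute_XY name):

    from cuda.core.experimental import Device, Program

    dev = Device()
    dev.set_current()
    arch = "".join(str(i) for i in dev.compute_capability)  # e.g. "86"
    prog = Program('extern "C" __global__ void k() {}', "c++")
    mod = prog.compile("ptx", options=(f"-arch=compute_{arch}",))
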
Subject: [PATCH 108/111] switch to use github cache to improve reuse --- .github/actions/setup/action.yml | 19 +++++++++---------- .github/actions/test/action.yml | 7 ++++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index 2416fcf4..084f4c2f 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -55,14 +55,14 @@ runs: - name: Download CTK cache id: ctk-get-cache - uses: actions/download-artifact@v4 + uses: actions/cache/restore@v4 continue-on-error: true with: - name: ${{ env.CTK_CACHE_KEY }} - path: ./ + key: ${{ env.CTK_CACHE_KEY }} + path: ./${{ env.CTK_CACHE_FILENAME }} - name: Get CUDA components - if: ${{ steps.ctk-get-cache.outcome == 'failure' }} + if: ${{ steps.ctk-get-cache.outputs.cache-hit != 'true' }} shell: bash --noprofile --norc -xeuo pipefail {0} run: | CUDA_PATH="./cuda_toolkit" @@ -123,16 +123,15 @@ runs: # so setting the CUDA_PATH env var here is meaningless. - name: Upload CTK cache - if: ${{ steps.ctk-get-cache.outcome == 'failure' }} - uses: actions/upload-artifact@v4 - continue-on-error: true + if: ${{ always() && + steps.ctk-get-cache.outputs.cache-hit != 'true' }} + uses: actions/cache/save@v4 with: - name: ${{ env.CTK_CACHE_KEY }} + key: ${{ env.CTK_CACHE_KEY }} path: ./${{ env.CTK_CACHE_FILENAME }} - if-no-files-found: error - name: Restore CTK cache - if: ${{ steps.ctk-get-cache.outcome == 'success' }} + if: ${{ steps.ctk-get-cache.outputs.cache-hit == 'true' }} shell: bash --noprofile --norc -xeuo pipefail {0} run: | ls -l diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 78572917..66468bd1 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -51,11 +51,12 @@ runs: - name: Download CTK cache id: ctk-get-cache - uses: actions/download-artifact@v4 + uses: actions/cache/restore@v4 continue-on-error: true with: - name: ${{ env.CTK_CACHE_KEY }} - path: ./ + key: ${{ env.CTK_CACHE_KEY }} + path: ./${{ env.CTK_CACHE_FILENAME }} + fail-on-cache-miss: true - name: Restore CTK cache shell: bash --noprofile --norc -xeuo pipefail {0} From 7b074f03794bf3cb2b6cabdf2d77fd7860cbcb86 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 8 Dec 2024 22:54:45 +0000 Subject: [PATCH 109/111] clean up legacy CI scripts --- continuous_integration/environment.yml | 24 --- continuous_integration/scripts/build | 32 ---- continuous_integration/scripts/conda-utils | 16 -- continuous_integration/scripts/entrypoint | 20 -- .../scripts/generate-environment | 36 ---- continuous_integration/scripts/make-conda-env | 27 --- continuous_integration/scripts/setup-utils | 179 ------------------ continuous_integration/scripts/test | 38 ---- 8 files changed, 372 deletions(-) delete mode 100644 continuous_integration/environment.yml delete mode 100755 continuous_integration/scripts/build delete mode 100755 continuous_integration/scripts/conda-utils delete mode 100755 continuous_integration/scripts/entrypoint delete mode 100755 continuous_integration/scripts/generate-environment delete mode 100755 continuous_integration/scripts/make-conda-env delete mode 100755 continuous_integration/scripts/setup-utils delete mode 100755 continuous_integration/scripts/test diff --git a/continuous_integration/environment.yml b/continuous_integration/environment.yml deleted file mode 100644 index 6d922d43..00000000 --- a/continuous_integration/environment.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: cuda_python -channels: - - defaults 
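The switch in PATCH 108 from build artifacts to actions/cache changes the reuse model: cache entries are keyed, immutable once saved, and shared across workflow runs, which is why the save step is guarded by cache-hit != 'true' (never overwrite an existing key) and wrapped in always() (save even when a later step failed). That contract, sketched as a hypothetical local helper rather than the hosted service:

    import shutil
    from pathlib import Path

    STORE = Path("/tmp/ctk-cache")  # stand-in for the hosted cache service

    def restore(key: str, dest: Path) -> bool:
        entry = STORE / key
        if entry.exists():
            shutil.copy(entry, dest)
            return True          # maps to steps.<id>.outputs.cache-hit == 'true'
        return False

    def save(key: str, src: Path) -> None:
        entry = STORE / key
        if entry.exists():
            return               # keys are immutable; a second save is a no-op
        STORE.mkdir(parents=True, exist_ok=True)
        shutil.copy(src, entry)
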
-dependencies: - - python>=3.10 - - cython>=3.0.0 - - pytest>=6.2.4 - - numpy>=1.21.1 - - setuptools - - wheel - - pip - - cuda-version=12.6 - - cuda-cudart-static - - cuda-driver-dev - - cuda-cudart-dev - - cuda-profiler-api - - cuda-nvrtc-dev - - cuda-nvcc - - pip: - - pytest-benchmark>=3.4.1 - - pyclibrary>=0.1.7 - - versioneer==0.29 - - tomli; python_version < "3.11" - - pywin32; sys_platform == 'win32' diff --git a/continuous_integration/scripts/build b/continuous_integration/scripts/build deleted file mode 100755 index 618edd5f..00000000 --- a/continuous_integration/scripts/build +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -build_ci() { - set -xeou pipefail - - export CUDA_HOME="${CONDA_PREFIX}/targets/x86_64-linux" - export PARALLEL_LEVEL=$(nproc --ignore 1) - - cd "${REPO_DIR}/cuda_bindings" - python setup.py bdist_wheel - - cd "${REPO_DIR}/cuda_core" - python setup.py bdist_wheel -} - -build_project() { - set -xeou pipefail - - export PYTHONUNBUFFERED=1 - - . setup-utils; - init_build_env "$@"; - - git config --global --add safe.directory "$REPO_DIR/.git" - - case "${BUILD_TYPE}" in - ci) build_ci;; - *) return 1;; - esac -} - -(build_project "$@"); diff --git a/continuous_integration/scripts/conda-utils b/continuous_integration/scripts/conda-utils deleted file mode 100755 index e0dd32ca..00000000 --- a/continuous_integration/scripts/conda-utils +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -activate_conda_env() { - set +xu - eval "$(conda shell.bash hook)" - conda activate "${CONDA_ENV}"; - set -xu - : ${PYTHON_VERSION:=$(python -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")} - export PYTHON_VERSION -} - -conda_info() { - set +x - conda info - set -x -} diff --git a/continuous_integration/scripts/entrypoint b/continuous_integration/scripts/entrypoint deleted file mode 100755 index fe4f5cea..00000000 --- a/continuous_integration/scripts/entrypoint +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash - -set_initial_env() { - set -xeuo pipefail - - export PATH="${PATH}:${REPO_DIR}/continuous_integration/scripts" -} - -entrypoint() { - set -xeuo pipefail - set_initial_env; - - git config --global --add safe.directory "$REPO_DIR/.git" - - cd "${REPO_DIR}" - - exec "$@"; -} - -entrypoint "$@"; diff --git a/continuous_integration/scripts/generate-environment b/continuous_integration/scripts/generate-environment deleted file mode 100755 index 8bf2c38d..00000000 --- a/continuous_integration/scripts/generate-environment +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -# Function to generate environment.yml -generate_environment_yml() { - local python_version=$1 - local cuda_version=$2 - local output_path=$3 - - cat < "${output_path}/environment.yml" -name: cuda_python -channels: - - defaults - - conda-forge -dependencies: - - python=${python_version} - - cython - - pytest - - numpy - - setuptools - - wheel - - pip - - cuda-version=${cuda_version} - - cuda-cudart-static - - cuda-driver-dev - - cuda-cudart-dev - - cuda-profiler-api - - cuda-nvrtc-dev - - cuda-nvcc - - pip: - - pytest-benchmark - - pyclibrary - - versioneer==0.29 - - tomli; python_version < "3.11" - - pywin32; sys_platform == 'win32' -EOF -} \ No newline at end of file diff --git a/continuous_integration/scripts/make-conda-env b/continuous_integration/scripts/make-conda-env deleted file mode 100755 index 37539b37..00000000 --- a/continuous_integration/scripts/make-conda-env +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash - -set -x - -make_ci_env() { - #TODO wire cuda 
version as a top level matrix argument - generate_environment_yml "${PYTHON_VERSION}" 12.6 . - mamba env create -n "${CONDA_ENV}" -f ./environment.yml -} - -make_conda_env() { - set -xeuo pipefail - - . setup-utils; - . generate-environment - set_base_defs; - - case "$1" in - ci) make_ci_env;; - test) make_test_env;; - *) return 1;; - esac - - return 0; -} - -(make_conda_env "$@"); diff --git a/continuous_integration/scripts/setup-utils b/continuous_integration/scripts/setup-utils deleted file mode 100755 index f8faefa4..00000000 --- a/continuous_integration/scripts/setup-utils +++ /dev/null @@ -1,179 +0,0 @@ -#!/usr/bin/env bash - -install_from_apt() { - set -xeuo pipefail - - export DEBIAN_FRONTEND=non-interactive - - # Run package updates and install packages - apt-get -q update - apt-get -q install -y wget curl jq sudo ninja-build vim numactl rsync -} - -install_cmake() { - set -xeuo pipefail - - wget -q https://github.com/Kitware/CMake/releases/download/v3.26.5/cmake-3.26.5-linux-x86_64.tar.gz - - tar -xzf cmake-3.26.5-linux-x86_64.tar.gz -} - -setup_linux_build_env() { - set -xeuo pipefail - export OS_SHORT_NAME=linux - export PATH="${PATH}:${PREBUILD_DIR}/cmake-3.26.5-linux-x86_64/bin" - - mkdir -p /tmp/out /tmp/env_yaml -} - -install_linux_tools() { - set -xeuo pipefail - - export SED=sed - export READLINK=readlink - - install_from_apt; - install_cmake; - - mkdir -p /tmp/out /tmp/env_yaml -} - -install_linux_test_tools() { - set -xeuo pipefail - - export SED=sed - export READLINK=readlink - - # Run package updates and install packages - apt-get -q update - apt-get -q install -y numactl -} - -set_base_defs() { - set -xeuo pipefail - - export CONDA_ENV=cuda_python - - CONDA_PLATFORM=$(conda info | grep 'platform' | awk -F ' : ' '{print $2}') - export CONDA_PLATFORM - - export PREBUILD_DIR=/tmp/prebuild - mkdir -p "$PREBUILD_DIR" - - export BUILD_DIR="$REPO_DIR/build" - - # Get the machine architecture - ARCH=$(uname -m) - - if [ "$ARCH" == "aarch64" ]; then - # Use the gcc march value used by aarch64 Ubuntu. - BUILD_MARCH=armv8-a - else - # Use uname -m otherwise - BUILD_MARCH=$(uname -m | tr '_' '-') - fi - - export BUILD_MARCH - - export CUDA_VERSION=12.2.2 - - export MAX_LIBSANITIZER_VERSION=11.4 - - export USE_OPENMP=ON -} - -# ----------------------------------------------------------------------------- - -prep_git() { - # Temporarily disable exit on error - set +e - git config --global user.email > /dev/null - local email_exit_status=$? - git config --global user.name > /dev/null - local name_exit_status=$? - # Re-enable exit on error - set -e - - if [ $email_exit_status -ne 0 ]; then - git config --global --add user.email "users.noreply.github.com" - echo "git user.email was not set. It's now set to users.noreply.github.com" - else - echo "Note: git user.email is already set." - fi - - if [ $name_exit_status -ne 0 ]; then - git config --global --add user.name "anon" - echo "git user.name was not set. It's now set to anon" - else - echo "Note: git user.name is already set." - fi - - # Fix "fatal: detected dubious ownership in repository at '/tmp/legate.core'" - # during local builds. - git config --global --add safe.directory "$REPO_DIR" -} - - -setup_build_env() { - set -xeuo pipefail - - install_linux_tools; - - setup_linux_build_env; - - rm -rf "$PREBUILD_DIR" - mkdir -p "$PREBUILD_DIR" - cd $PREBUILD_DIR - - prep_git; -} - -init_build_env() { - set -x; - - . 
conda-utils; - - export BUILD_TYPE=$1 - - set -xeuo pipefail; - - set_base_defs; - - cd "$PREBUILD_DIR" - - setup_build_env; - - cd "$REPO_DIR"; - - if [[ -d "${BUILD_DIR}" ]]; then - rm -rf "${BUILD_DIR}" - fi - - make-conda-env "$BUILD_TYPE"; - - activate_conda_env; - conda_info; -} - -init_test_env() { - set -x; - - . conda-utils; - - export TEST_TYPE=$1 - - set -xeuo pipefail; - - set_base_defs; - - cd "$PREBUILD_DIR" - - # setup_test_env; - - cd "$REPO_DIR"; - - make-conda-env "$TEST_TYPE"; - - activate_conda_env; - conda_info; -} \ No newline at end of file diff --git a/continuous_integration/scripts/test b/continuous_integration/scripts/test deleted file mode 100755 index 3a705c3c..00000000 --- a/continuous_integration/scripts/test +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env bash - -test_ci() { - set -xeou pipefail - - activate_conda_env; - - cd "${BINDINGS_ARTIFACTS_DIR}" - pip install *.whl - - cd "${CORE_ARTIFACTS_DIR}" - pip install *.whl - - cd "${REPO_DIR}/cuda_core" - python -m pytest tests/ - - cd "${REPO_DIR}/cuda_bindings" - python -m pytest tests/ - -} - -test_project() { - set -xeou pipefail - - export PYTHONUNBUFFERED=1 - - . setup-utils; - init_test_env "$@"; - - git config --global --add safe.directory "$REPO_DIR/.git" - - case "${TEST_TYPE}" in - ci) test_ci;; - *) return 1;; - esac -} - -(test_project "$@"); From 6a595945a4a25f35cc4cfd312ac797fd30bc435c Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 9 Dec 2024 12:06:56 -0800 Subject: [PATCH 110/111] fix build warning and output format of docs --- cuda_core/docs/source/conf.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index 3a7afc09..4b3e17ae 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -93,15 +93,26 @@ napoleon_numpy_docstring = True +section_titles = ["Returns"] def autodoc_process_docstring(app, what, name, obj, options, lines): if name.startswith("cuda.core.experimental.system"): - # patch the docstring (in lines) *in-place* + # patch the docstring (in lines) *in-place*. Should docstrings include section titles other than "Returns", + # this will need to be modified to handle them. attr = name.split(".")[-1] from cuda.core.experimental._system import System lines_new = getattr(System, attr).__doc__.split("\n") + formatted_lines = [] + for line in lines_new: + title = line.strip() + if title in section_titles: + formatted_lines.append(line.replace(title, f".. rubric:: {title}")) + elif line.strip() == "-" * len(title): + formatted_lines.append(" " * len(title)) + else: + formatted_lines.append(line) n_pops = len(lines) - lines.extend(lines_new) + lines.extend(formatted_lines) for _ in range(n_pops): lines.pop(0) From 769ac6679e06b10eb609d8df1cfb19fb58cdf9c4 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Mon, 9 Dec 2024 12:10:13 -0800 Subject: [PATCH 111/111] Update cuda_core/docs/source/api_private.rst Co-authored-by: Leo Fang --- cuda_core/docs/source/api_private.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index a833d69c..f100eb7c 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -16,7 +16,6 @@ CUDA runtime _memory.Buffer _stream.Stream _event.Event - _system.System CUDA compilation toolchain
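
The autodoc_process_docstring hook in PATCH 110 only has an effect because it mutates lines in place: Sphinx hands every handler the same list object, so rebinding the name would be invisible to the build, which is why the patch extends the list and then pops the old entries. Slice assignment is the more common idiom for the same thing, and the handler still has to be registered in conf.py; the setup() wiring below is an assumption, as the patch only shows the handler body:

    def setup(app):
        # Assumed registration; not shown in this patch.
        app.connect("autodoc-process-docstring", autodoc_process_docstring)

    def autodoc_process_docstring_sketch(app, what, name, obj, options, lines):
        # Equivalent in-place update via slice assignment: the list object that
        # Sphinx passed in is the one that must end up holding the new text.
        # This hedged variant drops the dashed underline rows instead of
        # blanking them, which is equally valid for rubric conversion.
        lines[:] = [
            f".. rubric:: {ln.strip()}" if ln.strip() == "Returns" else ln
            for ln in lines
            if set(ln.strip()) != {"-"}
        ]
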