From 521be20ef9cc94a256b985f8954c673fed2bf620 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 12 Jul 2024 11:47:11 -0700 Subject: [PATCH 01/52] off the ground --- numba_cuda/numba/cuda/codegen.py | 12 +++ numba_cuda/numba/cuda/cudadrv/nrt.py | 126 +++++++++++++++++++++++++++ numba_cuda/numba/cuda/runtime/nrt.cu | 123 ++++++++++++++++++++++++++ numba_cuda/numba/cuda/target.py | 1 + 4 files changed, 262 insertions(+) create mode 100644 numba_cuda/numba/cuda/cudadrv/nrt.py create mode 100644 numba_cuda/numba/cuda/runtime/nrt.cu diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py index 6009dcb..cd52d95 100644 --- a/numba_cuda/numba/cuda/codegen.py +++ b/numba_cuda/numba/cuda/codegen.py @@ -4,6 +4,7 @@ from numba.core.codegen import Codegen, CodeLibrary from .cudadrv import devices, driver, nvvm, runtime from numba.cuda.cudadrv.libs import get_cudalib +import functools import os import subprocess @@ -12,6 +13,14 @@ CUDA_TRIPLE = 'nvptx64-nvidia-cuda' +@functools.cache +def make_nrt(): + from numba.cuda import descriptor + from numba.cuda.cudadrv.nrt import compile_nrt_functions + + library = compile_nrt_functions(descriptor.cuda_target.target_context) + numba_cuda_runtime = library.get_asm_str().encode() + return numba_cuda_runtime def run_nvdisasm(cubin, flags): # nvdisasm only accepts input from a file, so we need to write out to a @@ -180,6 +189,9 @@ def get_cubin(self, cc=None): linker = driver.Linker.new(max_registers=self._max_registers, cc=cc) +# numba_cuda_runtime = make_nrt() + # linker.add_ptx(numba_cuda_runtime) + if linker.lto: ltoir = self.get_ltoir(cc=cc) linker.add_ltoir(ltoir) diff --git a/numba_cuda/numba/cuda/cudadrv/nrt.py b/numba_cuda/numba/cuda/cudadrv/nrt.py new file mode 100644 index 0000000..7aad320 --- /dev/null +++ b/numba_cuda/numba/cuda/cudadrv/nrt.py @@ -0,0 +1,126 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. + +from llvmlite import ir +from numba.core import cgutils, config +from numba.core.runtime.nrtdynmod import ( + _define_atomic_cas, + _define_atomic_inc_dec, + _define_nrt_incref, + _define_nrt_meminfo_data, + _pointer_type, + incref_decref_ty, +) + + + + +def _define_nrt_decref(module, atomic_decr): + """ + Implement NRT_decref in the module + """ + fn_decref = cgutils.get_or_insert_function( + module, incref_decref_ty, "NRT_decref" + ) + # Cannot inline this for refcount pruning to work + + fn_decref.attributes.add("noinline") + calldtor = ir.Function( + module, + ir.FunctionType(ir.VoidType(), [_pointer_type, _pointer_type]), + name="NRT_MemInfo_call_dtor", + ) + + builder = ir.IRBuilder(fn_decref.append_basic_block()) + [ptr] = fn_decref.args + is_null = builder.icmp_unsigned( + "==", ptr, cgutils.get_null_value(ptr.type) + ) + with cgutils.if_unlikely(builder, is_null): + builder.ret_void() + + # For memory fence usage, see https://llvm.org/docs/Atomics.html + + # A release fence is used before the relevant write operation. + # No-op on x86. On POWER, it lowers to lwsync. + # builder.fence("release") + + word_ptr = builder.bitcast(ptr, atomic_decr.args[0].type) + + if config.DEBUG_NRT: + cgutils.printf( + builder, "*** NRT_Decref %zu [%p]\n", builder.load(word_ptr), ptr + ) + newrefct = builder.call( + atomic_decr, [word_ptr] # nvvmutils.atomic_dec_int64, etc + ) + + refct_eq_0 = builder.icmp_unsigned( + "==", newrefct, ir.Constant(newrefct.type, 0) + ) + with cgutils.if_unlikely(builder, refct_eq_0): + # An acquire fence is used after the relevant read operation. + # No-op on x86. 
On POWER, it lowers to lwsync. + # builder.fence("acquire") + builder.call( + calldtor, + [ + ptr, + ir.Constant.inttoptr( + ir.Constant(ir.IntType(64), memsys), _pointer_type + ), + ], + ) + builder.ret_void() + + +def create_nrt_module(ctx): + """ + Create an IR module defining the LLVM NRT functions. + A (IR module, library) tuple is returned. + """ + codegen = ctx.codegen() + library = codegen.create_library("nrt") + + # Implement LLVM module with atomic ops + ir_mod = library.create_ir_module("nrt_module") + + atomic_inc = _define_atomic_inc_dec(ir_mod, "add", ordering="monotonic") + atomic_dec = _define_atomic_inc_dec(ir_mod, "sub", ordering="monotonic") + _define_atomic_cas(ir_mod, ordering="monotonic") + + _define_nrt_meminfo_data(ir_mod) + _define_nrt_incref(ir_mod, atomic_inc) + _define_nrt_decref(ir_mod, atomic_dec) + + # _define_nrt_unresolved_abort(ctx, ir_mod) + return ir_mod, library + + +def compile_nrt_functions(ctx): + """ + Compile all LLVM NRT functions and return a library containing them. + The library is created using the given target context. + """ + ir_mod, library = create_nrt_module(ctx) + + library.add_ir_module(ir_mod) + library.finalize() + breakpoint() + + return library + + +# compile NRT functions written in lowering into a PTX string +#library = compile_nrt_functions(descriptor.cuda_target.target_context) +#_cuda_nrt_library = compile_nrt_functions( +# descriptor.cuda_target.target_context +#) +#numba_cuda_runtime = _cuda_nrt_library._get_ptxes()[0].encode() + +def NRT_MemSys_new(): + # TODO + pass + + +memsys = NRT_MemSys_new() +numba_cuda_runtime=None diff --git a/numba_cuda/numba/cuda/runtime/nrt.cu b/numba_cuda/numba/cuda/runtime/nrt.cu new file mode 100644 index 0000000..f538d3e --- /dev/null +++ b/numba_cuda/numba/cuda/runtime/nrt.cu @@ -0,0 +1,123 @@ + +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef _NRT_H +#define _NRT_H + +#include + +typedef __device__ void (*NRT_dtor_function)(void* ptr, size_t size, void* info); +typedef __device__ void (*NRT_dealloc_func)(void* ptr, void* dealloc_info); + +typedef struct MemInfo NRT_MemInfo; + +extern "C" { +struct MemInfo { + cuda::atomic refct; + NRT_dtor_function dtor; + void* dtor_info; + void* data; + size_t size; +}; +} + +// Globally needed variables +struct NRT_MemSys { + struct { + bool enabled; + cuda::atomic alloc; + cuda::atomic free; + cuda::atomic mi_alloc; + cuda::atomic mi_free; + } stats; +}; + +/* The Memory System object */ +__device__ NRT_MemSys* TheMSys; + +extern "C" __device__ void* NRT_Allocate(size_t size) +{ + void* ptr = NULL; + ptr = malloc(size); +// if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; } + return ptr; +} + +extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi, + void* data, + size_t size, + NRT_dtor_function dtor, + void* dtor_info) +// NRT_MemSys* TheMSys) +{ + mi->refct = 1; /* starts with 1 refct */ + mi->dtor = dtor; + mi->dtor_info = dtor_info; + mi->data = data; + mi->size = size; +// if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; } +} + +__device__ NRT_MemInfo* NRT_MemInfo_new( + void* data, size_t size, NRT_dtor_function dtor, void* dtor_info) +{ + NRT_MemInfo* mi = (NRT_MemInfo*)NRT_Allocate(sizeof(NRT_MemInfo)); + if (mi != NULL) { NRT_MemInfo_init(mi, data, size, dtor, dtor_info); } + return mi; +} + +extern "C" __device__ void NRT_Free(void* ptr) +{ + free(ptr); + //if (TheMSys->stats.enabled) { TheMSys->stats.free++; } +} + +extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi) +{ + NRT_Free(mi); +} + +extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi) +{ + NRT_dealloc(mi); + //if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; } +} +extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi) +{ + if (mi->dtor) /* We have a destructor */ + mi->dtor(mi->data, mi->size, NULL); + /* Clear and release MemInfo */ + NRT_MemInfo_destroy(mi); +} + +/* + c++ version of the NRT_decref function that usually is added to + the final kernel link in PTX form by numba. This version may be + used by c++ APIs that accept ownership of live objects and must + manage them going forward. 
+*/ +extern "C" __device__ void NRT_decref(NRT_MemInfo* mi) +{ + mi->refct--; + if (mi->refct == 0) { NRT_MemInfo_call_dtor(mi); } +} + +#endif + +extern "C" __device__ void NRT_incref(NRT_MemInfo* mi) +{ + mi->refct++; +} diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index 6402ff3..f43b099 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -67,6 +67,7 @@ def resolve_value_type(self, val): class CUDATargetContext(BaseContext): implement_powi_as_math_call = True strict_alignment = True + enable_nrt = True def __init__(self, typingctx, target='cuda'): super().__init__(typingctx, target) From 0f9bc4a1810e671fac96915112107c9df10d03ef Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 15 Jul 2024 08:08:50 -0700 Subject: [PATCH 02/52] cleanup --- numba_cuda/numba/cuda/codegen.py | 12 --- numba_cuda/numba/cuda/cudadrv/nrt.py | 126 --------------------------- numba_cuda/numba/cuda/runtime/nrt.cu | 16 ---- 3 files changed, 154 deletions(-) delete mode 100644 numba_cuda/numba/cuda/cudadrv/nrt.py diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py index cd52d95..6009dcb 100644 --- a/numba_cuda/numba/cuda/codegen.py +++ b/numba_cuda/numba/cuda/codegen.py @@ -4,7 +4,6 @@ from numba.core.codegen import Codegen, CodeLibrary from .cudadrv import devices, driver, nvvm, runtime from numba.cuda.cudadrv.libs import get_cudalib -import functools import os import subprocess @@ -13,14 +12,6 @@ CUDA_TRIPLE = 'nvptx64-nvidia-cuda' -@functools.cache -def make_nrt(): - from numba.cuda import descriptor - from numba.cuda.cudadrv.nrt import compile_nrt_functions - - library = compile_nrt_functions(descriptor.cuda_target.target_context) - numba_cuda_runtime = library.get_asm_str().encode() - return numba_cuda_runtime def run_nvdisasm(cubin, flags): # nvdisasm only accepts input from a file, so we need to write out to a @@ -189,9 +180,6 @@ def get_cubin(self, cc=None): linker = driver.Linker.new(max_registers=self._max_registers, cc=cc) -# numba_cuda_runtime = make_nrt() - # linker.add_ptx(numba_cuda_runtime) - if linker.lto: ltoir = self.get_ltoir(cc=cc) linker.add_ltoir(ltoir) diff --git a/numba_cuda/numba/cuda/cudadrv/nrt.py b/numba_cuda/numba/cuda/cudadrv/nrt.py deleted file mode 100644 index 7aad320..0000000 --- a/numba_cuda/numba/cuda/cudadrv/nrt.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - -from llvmlite import ir -from numba.core import cgutils, config -from numba.core.runtime.nrtdynmod import ( - _define_atomic_cas, - _define_atomic_inc_dec, - _define_nrt_incref, - _define_nrt_meminfo_data, - _pointer_type, - incref_decref_ty, -) - - - - -def _define_nrt_decref(module, atomic_decr): - """ - Implement NRT_decref in the module - """ - fn_decref = cgutils.get_or_insert_function( - module, incref_decref_ty, "NRT_decref" - ) - # Cannot inline this for refcount pruning to work - - fn_decref.attributes.add("noinline") - calldtor = ir.Function( - module, - ir.FunctionType(ir.VoidType(), [_pointer_type, _pointer_type]), - name="NRT_MemInfo_call_dtor", - ) - - builder = ir.IRBuilder(fn_decref.append_basic_block()) - [ptr] = fn_decref.args - is_null = builder.icmp_unsigned( - "==", ptr, cgutils.get_null_value(ptr.type) - ) - with cgutils.if_unlikely(builder, is_null): - builder.ret_void() - - # For memory fence usage, see https://llvm.org/docs/Atomics.html - - # A release fence is used before the relevant write operation. - # No-op on x86. On POWER, it lowers to lwsync. 
- # builder.fence("release") - - word_ptr = builder.bitcast(ptr, atomic_decr.args[0].type) - - if config.DEBUG_NRT: - cgutils.printf( - builder, "*** NRT_Decref %zu [%p]\n", builder.load(word_ptr), ptr - ) - newrefct = builder.call( - atomic_decr, [word_ptr] # nvvmutils.atomic_dec_int64, etc - ) - - refct_eq_0 = builder.icmp_unsigned( - "==", newrefct, ir.Constant(newrefct.type, 0) - ) - with cgutils.if_unlikely(builder, refct_eq_0): - # An acquire fence is used after the relevant read operation. - # No-op on x86. On POWER, it lowers to lwsync. - # builder.fence("acquire") - builder.call( - calldtor, - [ - ptr, - ir.Constant.inttoptr( - ir.Constant(ir.IntType(64), memsys), _pointer_type - ), - ], - ) - builder.ret_void() - - -def create_nrt_module(ctx): - """ - Create an IR module defining the LLVM NRT functions. - A (IR module, library) tuple is returned. - """ - codegen = ctx.codegen() - library = codegen.create_library("nrt") - - # Implement LLVM module with atomic ops - ir_mod = library.create_ir_module("nrt_module") - - atomic_inc = _define_atomic_inc_dec(ir_mod, "add", ordering="monotonic") - atomic_dec = _define_atomic_inc_dec(ir_mod, "sub", ordering="monotonic") - _define_atomic_cas(ir_mod, ordering="monotonic") - - _define_nrt_meminfo_data(ir_mod) - _define_nrt_incref(ir_mod, atomic_inc) - _define_nrt_decref(ir_mod, atomic_dec) - - # _define_nrt_unresolved_abort(ctx, ir_mod) - return ir_mod, library - - -def compile_nrt_functions(ctx): - """ - Compile all LLVM NRT functions and return a library containing them. - The library is created using the given target context. - """ - ir_mod, library = create_nrt_module(ctx) - - library.add_ir_module(ir_mod) - library.finalize() - breakpoint() - - return library - - -# compile NRT functions written in lowering into a PTX string -#library = compile_nrt_functions(descriptor.cuda_target.target_context) -#_cuda_nrt_library = compile_nrt_functions( -# descriptor.cuda_target.target_context -#) -#numba_cuda_runtime = _cuda_nrt_library._get_ptxes()[0].encode() - -def NRT_MemSys_new(): - # TODO - pass - - -memsys = NRT_MemSys_new() -numba_cuda_runtime=None diff --git a/numba_cuda/numba/cuda/runtime/nrt.cu b/numba_cuda/numba/cuda/runtime/nrt.cu index f538d3e..6ced244 100644 --- a/numba_cuda/numba/cuda/runtime/nrt.cu +++ b/numba_cuda/numba/cuda/runtime/nrt.cu @@ -1,19 +1,3 @@ - -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ #ifndef _NRT_H #define _NRT_H From 1c3517f943c744f126bed148a806671dead5bc25 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 29 Jul 2024 06:31:15 -0700 Subject: [PATCH 03/52] enough to launch a kernel --- numba_cuda/numba/cuda/cudadrv/driver.py | 208 +++++++++++++++++++++++- 1 file changed, 207 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 875497e..7ec704d 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2596,7 +2596,14 @@ class Linker(metaclass=ABCMeta): @classmethod def new(cls, max_registers=0, lineinfo=False, cc=None): if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: - return MVCLinker(max_registers, lineinfo, cc) + # TODO: circular + from . import runtime + driver_ver, runtime_ver = driver.get_version(), runtime.get_version() + if driver_ver >= (12, 0) and runtime_ver > driver_ver: + return PatchedLinker(max_registers, lineinfo, cc) + else: + return MVCLinker(max_registers, lineinfo, cc) + elif USE_NV_BINDING: return CudaPythonLinker(max_registers, lineinfo, cc) else: @@ -2952,6 +2959,205 @@ def complete(self): cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char)) return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,))) +class LinkableCode: + """An object that can be passed in the `link` list argument to `@cuda.jit` + kernels to supply code to be linked from memory.""" + + def __init__(self, data, name=None): + self.data = data + self._name = name + + @property + def name(self): + return self._name or self.default_name + + +class PTXSource(LinkableCode): + """PTX Source code in memory""" + + kind = FILE_EXTENSION_MAP["ptx"] + default_name = "" + + +class CUSource(LinkableCode): + """CUDA C/C++ Source code in memory""" + + kind = "cu" + default_name = "" + + +class Fatbin(LinkableCode): + """A fatbin ELF in memory""" + + kind = FILE_EXTENSION_MAP["fatbin"] + default_name = "" + + +class Cubin(LinkableCode): + """A cubin ELF in memory""" + + kind = FILE_EXTENSION_MAP["cubin"] + default_name = "" + + +class Archive(LinkableCode): + """An archive of objects in memory""" + + kind = FILE_EXTENSION_MAP["a"] + default_name = "" + + +class Object(LinkableCode): + """An object file in memory""" + + kind = FILE_EXTENSION_MAP["o"] + default_name = "" + + +class LTOIR(LinkableCode): + """An LTOIR file in memory""" + + kind = "ltoir" + default_name = "" + + +class PatchedLinker(Linker): + def __init__( + self, + max_registers=None, + lineinfo=False, + cc=None, + lto=False, + additional_flags=None, + ): + try: + from pynvjitlink.api import NvJitLinker, NvJitLinkError + except ImportError as err: + raise ImportError(_MVC_ERROR_MESSAGE) from err + + if cc is None: + raise RuntimeError("PatchedLinker requires CC to be specified") + if not any(isinstance(cc, t) for t in [list, tuple]): + raise TypeError("`cc` must be a list or tuple of length 2") + + sm_ver = f"{cc[0] * 10 + cc[1]}" + arch = f"-arch=sm_{sm_ver}" + options = [arch] + if max_registers: + options.append(f"-maxrregcount={max_registers}") + if lineinfo: + options.append("-lineinfo") + if lto: + options.append("-lto") + if additional_flags is not None: + options.extend(additional_flags) + + self._linker = NvJitLinker(*options) + self.lto = lto + self.options = options + + @property + def info_log(self): + return self._linker.info_log + + @property + def error_log(self): + return self._linker.error_log + + def add_ptx(self, ptx, name=""): + self._linker.add_ptx(ptx, name) + 
+ def add_fatbin(self, fatbin, name=""): + self._linker.add_fatbin(fatbin, name) + + def add_ltoir(self, ltoir, name=""): + self._linker.add_ltoir(ltoir, name) + + def add_object(self, obj, name=""): + self._linker.add_object(obj, name) + + def add_file_guess_ext(self, path_or_code): + # Numba's add_file_guess_ext expects to always be passed a path to a + # file that it will load from the filesystem to link. We augment it + # here with the ability to provide a file from memory. + + # To maintain compatibility with the original interface, all strings + # are treated as paths in the filesystem. + if isinstance(path_or_code, str): + # Upstream numba does not yet recognize LTOIR, so handle that + # separately here. + extension = pathlib.Path(path_or_code).suffix + if extension == ".ltoir": + self.add_file(path_or_code, "ltoir") + else: + # Use Numba's logic for non-LTOIR + super().add_file_guess_ext(path_or_code) + + return + + # Otherwise, we should have been given a LinkableCode object + if not isinstance(path_or_code, LinkableCode): + raise TypeError("Expected path to file or a LinkableCode object") + + if path_or_code.kind == "cu": + self.add_cu(path_or_code.data, path_or_code.name) + else: + self.add_data(path_or_code.data, path_or_code.kind, path_or_code.name) + + def add_file(self, path, kind): + try: + with open(path, "rb") as f: + data = f.read() + except FileNotFoundError: + raise LinkerError(f"{path} not found") + + name = pathlib.Path(path).name + self.add_data(data, kind, name) + + def add_data(self, data, kind, name): + if kind == FILE_EXTENSION_MAP["cubin"]: + fn = self._linker.add_cubin + elif kind == FILE_EXTENSION_MAP["fatbin"]: + fn = self._linker.add_fatbin + elif kind == FILE_EXTENSION_MAP["a"]: + fn = self._linker.add_library + elif kind == FILE_EXTENSION_MAP["ptx"]: + return self.add_ptx(data, name) + elif kind == FILE_EXTENSION_MAP["o"]: + fn = self._linker.add_object + elif kind == "ltoir": + fn = self._linker.add_ltoir + else: + raise LinkerError(f"Don't know how to link {kind}") + + try: + fn(data, name) + except NvJitLinkError as e: + raise LinkerError from e + + def add_cu(self, cu, name): + with driver.get_active_context() as ac: + dev = driver.get_device(ac.devnum) + cc = dev.compute_capability + + ptx, log = nvrtc.compile(cu, name, cc) + + if config.DUMP_ASSEMBLY: + print(("ASSEMBLY %s" % name).center(80, "-")) + print(ptx) + print("=" * 80) + + # Link the program's PTX using the normal linker mechanism + ptx_name = os.path.splitext(name)[0] + ".ptx" + self.add_ptx(ptx.encode(), ptx_name) + + def complete(self): + try: + cubin = self._linker.get_linked_cubin() + self._linker._complete = True + return cubin + except NvJitLinkError as e: + raise LinkerError from e # ----------------------------------------------------------------------------- From cbcbbab982b5cae4940959429752f67d1f74ad7d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 29 Jul 2024 07:07:17 -0700 Subject: [PATCH 04/52] pass through kwargs --- numba_cuda/numba/cuda/cudadrv/driver.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 7ec704d..5205355 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -79,7 +79,6 @@ def make_logger(): logger.addHandler(logging.NullHandler()) return logger - class DeadMemoryError(RuntimeError): pass @@ -2594,13 +2593,13 @@ class Linker(metaclass=ABCMeta): """Abstract base class for linkers""" 
@classmethod - def new(cls, max_registers=0, lineinfo=False, cc=None): + def new(cls, max_registers=0, lineinfo=False, cc=None, **kwargs): if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: # TODO: circular from . import runtime driver_ver, runtime_ver = driver.get_version(), runtime.get_version() if driver_ver >= (12, 0) and runtime_ver > driver_ver: - return PatchedLinker(max_registers, lineinfo, cc) + return PatchedLinker(max_registers, lineinfo, cc, **kwargs) else: return MVCLinker(max_registers, lineinfo, cc) From 4406809eaf2c334ad4ce462d1d6b0426bae72c2d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 29 Jul 2024 07:08:31 -0700 Subject: [PATCH 05/52] patch_cuda once --- numba_cuda/numba/cuda/cudadrv/driver.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 5205355..cb1c89d 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2599,6 +2599,8 @@ def new(cls, max_registers=0, lineinfo=False, cc=None, **kwargs): from . import runtime driver_ver, runtime_ver = driver.get_version(), runtime.get_version() if driver_ver >= (12, 0) and runtime_ver > driver_ver: + # runs once + patch_cuda() return PatchedLinker(max_registers, lineinfo, cc, **kwargs) else: return MVCLinker(max_registers, lineinfo, cc) @@ -3020,6 +3022,18 @@ class LTOIR(LinkableCode): default_name = "" +@functools.lru_cache(maxsize=1) +def patch_cuda(): + from numba import cuda + cuda.Archive = Archive + cuda.CUSource = CUSource + cuda.Cubin = Cubin + cuda.Fatbin = Fatbin + cuda.Object = Object + cuda.PTXSource = PTXSource + cuda.LTOIR = LTOIR + + class PatchedLinker(Linker): def __init__( self, From dc887b63fbf58e8ded80e87a1bef1f87e3f8dff3 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 30 Jul 2024 08:11:26 -0700 Subject: [PATCH 06/52] refactor --- numba_cuda/numba/cuda/cudadrv/driver.py | 105 +----------------- .../numba/cuda/cudadrv/linkable_code.py | 62 +++++++++++ numba_cuda/numba/cuda/cudadrv/mappings.py | 22 ++++ numba_cuda/numba/cuda/device_init.py | 3 + 4 files changed, 92 insertions(+), 100 deletions(-) create mode 100644 numba_cuda/numba/cuda/cudadrv/linkable_code.py create mode 100644 numba_cuda/numba/cuda/cudadrv/mappings.py diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index cb1c89d..5194bba 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -35,6 +35,7 @@ from .error import CudaSupportError, CudaDriverError from .drvapi import API_PROTOTYPES from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid +from .mappings import FILE_EXTENSION_MAP from numba.cuda.cudadrv import enums, drvapi, nvrtc, _extras USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING @@ -2568,40 +2569,18 @@ def launch_kernel(cufunc_handle, extra) -if USE_NV_BINDING: - jitty = binding.CUjitInputType - FILE_EXTENSION_MAP = { - 'o': jitty.CU_JIT_INPUT_OBJECT, - 'ptx': jitty.CU_JIT_INPUT_PTX, - 'a': jitty.CU_JIT_INPUT_LIBRARY, - 'lib': jitty.CU_JIT_INPUT_LIBRARY, - 'cubin': jitty.CU_JIT_INPUT_CUBIN, - 'fatbin': jitty.CU_JIT_INPUT_FATBINARY, - } -else: - FILE_EXTENSION_MAP = { - 'o': enums.CU_JIT_INPUT_OBJECT, - 'ptx': enums.CU_JIT_INPUT_PTX, - 'a': enums.CU_JIT_INPUT_LIBRARY, - 'lib': enums.CU_JIT_INPUT_LIBRARY, - 'cubin': enums.CU_JIT_INPUT_CUBIN, - 'fatbin': enums.CU_JIT_INPUT_FATBINARY, - } - - class Linker(metaclass=ABCMeta): """Abstract base class for 
linkers""" @classmethod - def new(cls, max_registers=0, lineinfo=False, cc=None, **kwargs): + def new(cls, max_registers=0, lineinfo=False, cc=None, lto=None, additional_flags=None): if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: # TODO: circular from . import runtime driver_ver, runtime_ver = driver.get_version(), runtime.get_version() if driver_ver >= (12, 0) and runtime_ver > driver_ver: # runs once - patch_cuda() - return PatchedLinker(max_registers, lineinfo, cc, **kwargs) + return PyNvJitLinker(max_registers, lineinfo, cc, lto, additional_flags) else: return MVCLinker(max_registers, lineinfo, cc) @@ -2960,81 +2939,7 @@ def complete(self): cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char)) return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,))) -class LinkableCode: - """An object that can be passed in the `link` list argument to `@cuda.jit` - kernels to supply code to be linked from memory.""" - - def __init__(self, data, name=None): - self.data = data - self._name = name - - @property - def name(self): - return self._name or self.default_name - - -class PTXSource(LinkableCode): - """PTX Source code in memory""" - - kind = FILE_EXTENSION_MAP["ptx"] - default_name = "" - - -class CUSource(LinkableCode): - """CUDA C/C++ Source code in memory""" - - kind = "cu" - default_name = "" - - -class Fatbin(LinkableCode): - """A fatbin ELF in memory""" - - kind = FILE_EXTENSION_MAP["fatbin"] - default_name = "" - - -class Cubin(LinkableCode): - """A cubin ELF in memory""" - - kind = FILE_EXTENSION_MAP["cubin"] - default_name = "" - - -class Archive(LinkableCode): - """An archive of objects in memory""" - - kind = FILE_EXTENSION_MAP["a"] - default_name = "" - - -class Object(LinkableCode): - """An object file in memory""" - - kind = FILE_EXTENSION_MAP["o"] - default_name = "" - - -class LTOIR(LinkableCode): - """An LTOIR file in memory""" - - kind = "ltoir" - default_name = "" - - -@functools.lru_cache(maxsize=1) -def patch_cuda(): - from numba import cuda - cuda.Archive = Archive - cuda.CUSource = CUSource - cuda.Cubin = Cubin - cuda.Fatbin = Fatbin - cuda.Object = Object - cuda.PTXSource = PTXSource - cuda.LTOIR = LTOIR - - -class PatchedLinker(Linker): +class PyNvJitLinker(Linker): def __init__( self, max_registers=None, @@ -3049,7 +2954,7 @@ def __init__( raise ImportError(_MVC_ERROR_MESSAGE) from err if cc is None: - raise RuntimeError("PatchedLinker requires CC to be specified") + raise RuntimeError("PyNvJitLinker requires CC to be specified") if not any(isinstance(cc, t) for t in [list, tuple]): raise TypeError("`cc` must be a list or tuple of length 2") diff --git a/numba_cuda/numba/cuda/cudadrv/linkable_code.py b/numba_cuda/numba/cuda/cudadrv/linkable_code.py new file mode 100644 index 0000000..5280829 --- /dev/null +++ b/numba_cuda/numba/cuda/cudadrv/linkable_code.py @@ -0,0 +1,62 @@ +from .mappings import FILE_EXTENSION_MAP + +class LinkableCode: + """An object that can be passed in the `link` list argument to `@cuda.jit` + kernels to supply code to be linked from memory.""" + + def __init__(self, data, name=None): + self.data = data + self._name = name + + @property + def name(self): + return self._name or self.default_name + + +class PTXSource(LinkableCode): + """PTX Source code in memory""" + + kind = FILE_EXTENSION_MAP["ptx"] + default_name = "" + + +class CUSource(LinkableCode): + """CUDA C/C++ Source code in memory""" + + kind = "cu" + default_name = "" + + +class Fatbin(LinkableCode): + """A fatbin ELF in memory""" + + kind = FILE_EXTENSION_MAP["fatbin"] 
+ default_name = "" + + +class Cubin(LinkableCode): + """A cubin ELF in memory""" + + kind = FILE_EXTENSION_MAP["cubin"] + default_name = "" + + +class Archive(LinkableCode): + """An archive of objects in memory""" + + kind = FILE_EXTENSION_MAP["a"] + default_name = "" + + +class Object(LinkableCode): + """An object file in memory""" + + kind = FILE_EXTENSION_MAP["o"] + default_name = "" + + +class LTOIR(LinkableCode): + """An LTOIR file in memory""" + + kind = "ltoir" + default_name = "" diff --git a/numba_cuda/numba/cuda/cudadrv/mappings.py b/numba_cuda/numba/cuda/cudadrv/mappings.py new file mode 100644 index 0000000..3750324 --- /dev/null +++ b/numba_cuda/numba/cuda/cudadrv/mappings.py @@ -0,0 +1,22 @@ +from numba import config +from cuda import cuda +from . import enums +if config.CUDA_USE_NVIDIA_BINDING: + jitty = cuda.CUjitInputType + FILE_EXTENSION_MAP = { + 'o': jitty.CU_JIT_INPUT_OBJECT, + 'ptx': jitty.CU_JIT_INPUT_PTX, + 'a': jitty.CU_JIT_INPUT_LIBRARY, + 'lib': jitty.CU_JIT_INPUT_LIBRARY, + 'cubin': jitty.CU_JIT_INPUT_CUBIN, + 'fatbin': jitty.CU_JIT_INPUT_FATBINARY, + } +else: + FILE_EXTENSION_MAP = { + 'o': enums.CU_JIT_INPUT_OBJECT, + 'ptx': enums.CU_JIT_INPUT_PTX, + 'a': enums.CU_JIT_INPUT_LIBRARY, + 'lib': enums.CU_JIT_INPUT_LIBRARY, + 'cubin': enums.CU_JIT_INPUT_CUBIN, + 'fatbin': enums.CU_JIT_INPUT_FATBINARY, + } diff --git a/numba_cuda/numba/cuda/device_init.py b/numba_cuda/numba/cuda/device_init.py index 9df5ae9..e435290 100644 --- a/numba_cuda/numba/cuda/device_init.py +++ b/numba_cuda/numba/cuda/device_init.py @@ -31,6 +31,9 @@ shfl_xor_sync) from .kernels import reduction +from numba.cuda.cudadrv.linkable_code import ( + Archive, CUSource, Cubin, Fatbin, LinkableCode, LTOIR, Object, PTXSource +) reduce = Reduce = reduction.Reduce From b9898ec3a62f52602d66127cad68eec52a3cbd51 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Sun, 4 Aug 2024 10:48:52 -0700 Subject: [PATCH 07/52] merge latest/resolve conflicts --- .flake8 | 52 + .github/actions/compute-matrix/action.yaml | 7 - .github/workflows/build.yaml | 59 -- .github/workflows/conda-python-build.yaml | 125 +++ .github/workflows/conda-python-tests.yaml | 155 +++ .github/workflows/docs-build.yaml | 77 ++ .github/workflows/pr.yaml | 23 +- .github/workflows/publish.yaml | 44 + .github/workflows/wheels-build.yaml | 174 ++++ .github/workflows/wheels-test.yaml | 164 ++++ .pre-commit-config.yaml | 6 +- CMakeLists.txt | 18 - ci/build_conda.sh | 7 +- ci/build_docs.sh | 41 + ci/build_wheel.sh | 7 +- ci/test_conda.sh | 10 +- ci/test_wheel.sh | 8 +- ci/upload_conda.sh | 14 + .../{numba_cuda => numba-cuda}/meta.yaml | 10 +- numba_cuda/VERSION | 2 +- numba_cuda/numba/cuda/api.py | 3 +- numba_cuda/numba/cuda/cudadrv/_extras.c | 49 - numba_cuda/numba/cuda/cudadrv/devicearray.py | 3 +- numba_cuda/numba/cuda/cudadrv/driver.py | 31 +- numba_cuda/numba/cuda/cudadrv/drvapi.py | 12 +- numba_cuda/numba/cuda/cudadrv/dummyarray.py | 452 +++++++++ numba_cuda/numba/cuda/deviceufunc.py | 908 ++++++++++++++++++ numba_cuda/numba/cuda/runtime/nrt.cu | 107 +++ .../cuda/simulator/cudadrv/dummyarray.py | 4 + numba_cuda/numba/cuda/target.py | 1 + .../tests/cudapy/test_gufunc_scheduling.py | 2 +- .../numba/cuda/tests/cudapy/test_ipc.py | 8 +- .../cuda/tests/nocuda/test_dummyarray.py | 359 +++++++ numba_cuda/numba/cuda/vectorizers.py | 6 +- pyproject.toml | 32 +- setup.py | 70 ++ site-packages/_numba_cuda_redirector.py | 6 +- 37 files changed, 2832 insertions(+), 224 deletions(-) create mode 100644 .flake8 delete mode 100644 
.github/workflows/build.yaml create mode 100644 .github/workflows/conda-python-build.yaml create mode 100644 .github/workflows/conda-python-tests.yaml create mode 100644 .github/workflows/docs-build.yaml create mode 100644 .github/workflows/publish.yaml create mode 100644 .github/workflows/wheels-build.yaml create mode 100644 .github/workflows/wheels-test.yaml delete mode 100644 CMakeLists.txt create mode 100755 ci/build_docs.sh create mode 100755 ci/upload_conda.sh rename conda/recipes/{numba_cuda => numba-cuda}/meta.yaml (84%) delete mode 100644 numba_cuda/numba/cuda/cudadrv/_extras.c create mode 100644 numba_cuda/numba/cuda/cudadrv/dummyarray.py create mode 100644 numba_cuda/numba/cuda/deviceufunc.py create mode 100644 numba_cuda/numba/cuda/runtime/nrt.cu create mode 100644 numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py create mode 100644 numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py create mode 100644 setup.py diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..591e4a2 --- /dev/null +++ b/.flake8 @@ -0,0 +1,52 @@ +[flake8] +ignore = + # Extra space in brackets + E20, + # Multiple spaces around "," + E231,E241, + # Comments + E26, + # Assigning lambda expression + E731, + # Ambiguous variable names + E741, + # line break before binary operator + W503, + # line break after binary operator + W504, +max-line-length = 80 + +exclude = + __pycache__ + .git + *.pyc + *~ + *.o + *.so + *.cpp + *.c + *.h + +per-file-ignores = + # Slightly long line in the standard version file + numba_cuda/_version.py: E501 + # "Unused" imports / potentially undefined names in init files + numba_cuda/numba/cuda/__init__.py:F401,F403,F405 + numba_cuda/numba/cuda/simulator/__init__.py:F401,F403 + numba_cuda/numba/cuda/simulator/cudadrv/__init__.py:F401 + # Ignore star imports, unused imports, and "may be defined by star imports" + # errors in device_init because its purpose is to bring together a lot of + # the public API to be star-imported in numba.cuda.__init__ + numba_cuda/numba/cuda/device_init.py:F401,F403,F405 + # libdevice.py is an autogenerated file containing stubs for all the device + # functions. Some of the lines in docstrings are a little over-long, as they + # contain the URLs of the reference pages in the online libdevice + # documentation. 
+ numba_cuda/numba/cuda/libdevice.py:E501 + # Ignore too-long lines in the doc examples, prioritising readability + # in the docs over line length in the example source (especially given that + # the test code is already indented by 8 spaces) + numba_cuda/numba/cuda/tests/doc_examples/test_random.py:E501 + numba_cuda/numba/cuda/tests/doc_examples/test_cg.py:E501 + numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py:E501 + numba_cuda/numba/tests/doc_examples/test_interval_example.py:E501 diff --git a/.github/actions/compute-matrix/action.yaml b/.github/actions/compute-matrix/action.yaml index dab011d..3dd73bc 100644 --- a/.github/actions/compute-matrix/action.yaml +++ b/.github/actions/compute-matrix/action.yaml @@ -16,14 +16,7 @@ runs: set -eo pipefail export BUILD_MATRIX=" - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8' } - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'rockylinux8' } - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'rockylinux8' } - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'rockylinux8' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.9', LINUX_VER: 'rockylinux8' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.10', LINUX_VER: 'rockylinux8' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'rockylinux8' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'rockylinux8' } " export TEST_MATRIX=" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml deleted file mode 100644 index 00458da..0000000 --- a/.github/workflows/build.yaml +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -name: build - -on: - push: - tags: - - 'v*' - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-release - cancel-in-progress: true - -jobs: - compute-matrix: - runs-on: ubuntu-latest - outputs: - BUILD_MATRIX: ${{ steps.compute-matrix.outputs.BUILD_MATRIX }} - steps: - - uses: actions/checkout@v4 - - name: Compute Build Matrix - id: compute-matrix - uses: ./.github/actions/compute-matrix - build-wheels: - needs: - - compute-matrix - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 - with: - build_type: branch - script: "ci/build_wheel.sh" - matrix_filter: ${{ needs.compute-matrix.outputs.BUILD_MATRIX }} - build-conda: - needs: - - compute-matrix - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 - with: - build_type: branch - script: "ci/build_conda.sh" - matrix_filter: ${{ needs.compute-matrix.outputs.BUILD_MATRIX }} - publish-wheels: - needs: - - build-wheels - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 - with: - build_type: ${{ inputs.build_type || 'branch' }} - branch: ${{ inputs.branch }} - sha: ${{ inputs.sha }} - date: ${{ inputs.date }} - package-name: numba_cuda - publish-conda: - needs: - - build-conda - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 - with: - build_type: ${{ inputs.build_type || 'branch' }} - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} diff --git a/.github/workflows/conda-python-build.yaml b/.github/workflows/conda-python-build.yaml new file mode 100644 index 0000000..75e1028 --- /dev/null +++ b/.github/workflows/conda-python-build.yaml @@ -0,0 +1,125 @@ +on: + workflow_call: + inputs: + build_type: + required: true + type: string + branch: + 
type: string + date: + type: string + sha: + type: string + repo: + type: string + node_type: + type: string + default: "cpu8" + script: + type: string + default: "ci/build_python.sh" + matrix_filter: + type: string + default: "." + upload_to_anaconda: + type: boolean + required: false + +defaults: + run: + shell: bash + +permissions: + actions: read + checks: none + contents: read + deployments: none + discussions: none + id-token: write + issues: none + packages: read + pages: none + pull-requests: read + repository-projects: none + security-events: none + statuses: none + +jobs: + compute-matrix: + runs-on: ubuntu-latest + outputs: + MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} + steps: + - name: Compute Build Matrix + id: compute-matrix + run: | + set -eo pipefail + + # please keep the matrices sorted in ascending order by the following: + # + # [ARCH, PY_VER, CUDA_VER, LINUX_VER] + # + export MATRIX=" + # amd64 + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.2.2', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04' } + # arm64 + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.2.2', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04' } + " + + MATRIX="$( + yq -n -o json 'env(MATRIX)' | \ + jq -c '${{ inputs.matrix_filter }} | if (. 
| length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end' + )" + + echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" + build: + needs: compute-matrix + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} + runs-on: "linux-${{ matrix.ARCH }}-${{ inputs.node_type }}" + env: + RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts + container: + image: rapidsai/ci-conda:cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }} + env: + RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} + steps: + - uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 + - name: Standardize repository information + run: | + echo "RAPIDS_REPOSITORY=${{ inputs.repo || github.repository }}" >> "${GITHUB_ENV}" + echo "RAPIDS_SHA=$(git rev-parse HEAD)" >> "${GITHUB_ENV}" + echo "RAPIDS_REF_NAME=${{ inputs.branch || github.ref_name }}" >> "${GITHUB_ENV}" + echo "RAPIDS_NIGHTLY_DATE=${{ inputs.date }}" >> "${GITHUB_ENV}" + - name: Python build + run: ${{ inputs.script }} + env: + GH_TOKEN: ${{ github.token }} + - name: Upload conda repo + if: "!cancelled()" + uses: actions/upload-artifact@v4 + with: + name: conda-repo + path: "/tmp/conda-bld-output" + - name: Publish conda package + if: inputs.upload_to_anaconda + run: "ci/upload_conda.sh" + env: + CONDA_TOKEN: ${{ secrets.NVIDIA_CONDA_TOKEN }} diff --git a/.github/workflows/conda-python-tests.yaml b/.github/workflows/conda-python-tests.yaml new file mode 100644 index 0000000..ba9390e --- /dev/null +++ b/.github/workflows/conda-python-tests.yaml @@ -0,0 +1,155 @@ +on: + workflow_call: + inputs: + build_type: + required: true + type: string + branch: + type: string + date: + type: string + sha: + type: string + repo: + type: string + script: + type: string + default: "ci/test_python.sh" + run_codecov: + type: boolean + default: true + matrix_filter: + type: string + default: "." + container-options: + required: false + type: string + default: "-e _NOOP" + +defaults: + run: + shell: bash + +permissions: + actions: read + checks: none + contents: read + deployments: none + discussions: none + id-token: write + issues: none + packages: read + pages: none + pull-requests: read + repository-projects: none + security-events: none + statuses: none + +jobs: + compute-matrix: + runs-on: ubuntu-latest + env: + BUILD_TYPE: ${{ inputs.build_type }} + outputs: + MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} + steps: + - name: Validate Test Type + run: | + if [[ "$BUILD_TYPE" != "pull-request" ]] && [[ "$BUILD_TYPE" != "nightly" ]]; then + echo "Invalid build type! Must be 'nightly' or 'pull-request'." 
+ exit 1 + fi + - name: Compute Python Test Matrix + id: compute-matrix + run: | + set -eo pipefail + + # please keep the matrices sorted in ascending order by the following: + # + # [ARCH, PY_VER, CUDA_VER, LINUX_VER, GPU, DRIVER] + # + export MATRICES=" + pull-request: + # amd64 + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8', GPU: 'v100', DRIVER: 'earliest' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04', GPU: 'v100', DRIVER: 'latest' } + # arm64 + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.0.1', LINUX_VER: 'ubuntu20.04', GPU: 'a100', DRIVER: 'latest' } + nightly: + # amd64 + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.4.3', LINUX_VER: 'rockylinux8', GPU: 'v100', DRIVER: 'earliest' } + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.4.3', LINUX_VER: 'ubuntu20.04', GPU: 'v100', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04', GPU: 'v100', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.0.1', LINUX_VER: 'rockylinux8', GPU: 'v100', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04', GPU: 'v100', DRIVER: 'latest' } + # arm64 + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '11.4.3', LINUX_VER: 'ubuntu20.04', GPU: 'a100', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.0.1', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.2.2', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'latest' } + " + + TEST_MATRIX=$(yq -n 'env(MATRICES) | .[strenv(BUILD_TYPE)]') + export TEST_MATRIX + + MATRIX="$( + yq -n -o json 'env(TEST_MATRIX)' | \ + jq -c '${{ inputs.matrix_filter }} | if (. 
| length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end' + )" + + echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" + tests: + needs: compute-matrix + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} + runs-on: "linux-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-1" + env: + RAPIDS_COVERAGE_DIR: ${{ github.workspace }}/coverage-results + RAPIDS_TESTS_DIR: ${{ github.workspace }}/test-results + RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts + container: + image: rapidsai/ci-conda:cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }} + options: ${{ inputs.container-options }} + env: + RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + steps: + - uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 + - uses: actions/download-artifact@v4 + name: conda-repo + - name: Display structure of downloaded files + run: ls -R + - name: Standardize repository information + run: | + echo "RAPIDS_REPOSITORY=${{ inputs.repo || github.repository }}" >> "${GITHUB_ENV}" + echo "RAPIDS_SHA=$(git rev-parse HEAD)" >> "${GITHUB_ENV}" + echo "RAPIDS_REF_NAME=${{ inputs.branch || github.ref_name }}" >> "${GITHUB_ENV}" + echo "RAPIDS_NIGHTLY_DATE=${{ inputs.date }}" >> "${GITHUB_ENV}" + - name: Python tests + run: ${{ inputs.script }} + env: + GH_TOKEN: ${{ github.token }} + - name: Generate test report + uses: test-summary/action@v2.3 + with: + paths: "${{ env.RAPIDS_TESTS_DIR }}/*.xml" + if: always() + - name: Run codecov + if: inputs.run_codecov + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + run: | + codecovcli \ + -v \ + upload-process \ + -C ${{ github.sha }} \ + -s "${RAPIDS_COVERAGE_DIR}" \ + --handle-no-reports-found + - name: Upload additional artifacts + if: "!cancelled()" + run: rapids-upload-artifacts-dir cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch)_py${RAPIDS_PY_VERSION//.} diff --git a/.github/workflows/docs-build.yaml b/.github/workflows/docs-build.yaml new file mode 100644 index 0000000..476e5b2 --- /dev/null +++ b/.github/workflows/docs-build.yaml @@ -0,0 +1,77 @@ +on: + workflow_call: + inputs: + build_type: + required: true + type: string + branch: + type: string + date: + type: string + sha: + type: string + repo: + type: string + node_type: + type: string + default: "cpu8" + script: + type: string + default: "ci/build_docs.sh" + +defaults: + run: + shell: bash + +permissions: + actions: read + checks: none + contents: read + deployments: none + discussions: none + id-token: write + issues: none + packages: read + pages: none + pull-requests: read + repository-projects: none + security-events: none + statuses: none + +jobs: + build: + strategy: + fail-fast: false + runs-on: "linux-amd64-cpu4" + env: + RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts + container: + image: rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11 + env: + RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} + steps: + - uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 + - uses: actions/download-artifact@v4 + name: conda-repo + - name: Display structure of downloaded files + run: ls -R + - name: Standardize repository information + run: | + echo "RAPIDS_REPOSITORY=${{ inputs.repo || github.repository }}" >> "${GITHUB_ENV}" + echo "RAPIDS_SHA=$(git rev-parse HEAD)" >> "${GITHUB_ENV}" + echo "RAPIDS_REF_NAME=${{ inputs.branch || 
github.ref_name }}" >> "${GITHUB_ENV}" + echo "RAPIDS_NIGHTLY_DATE=${{ inputs.date }}" >> "${GITHUB_ENV}" + - name: Docs build + run: ${{ inputs.script }} + env: + GH_TOKEN: ${{ github.token }} + - name: Upload docs + if: "!cancelled()" + uses: actions/upload-artifact@v4 + with: + name: docs + path: ${{ github.workspace }}/docs/build/html diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 9adf087..7498eb1 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -1,4 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + name: pr on: @@ -19,11 +21,12 @@ jobs: - test-conda - build-wheels - test-wheels + - build-docs secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 with: enable_check_generated_files: false compute-matrix: @@ -39,7 +42,7 @@ jobs: build-conda: needs: - compute-matrix - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: ./.github/workflows/conda-python-build.yaml with: build_type: pull-request script: "ci/build_conda.sh" @@ -48,7 +51,7 @@ jobs: needs: - build-conda - compute-matrix - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: ./.github/workflows/conda-python-tests.yaml with: build_type: pull-request script: "ci/test_conda.sh" @@ -57,7 +60,7 @@ jobs: build-wheels: needs: - compute-matrix - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: ./.github/workflows/wheels-build.yaml with: build_type: pull-request script: "ci/build_wheel.sh" @@ -66,8 +69,14 @@ jobs: needs: - build-wheels - compute-matrix - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: ./.github/workflows/wheels-test.yaml with: build_type: pull-request script: "ci/test_wheel.sh" matrix_filter: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} + build-docs: + needs: + - build-conda + uses: ./.github/workflows/docs-build.yaml + with: + build_type: pull-request diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml new file mode 100644 index 0000000..986a693 --- /dev/null +++ b/.github/workflows/publish.yaml @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +name: Publish packages + +on: + push: + tags: + - 'v*' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-release + cancel-in-progress: true + + +jobs: + compute-matrix: + runs-on: ubuntu-latest + outputs: + BUILD_MATRIX: ${{ steps.compute-matrix.outputs.BUILD_MATRIX }} + TEST_MATRIX: ${{ steps.compute-matrix.outputs.TEST_MATRIX }} + steps: + - uses: actions/checkout@v4 + - name: Compute Build Matrix + id: compute-matrix + uses: ./.github/actions/compute-matrix + build-conda: + needs: compute-matrix + uses: ./.github/workflows/conda-python-build.yaml + secrets: inherit + with: + build_type: release + script: "ci/build_conda.sh" + matrix_filter: ${{ needs.compute-matrix.outputs.BUILD_MATRIX }} + upload_to_anaconda: true + build-wheels: + needs: compute-matrix + uses: ./.github/workflows/wheels-build.yaml + secrets: inherit + with: + build_type: release + script: "ci/build_wheel.sh" + matrix_filter: ${{ needs.compute-matrix.outputs.BUILD_MATRIX }} + upload_to_pypi: true diff --git a/.github/workflows/wheels-build.yaml b/.github/workflows/wheels-build.yaml new file mode 100644 index 0000000..c1d20ec --- /dev/null +++ b/.github/workflows/wheels-build.yaml @@ -0,0 +1,174 @@ +name: Build RAPIDS wheels + +on: + workflow_call: + inputs: + # repo and branch + repo: + type: string + branch: + type: string + date: + type: string + sha: + type: string + build_type: + required: true + type: string + script: + required: true + type: string + + # allow a bigger runner instance + node_type: + required: false + type: string + default: "cpu8" + + # general settings + matrix_filter: + type: string + default: "." + + # Extra repository that will be cloned into the project directory. + extra-repo: + required: false + type: string + default: '' + extra-repo-sha: + required: false + type: string + default: '' + # Note that this is the _name_ of a secret containing the key, not the key itself. 
+ extra-repo-deploy-key: + required: false + type: string + default: '' + + upload_to_pypi: + type: boolean + required: false + +defaults: + run: + shell: bash + +permissions: + actions: read + checks: none + contents: read + deployments: none + discussions: none + id-token: write + issues: none + packages: read + pages: none + pull-requests: read + repository-projects: none + security-events: none + statuses: none + +jobs: + + compute-matrix: + runs-on: ubuntu-latest + outputs: + MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} + steps: + - name: Compute Build Matrix + id: compute-matrix + run: | + set -eo pipefail + + # please keep the matrices sorted in ascending order by the following: + # + # [ARCH, PY_VER, CUDA_VER, LINUX_VER] + # + export MATRIX=" + # amd64 + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.5.1', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.5.1', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.5.1', LINUX_VER: 'rockylinux8' } + # arm64 + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '12.5.1', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.5.1', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.5.1', LINUX_VER: 'rockylinux8' } + " + + MATRIX="$( + yq -n -o json 'env(MATRIX)' | \ + jq -c '${{ inputs.matrix_filter }} | if (. 
| length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end' + )" + + echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" + build: + name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, ${{ matrix.ARCH }}, ${{ matrix.LINUX_VER }} + needs: [compute-matrix] + strategy: + matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} + runs-on: "linux-${{ matrix.ARCH }}-${{ inputs.node_type }}" + env: + RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts + container: + image: "rapidsai/ci-wheel:cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }}" + env: + RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} + + steps: + - name: checkout code repo + uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 # unshallow fetch for setuptools-scm + persist-credentials: false + + - name: Standardize repository information + uses: rapidsai/shared-actions/rapids-github-info@main + with: + repo: ${{ inputs.repo }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + + - name: Preprocess extra repos + id: preprocess-extras + if: ${{ inputs.extra-repo != '' }} + run: | + EXTRA_REPO_PATH=$(echo ${{ inputs.extra-repo }} | cut -d "/" -f 2) + echo "EXTRA_REPO_PATH=${EXTRA_REPO_PATH}" >> $GITHUB_OUTPUT + + - name: checkout extra repos + uses: actions/checkout@v4 + if: ${{ inputs.extra-repo != '' }} + with: + repository: ${{ inputs.extra-repo }} + ref: ${{ inputs.extra-repo-sha }} + path: "./${{ steps.preprocess-extras.outputs.EXTRA_REPO_PATH }}" + ssh-key: ${{ secrets[inputs.extra-repo-deploy-key] }} + persist-credentials: false + + - name: Build and repair the wheel + run: | + ${{ inputs.script }} + env: + GH_TOKEN: ${{ github.token }} + # Use a shell that loads the rc file so that we get the compiler settings + shell: bash -leo pipefail {0} + - name: Upload wheel + if: "!cancelled()" + uses: actions/upload-artifact@v4 + with: + name: wheel + path: ${{ env.package_path }} + - name: Publish wheel + if: inputs.upload_to_pypi + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.RAPIDSAI_PYPI_TOKEN }} diff --git a/.github/workflows/wheels-test.yaml b/.github/workflows/wheels-test.yaml new file mode 100644 index 0000000..10acf54 --- /dev/null +++ b/.github/workflows/wheels-test.yaml @@ -0,0 +1,164 @@ +name: Test RAPIDS wheels + +on: + workflow_call: + inputs: + # repo and branch + repo: + type: string + branch: + type: string + date: + type: string + sha: + type: string + build_type: + required: true + type: string + script: + type: string + default: "ci/test_wheel.sh" + matrix_filter: + type: string + default: "." + container-options: + required: false + type: string + default: "-e _NOOP" + test_summary_show: + required: false + type: string + default: "fail" + # the use of secrets in shared-workflows is discouraged, especially for public repositories. + # these values were added for situations where the use of secrets is unavoidable. 
+ secrets: + RAPIDS_AUX_SECRET_1: + required: false + +defaults: + run: + shell: bash + +permissions: + actions: read + checks: none + contents: read + deployments: none + discussions: none + id-token: write + issues: none + packages: read + pages: none + pull-requests: read + repository-projects: none + security-events: none + statuses: none + +jobs: + compute-matrix: + runs-on: ubuntu-latest + env: + BUILD_TYPE: ${{ inputs.build_type }} + outputs: + MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} + steps: + - name: Validate test type + run: | + if [[ "$BUILD_TYPE" != "pull-request" ]] && [[ "$BUILD_TYPE" != "nightly" ]]; then + echo "Invalid build type! Must be 'nightly' or 'pull-request'." + exit 1 + fi + - name: Compute test matrix + id: compute-matrix + run: | + set -eo pipefail + + # please keep the matrices sorted in ascending order by the following: + # + # [ARCH, PY_VER, CUDA_VER, LINUX_VER, GPU, DRIVER] + # + export MATRICES=" + pull-request: + # amd64 + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04', gpu: 'v100', driver: 'latest' } + # arm64 + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu20.04', gpu: 'a100', driver: 'latest' } + nightly: + # amd64 + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu20.04', gpu: 'v100', driver: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04', gpu: 'v100', driver: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04', gpu: 'v100', driver: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.0.1', LINUX_VER: 'ubuntu20.04', gpu: 'v100', driver: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.0.1', LINUX_VER: 'ubuntu20.04', gpu: 'v100', driver: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04', gpu: 'v100', driver: 'latest' } + # arm64 + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu20.04', gpu: 'a100', driver: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.0.1', LINUX_VER: 'ubuntu20.04', gpu: 'a100', driver: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04', gpu: 'a100', driver: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04', gpu: 'a100', driver: 'latest' } + " + + TEST_MATRIX=$(yq -n 'env(MATRICES) | .[strenv(BUILD_TYPE)]') + export TEST_MATRIX + + MATRIX="$( + yq -n -o json 'env(TEST_MATRIX)' | \ + jq -c '${{ inputs.matrix_filter }} | if (. 
| length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end' + )" + + echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" + test: + name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, ${{ matrix.ARCH }}, ${{ matrix.LINUX_VER }}, ${{ matrix.gpu }} + needs: compute-matrix + env: + RAPIDS_TESTS_DIR: ${{ github.workspace }}/test-results + RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} + runs-on: "linux-${{ matrix.ARCH }}-gpu-${{ matrix.gpu }}-${{ matrix.driver }}-1" + container: + image: "rapidsai/citestwheel:cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }}" + options: ${{ inputs.container-options }} + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} # GPU jobs must set this container env variable + RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} + steps: + - name: Run nvidia-smi to make sure GPU is working + run: nvidia-smi + + - name: checkout code repo + uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 # unshallow fetch for setuptools-scm + persist-credentials: false + - uses: actions/download-artifact@v4 + name: wheel + - name: Display structure of downloaded files + run: ls -R + - name: Standardize repository information + uses: rapidsai/shared-actions/rapids-github-info@main + with: + repo: ${{ inputs.repo }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + + - name: Run tests + run: ${{ inputs.script }} + env: + GH_TOKEN: ${{ github.token }} + RAPIDS_AUX_SECRET_1: ${{ secrets.RAPIDS_AUX_SECRET_1 }} + + - name: Generate test report + uses: test-summary/action@v2.3 + with: + paths: "${{ env.RAPIDS_TESTS_DIR }}/*.xml" + show: ${{ inputs.test_summary_show }} + if: always() + + - name: Upload additional artifacts + if: "!cancelled()" + run: rapids-upload-artifacts-dir cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch)_py${RAPIDS_PY_VERSION//.} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4e6a92c..0a114cd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1 +1,5 @@ -repos: [] +repos: +- repo: https://github.com/PyCQA/flake8 + rev: 7.1.0 + hooks: + - id: flake8 diff --git a/CMakeLists.txt b/CMakeLists.txt deleted file mode 100644 index acd6ac1..0000000 --- a/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) - -project( - numba_cuda - VERSION ${SKBUILD_PROJECT_VERSION} - LANGUAGES C -) - -find_package(Python COMPONENTS Interpreter Development REQUIRED) - -Python_add_library(_extras MODULE numba_cuda/numba/cuda/cudadrv/_extras.c WITH_SOABI) - -target_compile_options(_extras PRIVATE -Werror -Wall) - -install(TARGETS _extras LIBRARY DESTINATION numba_cuda/numba/cuda/cudadrv) -install(FILES site-packages/_numba_cuda_redirector.py - site-packages/_numba_cuda_redirector.pth - DESTINATION ${SKBUILD_PLATLIB_DIR}) diff --git a/ci/build_conda.sh b/ci/build_conda.sh index 6c170eb..c4b86be 100755 --- a/ci/build_conda.sh +++ b/ci/build_conda.sh @@ -9,12 +9,11 @@ source rapids-configure-sccache source rapids-date-string -export CMAKE_GENERATOR=Ninja - rapids-print-env rapids-logger "Begin py build" -rapids-conda-retry mambabuild conda/recipes/numba_cuda +rapids-conda-retry mambabuild conda/recipes/numba-cuda -rapids-upload-conda-to-s3 python +package_path=(/tmp/conda-bld-output/noarch/numba-cuda-*.tar.bz2) +echo "package_path=$package_path" >> $GITHUB_ENV diff 
--git a/ci/build_docs.sh b/ci/build_docs.sh new file mode 100755 index 0000000..9742c8b --- /dev/null +++ b/ci/build_docs.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION + +set -euo pipefail + +. /opt/conda/etc/profile.d/conda.sh + +rapids-logger "Install docs dependencies" +# TODO: Replace with rapids-dependency-file-generator +rapids-mamba-retry create -n docs \ + make \ + psutil \ + sphinx \ + sphinx_rtd_theme \ + numpydoc \ + python=${RAPIDS_PY_VERSION} + +# Temporarily allow unbound variables for conda activation. +set +u +conda activate docs +set -u + +rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda + +rapids-print-env + +rapids-logger "Show Numba system info" +python -m numba --sysinfo + +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + +rapids-logger "Build docs" +pushd docs +make html + +popd + +rapids-logger "Test script exiting with value: $EXITCODE" +exit ${EXITCODE} diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 688431e..34930f1 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -7,7 +7,6 @@ rapids-logger "Build wheel" mkdir -p ./dist python -m pip wheel . --wheel-dir=./dist -vvv --disable-pip-version-check --no-deps -python -m auditwheel repair -w ./final_dist ./dist/* - -rapids-logger "Upload Wheel" -RAPIDS_PY_WHEEL_NAME="numba_cuda" rapids-upload-wheels-to-s3 ./final_dist +package_path=$(realpath ./dist/numba_cuda-*.whl) +echo "Package path: $package_path" +echo "package_path=$package_path" >> $GITHUB_ENV diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 702cbcd..5861128 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -10,11 +10,10 @@ rapids-logger "Install testing dependencies" rapids-mamba-retry create -n test \ c-compiler \ cxx-compiler \ - cuda-nvcc \ + cuda-nvcc-impl \ cuda-nvrtc \ cuda-python \ cuda-version=${RAPIDS_CUDA_VERSION%.*} \ - "numba>=0.59.1" \ make \ psutil \ pytest \ @@ -25,17 +24,14 @@ set +u conda activate test set -u -PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) +rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda + RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" pushd "${RAPIDS_TESTS_DIR}" rapids-print-env -rapids-mamba-retry install \ - --channel "${PYTHON_CHANNEL}" \ - numba_cuda - rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 149455a..5ffd014 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -6,16 +6,14 @@ set -euo pipefail rapids-logger "Install testing dependencies" # TODO: Replace with rapids-dependency-file-generator python -m pip install \ - "numba>=0.59.1" \ psutil \ cuda-python \ pytest -rapids-logger "Download Wheel" -RAPIDS_PY_WHEEL_NAME="numba_cuda" rapids-download-wheels-from-s3 ./dist/ - rapids-logger "Install wheel" -python -m pip install $(echo ./dist/numba_cuda*.whl) +package=$(realpath wheel/numba_cuda*.whl) +echo "Package path: $package" +python -m pip install $package rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/upload_conda.sh b/ci/upload_conda.sh new file mode 100755 index 0000000..1ac266e --- /dev/null +++ b/ci/upload_conda.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +PKGS_TO_UPLOAD=$(rapids-find-anaconda-uploads.py $RAPIDS_CONDA_BLD_OUTPUT_DIR) + +rapids-retry anaconda \ + -t $CONDA_TOKEN \ + upload \ + --skip-existing \ + --no-progress \ + ${PKGS_TO_UPLOAD} diff --git a/conda/recipes/numba_cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml similarity index 84% rename from conda/recipes/numba_cuda/meta.yaml rename to conda/recipes/numba-cuda/meta.yaml index d510fae..a74ae34 100644 --- a/conda/recipes/numba_cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -9,26 +9,22 @@ {% set project_urls = project_data["urls"] %} package: - name: numba_cuda + name: numba-cuda version: {{ version }} source: path: ../../.. build: + noarch: python script: - {{ PYTHON }} -m pip install . -vv requirements: - build: - - {{ compiler('c') }} - - cmake >=3.26.4 - - ninja - - sysroot_{{ target_platform }} 2.17 host: - python - pip - - scikit-build-core + - setuptools run: - python - numba >=0.59.1 diff --git a/numba_cuda/VERSION b/numba_cuda/VERSION index 8acdd82..43b2961 100644 --- a/numba_cuda/VERSION +++ b/numba_cuda/VERSION @@ -1 +1 @@ -0.0.1 +0.0.13 diff --git a/numba_cuda/numba/cuda/api.py b/numba_cuda/numba/cuda/api.py index 4384749..5dfe7c4 100644 --- a/numba_cuda/numba/cuda/api.py +++ b/numba_cuda/numba/cuda/api.py @@ -239,7 +239,8 @@ def open_ipc_array(handle, shape, dtype, strides=None, offset=0): driver_handle = driver.binding.CUipcMemHandle() driver_handle.reserved = handle else: - driver_handle = driver.drvapi.cu_ipc_mem_handle(*handle) + driver_handle = driver.drvapi.cu_ipc_mem_handle() + driver_handle.reserved[:] = handle # use *IpcHandle* to open the IPC memory ipchandle = driver.IpcHandle(None, driver_handle, size, offset=offset) yield ipchandle.open_array(current_context(), shape=shape, diff --git a/numba_cuda/numba/cuda/cudadrv/_extras.c b/numba_cuda/numba/cuda/cudadrv/_extras.c deleted file mode 100644 index 8652083..0000000 --- a/numba_cuda/numba/cuda/cudadrv/_extras.c +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Helper binding to call some CUDA Runtime API that cannot be directly - * encoded using ctypes. 
- */ - -#include "Python.h" - -#define CUDA_IPC_HANDLE_SIZE 64 - -typedef int CUresult; -typedef void* CUdeviceptr; - -typedef struct CUipcMemHandle_st{ - char reserved[CUDA_IPC_HANDLE_SIZE]; -} CUipcMemHandle; - -typedef CUresult (*cuIpcOpenMemHandle_t)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int flags ); - -static -cuIpcOpenMemHandle_t cuIpcOpenMemHandle = 0; - -static -void set_cuIpcOpenMemHandle(void* fnptr) -{ - cuIpcOpenMemHandle = (cuIpcOpenMemHandle_t)fnptr; -} - -static -CUresult call_cuIpcOpenMemHandle(CUdeviceptr* pdptr, CUipcMemHandle* handle, unsigned int flags) -{ - return cuIpcOpenMemHandle(pdptr, *handle, flags); -} - - -PyMODINIT_FUNC PyInit__extras(void) { - static struct PyModuleDef moduledef = { - PyModuleDef_HEAD_INIT, "_extras", "No docs", -1, NULL, NULL, NULL, NULL, NULL - }; - - PyObject *m = PyModule_Create(&moduledef); - - if (m == NULL) - return NULL; - - PyModule_AddObject(m, "set_cuIpcOpenMemHandle", PyLong_FromVoidPtr(&set_cuIpcOpenMemHandle)); - PyModule_AddObject(m, "call_cuIpcOpenMemHandle", PyLong_FromVoidPtr(&call_cuIpcOpenMemHandle)); - PyModule_AddIntConstant(m, "CUDA_IPC_HANDLE_SIZE", CUDA_IPC_HANDLE_SIZE); - return m; -} diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index 5d9c4af..90f4070 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -14,12 +14,11 @@ import numba from numba import _devicearray -from numba.cuda.cudadrv import devices +from numba.cuda.cudadrv import devices, dummyarray from numba.cuda.cudadrv import driver as _driver from numba.core import types, config from numba.np.unsafe.ndarray import to_fixed_tuple from numba.np.numpy_support import numpy_version -from numba.misc import dummyarray from numba.np import numpy_support from numba.cuda.api_util import prepare_shape_strides_dtype from numba.core.errors import NumbaPerformanceWarning diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 5194bba..d9d1ded 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -36,7 +36,7 @@ from .drvapi import API_PROTOTYPES from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid from .mappings import FILE_EXTENSION_MAP -from numba.cuda.cudadrv import enums, drvapi, nvrtc, _extras +from numba.cuda.cudadrv import enums, drvapi, nvrtc USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING @@ -259,29 +259,6 @@ def ensure_initialized(self): else: self.pid = _getpid() - self._initialize_extras() - - def _initialize_extras(self): - if USE_NV_BINDING: - # The extras are only needed when using Numba's ctypes bindings - return - - # set pointer to original cuIpcOpenMemHandle - set_proto = ctypes.CFUNCTYPE(None, c_void_p) - set_cuIpcOpenMemHandle = set_proto(_extras.set_cuIpcOpenMemHandle) - set_cuIpcOpenMemHandle(self._find_api('cuIpcOpenMemHandle')) - # bind caller to cuIpcOpenMemHandle that fixes the ABI - call_proto = ctypes.CFUNCTYPE(c_int, - ctypes.POINTER(drvapi.cu_device_ptr), - ctypes.POINTER(drvapi.cu_ipc_mem_handle), - ctypes.c_uint) - call_cuIpcOpenMemHandle = call_proto(_extras.call_cuIpcOpenMemHandle) - call_cuIpcOpenMemHandle.__name__ = 'call_cuIpcOpenMemHandle' - safe_call = self._ctypes_wrap_fn('call_cuIpcOpenMemHandle', - call_cuIpcOpenMemHandle) - # override cuIpcOpenMemHandle - self.cuIpcOpenMemHandle = safe_call - @property def is_available(self): self.ensure_initialized() @@ -1878,7 +1855,7 @@ def 
__reduce__(self): if USE_NV_BINDING: preprocessed_handle = self.handle.reserved else: - preprocessed_handle = tuple(self.handle) + preprocessed_handle = tuple(self.handle.reserved) args = ( self.__class__, preprocessed_handle, @@ -1892,9 +1869,9 @@ def __reduce__(self): def _rebuild(cls, handle_ary, size, source_info, offset): if USE_NV_BINDING: handle = binding.CUipcMemHandle() - handle.reserved = handle_ary else: - handle = drvapi.cu_ipc_mem_handle(*handle_ary) + handle = drvapi.cu_ipc_mem_handle() + handle.reserved = handle_ary return cls(base=None, handle=handle, size=size, source_info=source_info, offset=offset) diff --git a/numba_cuda/numba/cuda/cudadrv/drvapi.py b/numba_cuda/numba/cuda/cudadrv/drvapi.py index cbbd792..7f6dfbb 100644 --- a/numba_cuda/numba/cuda/cudadrv/drvapi.py +++ b/numba_cuda/numba/cuda/cudadrv/drvapi.py @@ -1,7 +1,6 @@ from ctypes import (c_byte, c_char_p, c_float, c_int, c_size_t, c_uint, - c_uint8, c_void_p, py_object, CFUNCTYPE, POINTER) - -from numba.cuda.cudadrv import _extras + c_uint8, c_void_p, py_object, CFUNCTYPE, POINTER, + Structure) cu_device = c_int cu_device_attribute = c_int # enum @@ -15,13 +14,18 @@ cu_event = c_void_p cu_link_state = c_void_p cu_function_attribute = c_int -cu_ipc_mem_handle = (c_byte * _extras.CUDA_IPC_HANDLE_SIZE) # 64 bytes wide cu_uuid = (c_byte * 16) # Device UUID cu_stream_callback_pyobj = CFUNCTYPE(None, cu_stream, c_int, py_object) cu_occupancy_b2d_size = CFUNCTYPE(c_size_t, c_int) + +# Mirrors the definition of CUipcMemHandle in cuda.h +class cu_ipc_mem_handle(Structure): + _fields_ = [("reserved", c_uint8 * 64)] + + # See https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html CU_STREAM_DEFAULT = 0 CU_STREAM_LEGACY = 1 diff --git a/numba_cuda/numba/cuda/cudadrv/dummyarray.py b/numba_cuda/numba/cuda/cudadrv/dummyarray.py new file mode 100644 index 0000000..38e1b89 --- /dev/null +++ b/numba_cuda/numba/cuda/cudadrv/dummyarray.py @@ -0,0 +1,452 @@ +from collections import namedtuple +import itertools +import functools +import operator +import ctypes + +import numpy as np + +from numba import _helperlib + +Extent = namedtuple("Extent", ["begin", "end"]) + +attempt_nocopy_reshape = ctypes.CFUNCTYPE( + ctypes.c_int, + ctypes.c_long, # nd + np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # dims + np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # strides + ctypes.c_long, # newnd + np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # newdims + np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # newstrides + ctypes.c_long, # itemsize + ctypes.c_int, # is_f_order +)(_helperlib.c_helpers['attempt_nocopy_reshape']) + + +class Dim(object): + """A single dimension of the array + + Attributes + ---------- + start: + start offset + stop: + stop offset + size: + number of items + stride: + item stride + """ + __slots__ = 'start', 'stop', 'size', 'stride', 'single' + + def __init__(self, start, stop, size, stride, single): + self.start = start + self.stop = stop + self.size = size + self.stride = stride + self.single = single + assert not single or size == 1 + + def __getitem__(self, item): + if isinstance(item, slice): + start, stop, step = item.indices(self.size) + stride = step * self.stride + start = self.start + start * abs(self.stride) + stop = self.start + stop * abs(self.stride) + if stride == 0: + size = 1 + else: + size = _compute_size(start, stop, stride) + ret = Dim( + start=start, + stop=stop, + size=size, + stride=stride, + single=False + ) + return ret + else: + sliced = self[item:item + 1] if 
item != -1 else self[-1:] + if sliced.size != 1: + raise IndexError + return Dim( + start=sliced.start, + stop=sliced.stop, + size=sliced.size, + stride=sliced.stride, + single=True, + ) + + def get_offset(self, idx): + return self.start + idx * self.stride + + def __repr__(self): + strfmt = "Dim(start=%s, stop=%s, size=%s, stride=%s)" + return strfmt % (self.start, self.stop, self.size, self.stride) + + def normalize(self, base): + return Dim(start=self.start - base, stop=self.stop - base, + size=self.size, stride=self.stride, single=self.single) + + def copy(self, start=None, stop=None, size=None, stride=None, single=None): + if start is None: + start = self.start + if stop is None: + stop = self.stop + if size is None: + size = self.size + if stride is None: + stride = self.stride + if single is None: + single = self.single + return Dim(start, stop, size, stride, single) + + def is_contiguous(self, itemsize): + return self.stride == itemsize + + +def compute_index(indices, dims): + return sum(d.get_offset(i) for i, d in zip(indices, dims)) + + +class Element(object): + is_array = False + + def __init__(self, extent): + self.extent = extent + + def iter_contiguous_extent(self): + yield self.extent + + +class Array(object): + """A dummy numpy array-like object. Consider it an array without the + actual data, but offset from the base data pointer. + + Attributes + ---------- + dims: tuple of Dim + describing each dimension of the array + + ndim: int + number of dimension + + shape: tuple of int + size of each dimension + + strides: tuple of int + stride of each dimension + + itemsize: int + itemsize + + extent: (start, end) + start and end offset containing the memory region + """ + is_array = True + + @classmethod + def from_desc(cls, offset, shape, strides, itemsize): + dims = [] + for ashape, astride in zip(shape, strides): + dim = Dim(offset, offset + ashape * astride, ashape, astride, + single=False) + dims.append(dim) + offset = 0 # offset only applies to first dimension + return cls(dims, itemsize) + + def __init__(self, dims, itemsize): + self.dims = tuple(dims) + self.ndim = len(self.dims) + self.shape = tuple(dim.size for dim in self.dims) + self.strides = tuple(dim.stride for dim in self.dims) + self.itemsize = itemsize + self.size = functools.reduce(operator.mul, self.shape, 1) + self.extent = self._compute_extent() + self.flags = self._compute_layout() + + def _compute_layout(self): + # The logic here is based on that in _UpdateContiguousFlags from + # numpy/core/src/multiarray/flagsobject.c in NumPy v1.19.1 (commit + # 13661ac70). 
+ # https://github.com/numpy/numpy/blob/maintenance/1.19.x/numpy/core/src/multiarray/flagsobject.c#L123-L191 + + # Records have no dims, and we can treat them as contiguous + if not self.dims: + return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True} + + # If this is a broadcast array then it is not contiguous + if any([dim.stride == 0 for dim in self.dims]): + return {'C_CONTIGUOUS': False, 'F_CONTIGUOUS': False} + + flags = {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True} + + # Check C contiguity + sd = self.itemsize + for dim in reversed(self.dims): + if dim.size == 0: + # Contiguous by definition + return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True} + if dim.size != 1: + if dim.stride != sd: + flags['C_CONTIGUOUS'] = False + sd *= dim.size + + # Check F contiguity + sd = self.itemsize + for dim in self.dims: + if dim.size != 1: + if dim.stride != sd: + flags['F_CONTIGUOUS'] = False + return flags + sd *= dim.size + + return flags + + def _compute_extent(self): + firstidx = [0] * self.ndim + lastidx = [s - 1 for s in self.shape] + start = compute_index(firstidx, self.dims) + stop = compute_index(lastidx, self.dims) + self.itemsize + stop = max(stop, start) # ensure positive extent + return Extent(start, stop) + + def __repr__(self): + return '' % (self.dims, self.itemsize) + + def __getitem__(self, item): + if not isinstance(item, tuple): + item = [item] + else: + item = list(item) + + nitem = len(item) + ndim = len(self.dims) + if nitem > ndim: + raise IndexError("%d extra indices given" % (nitem - ndim,)) + + # Add empty slices for missing indices + while len(item) < ndim: + item.append(slice(None, None)) + + dims = [dim.__getitem__(it) for dim, it in zip(self.dims, item)] + newshape = [d.size for d in dims if not d.single] + + arr = Array(dims, self.itemsize) + if newshape: + return arr.reshape(*newshape)[0] + else: + return Element(arr.extent) + + @property + def is_c_contig(self): + return self.flags['C_CONTIGUOUS'] + + @property + def is_f_contig(self): + return self.flags['F_CONTIGUOUS'] + + def iter_contiguous_extent(self): + """ Generates extents + """ + if self.is_c_contig or self.is_f_contig: + yield self.extent + else: + if self.dims[0].stride < self.dims[-1].stride: + innerdim = self.dims[0] + outerdims = self.dims[1:] + outershape = self.shape[1:] + else: + innerdim = self.dims[-1] + outerdims = self.dims[:-1] + outershape = self.shape[:-1] + + if innerdim.is_contiguous(self.itemsize): + oslen = [range(s) for s in outershape] + for indices in itertools.product(*oslen): + base = compute_index(indices, outerdims) + yield base + innerdim.start, base + innerdim.stop + else: + oslen = [range(s) for s in self.shape] + for indices in itertools.product(*oslen): + offset = compute_index(indices, self.dims) + yield offset, offset + self.itemsize + + def reshape(self, *newdims, **kws): + oldnd = self.ndim + newnd = len(newdims) + + if newdims == self.shape: + return self, None + + order = kws.pop('order', 'C') + if kws: + raise TypeError('unknown keyword arguments %s' % kws.keys()) + if order not in 'CFA': + raise ValueError('order not C|F|A') + + # check for exactly one instance of -1 in newdims + # https://github.com/numpy/numpy/blob/623bc1fae1d47df24e7f1e29321d0c0ba2771ce0/numpy/core/src/multiarray/shape.c#L470-L515 # noqa: E501 + unknownidx = -1 + knownsize = 1 + for i, dim in enumerate(newdims): + if dim < 0: + if unknownidx == -1: + unknownidx = i + else: + raise ValueError("can only specify one unknown dimension") + else: + knownsize *= dim + + # compute the missing 
dimension + if unknownidx >= 0: + if knownsize == 0 or self.size % knownsize != 0: + raise ValueError("cannot infer valid shape " + "for unknown dimension") + else: + newdims = newdims[0:unknownidx] \ + + (self.size // knownsize,) \ + + newdims[unknownidx + 1:] + + newsize = functools.reduce(operator.mul, newdims, 1) + + if order == 'A': + order = 'F' if self.is_f_contig else 'C' + + if newsize != self.size: + raise ValueError("reshape changes the size of the array") + + if self.is_c_contig or self.is_f_contig: + if order == 'C': + newstrides = list(iter_strides_c_contig(self, newdims)) + elif order == 'F': + newstrides = list(iter_strides_f_contig(self, newdims)) + else: + raise AssertionError("unreachable") + else: + newstrides = np.empty(newnd, np.ctypeslib.c_intp) + + # need to keep these around in variables, not temporaries, so they + # don't get GC'ed before we call into the C code + olddims = np.array(self.shape, dtype=np.ctypeslib.c_intp) + oldstrides = np.array(self.strides, dtype=np.ctypeslib.c_intp) + newdims = np.array(newdims, dtype=np.ctypeslib.c_intp) + + if not attempt_nocopy_reshape( + oldnd, + olddims, + oldstrides, + newnd, + newdims, + newstrides, + self.itemsize, + order == 'F', + ): + raise NotImplementedError('reshape would require copy') + + ret = self.from_desc(self.extent.begin, shape=newdims, + strides=newstrides, itemsize=self.itemsize) + + return ret, list(self.iter_contiguous_extent()) + + def squeeze(self, axis=None): + newshape, newstrides = [], [] + if axis is None: + for length, stride in zip(self.shape, self.strides): + if length != 1: + newshape.append(length) + newstrides.append(stride) + else: + if not isinstance(axis, tuple): + axis = (axis,) + for ax in axis: + if self.shape[ax] != 1: + raise ValueError( + "cannot select an axis to squeeze out which has size " + "not equal to one" + ) + for i, (length, stride) in enumerate(zip(self.shape, self.strides)): + if i not in axis: + newshape.append(length) + newstrides.append(stride) + newarr = self.from_desc( + self.extent.begin, + shape=newshape, + strides=newstrides, + itemsize=self.itemsize, + ) + return newarr, list(self.iter_contiguous_extent()) + + def ravel(self, order='C'): + if order not in 'CFA': + raise ValueError('order not C|F|A') + + if (order in 'CA' and self.is_c_contig + or order in 'FA' and self.is_f_contig): + newshape = (self.size,) + newstrides = (self.itemsize,) + arr = self.from_desc(self.extent.begin, newshape, newstrides, + self.itemsize) + return arr, list(self.iter_contiguous_extent()) + + else: + raise NotImplementedError("ravel on non-contiguous array") + + +def iter_strides_f_contig(arr, shape=None): + """yields the f-contiguous strides + """ + shape = arr.shape if shape is None else shape + itemsize = arr.itemsize + yield itemsize + sum = 1 + for s in shape[:-1]: + sum *= s + yield sum * itemsize + + +def iter_strides_c_contig(arr, shape=None): + """yields the c-contiguous strides + """ + shape = arr.shape if shape is None else shape + itemsize = arr.itemsize + + def gen(): + yield itemsize + sum = 1 + for s in reversed(shape[1:]): + sum *= s + yield sum * itemsize + + for i in reversed(list(gen())): + yield i + + +def is_element_indexing(item, ndim): + if isinstance(item, slice): + return False + + elif isinstance(item, tuple): + if len(item) == ndim: + if not any(isinstance(it, slice) for it in item): + return True + + else: + return True + + return False + + +def _compute_size(start, stop, step): + """Algorithm adapted from cpython rangeobject.c + """ + if step > 0: + lo = 
start + hi = stop + else: + lo = stop + hi = start + step = -step + if lo >= hi: + return 0 + return (hi - lo - 1) // step + 1 diff --git a/numba_cuda/numba/cuda/deviceufunc.py b/numba_cuda/numba/cuda/deviceufunc.py new file mode 100644 index 0000000..c29335a --- /dev/null +++ b/numba_cuda/numba/cuda/deviceufunc.py @@ -0,0 +1,908 @@ +""" +Implements custom ufunc dispatch mechanism for non-CPU devices. +""" + +from abc import ABCMeta, abstractmethod +from collections import OrderedDict +import operator +import warnings +from functools import reduce + +import numpy as np + +from numba.np.ufunc.ufuncbuilder import _BaseUFuncBuilder, parse_identity +from numba.core import types, sigutils +from numba.core.typing import signature +from numba.np.ufunc.sigparse import parse_signature + + +def _broadcast_axis(a, b): + """ + Raises + ------ + ValueError if broadcast fails + """ + if a == b: + return a + elif a == 1: + return b + elif b == 1: + return a + else: + raise ValueError("failed to broadcast {0} and {1}".format(a, b)) + + +def _pairwise_broadcast(shape1, shape2): + """ + Raises + ------ + ValueError if broadcast fails + """ + shape1, shape2 = map(tuple, [shape1, shape2]) + + while len(shape1) < len(shape2): + shape1 = (1,) + shape1 + + while len(shape1) > len(shape2): + shape2 = (1,) + shape2 + + return tuple(_broadcast_axis(a, b) for a, b in zip(shape1, shape2)) + + +def _multi_broadcast(*shapelist): + """ + Raises + ------ + ValueError if broadcast fails + """ + assert shapelist + + result = shapelist[0] + others = shapelist[1:] + try: + for i, each in enumerate(others, start=1): + result = _pairwise_broadcast(result, each) + except ValueError: + raise ValueError("failed to broadcast argument #{0}".format(i)) + else: + return result + + +class UFuncMechanism(object): + """ + Prepare ufunc arguments for vectorize. + """ + DEFAULT_STREAM = None + SUPPORT_DEVICE_SLICING = False + + def __init__(self, typemap, args): + """Never used directly by user. Invoke by UFuncMechanism.call(). + """ + self.typemap = typemap + self.args = args + nargs = len(self.args) + self.argtypes = [None] * nargs + self.scalarpos = [] + self.signature = None + self.arrays = [None] * nargs + + def _fill_arrays(self): + """ + Get all arguments in array form + """ + for i, arg in enumerate(self.args): + if self.is_device_array(arg): + self.arrays[i] = self.as_device_array(arg) + elif isinstance(arg, (int, float, complex, np.number)): + # Is scalar + self.scalarpos.append(i) + else: + self.arrays[i] = np.asarray(arg) + + def _fill_argtypes(self): + """ + Get dtypes + """ + for i, ary in enumerate(self.arrays): + if ary is not None: + dtype = getattr(ary, 'dtype') + if dtype is None: + dtype = np.asarray(ary).dtype + self.argtypes[i] = dtype + + def _resolve_signature(self): + """Resolve signature. + May have ambiguous case. 
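        An exact match is tried first, with untyped scalar arguments taking
        their NumPy dtype; failing that, scalar slots are allowed to match any
        formal type. No surviving signature raises a TypeError, and more than
        one surviving signature is reported as ambiguous.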
+ """ + matches = [] + # Resolve scalar args exact match first + if self.scalarpos: + # Try resolve scalar arguments + for formaltys in self.typemap: + match_map = [] + for i, (formal, actual) in enumerate(zip(formaltys, + self.argtypes)): + if actual is None: + actual = np.asarray(self.args[i]).dtype + + match_map.append(actual == formal) + + if all(match_map): + matches.append(formaltys) + + # No matching with exact match; try coercing the scalar arguments + if not matches: + matches = [] + for formaltys in self.typemap: + all_matches = all(actual is None or formal == actual + for formal, actual in + zip(formaltys, self.argtypes)) + if all_matches: + matches.append(formaltys) + + if not matches: + raise TypeError("No matching version. GPU ufunc requires array " + "arguments to have the exact types. This behaves " + "like regular ufunc with casting='no'.") + + if len(matches) > 1: + raise TypeError("Failed to resolve ufunc due to ambiguous " + "signature. Too many untyped scalars. " + "Use numpy dtype object to type tag.") + + # Try scalar arguments + self.argtypes = matches[0] + + def _get_actual_args(self): + """Return the actual arguments + Casts scalar arguments to np.array. + """ + for i in self.scalarpos: + self.arrays[i] = np.array([self.args[i]], dtype=self.argtypes[i]) + + return self.arrays + + def _broadcast(self, arys): + """Perform numpy ufunc broadcasting + """ + shapelist = [a.shape for a in arys] + shape = _multi_broadcast(*shapelist) + + for i, ary in enumerate(arys): + if ary.shape == shape: + pass + + else: + if self.is_device_array(ary): + arys[i] = self.broadcast_device(ary, shape) + + else: + ax_differs = [ax for ax in range(len(shape)) + if ax >= ary.ndim + or ary.shape[ax] != shape[ax]] + + missingdim = len(shape) - len(ary.shape) + strides = [0] * missingdim + list(ary.strides) + + for ax in ax_differs: + strides[ax] = 0 + + strided = np.lib.stride_tricks.as_strided(ary, + shape=shape, + strides=strides) + + arys[i] = self.force_array_layout(strided) + + return arys + + def get_arguments(self): + """Prepare and return the arguments for the ufunc. + Does not call to_device(). + """ + self._fill_arrays() + self._fill_argtypes() + self._resolve_signature() + arys = self._get_actual_args() + return self._broadcast(arys) + + def get_function(self): + """Returns (result_dtype, function) + """ + return self.typemap[self.argtypes] + + def is_device_array(self, obj): + """Is the `obj` a device array? + Override in subclass + """ + return False + + def as_device_array(self, obj): + """Convert the `obj` to a device array + Override in subclass + + Default implementation is an identity function + """ + return obj + + def broadcast_device(self, ary, shape): + """Handles ondevice broadcasting + + Override in subclass to add support. + """ + raise NotImplementedError("broadcasting on device is not supported") + + def force_array_layout(self, ary): + """Ensures array layout met device requirement. + + Override in sublcass + """ + return ary + + @classmethod + def call(cls, typemap, args, kws): + """Perform the entire ufunc call mechanism. 
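        Only the 'stream' and 'out' keyword arguments are recognized; any
        other keyword triggers a warning. Inputs are matched against the
        typemap, broadcast to a common shape, transferred to the device as
        needed, and the kernel is launched over the flattened arguments;
        whether the result comes back as a host or device array depends on
        where the inputs and any supplied 'out' reside.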
+ """ + # Handle keywords + stream = kws.pop('stream', cls.DEFAULT_STREAM) + out = kws.pop('out', None) + + if kws: + warnings.warn("unrecognized keywords: %s" % ', '.join(kws)) + + # Begin call resolution + cr = cls(typemap, args) + args = cr.get_arguments() + resty, func = cr.get_function() + + outshape = args[0].shape + + # Adjust output value + if out is not None and cr.is_device_array(out): + out = cr.as_device_array(out) + + def attempt_ravel(a): + if cr.SUPPORT_DEVICE_SLICING: + raise NotImplementedError + + try: + # Call the `.ravel()` method + return a.ravel() + except NotImplementedError: + # If it is not a device array + if not cr.is_device_array(a): + raise + # For device array, retry ravel on the host by first + # copying it back. + else: + hostary = cr.to_host(a, stream).ravel() + return cr.to_device(hostary, stream) + + if args[0].ndim > 1: + args = [attempt_ravel(a) for a in args] + + # Prepare argument on the device + devarys = [] + any_device = False + for a in args: + if cr.is_device_array(a): + devarys.append(a) + any_device = True + else: + dev_a = cr.to_device(a, stream=stream) + devarys.append(dev_a) + + # Launch + shape = args[0].shape + if out is None: + # No output is provided + devout = cr.allocate_device_array(shape, resty, stream=stream) + + devarys.extend([devout]) + cr.launch(func, shape[0], stream, devarys) + + if any_device: + # If any of the arguments are on device, + # Keep output on the device + return devout.reshape(outshape) + else: + # Otherwise, transfer output back to host + return devout.copy_to_host().reshape(outshape) + + elif cr.is_device_array(out): + # If output is provided and it is a device array, + # Return device array + if out.ndim > 1: + out = attempt_ravel(out) + devout = out + devarys.extend([devout]) + cr.launch(func, shape[0], stream, devarys) + return devout.reshape(outshape) + + else: + # If output is provided and it is a host array, + # Return host array + assert out.shape == shape + assert out.dtype == resty + devout = cr.allocate_device_array(shape, resty, stream=stream) + devarys.extend([devout]) + cr.launch(func, shape[0], stream, devarys) + return devout.copy_to_host(out, stream=stream).reshape(outshape) + + def to_device(self, hostary, stream): + """Implement to device transfer + Override in subclass + """ + raise NotImplementedError + + def to_host(self, devary, stream): + """Implement to host transfer + Override in subclass + """ + raise NotImplementedError + + def allocate_device_array(self, shape, dtype, stream): + """Implements device allocation + Override in subclass + """ + raise NotImplementedError + + def launch(self, func, count, stream, args): + """Implements device function invocation + Override in subclass + """ + raise NotImplementedError + + +def to_dtype(ty): + if isinstance(ty, types.EnumMember): + ty = ty.dtype + return np.dtype(str(ty)) + + +class DeviceVectorize(_BaseUFuncBuilder): + def __init__(self, func, identity=None, cache=False, targetoptions={}): + if cache: + raise TypeError("caching is not supported") + for opt in targetoptions: + if opt == 'nopython': + warnings.warn("nopython kwarg for cuda target is redundant", + RuntimeWarning) + else: + fmt = "Unrecognized options. 
" + fmt += "cuda vectorize target does not support option: '%s'" + raise KeyError(fmt % opt) + self.py_func = func + self.identity = parse_identity(identity) + # { arg_dtype: (return_dtype), cudakernel } + self.kernelmap = OrderedDict() + + @property + def pyfunc(self): + return self.py_func + + def add(self, sig=None): + # compile core as device function + args, return_type = sigutils.normalize_signature(sig) + devfnsig = signature(return_type, *args) + + funcname = self.pyfunc.__name__ + kernelsource = self._get_kernel_source(self._kernel_template, + devfnsig, funcname) + corefn, return_type = self._compile_core(devfnsig) + glbl = self._get_globals(corefn) + sig = signature(types.void, *([a[:] for a in args] + [return_type[:]])) + exec(kernelsource, glbl) + + stager = glbl['__vectorized_%s' % funcname] + kernel = self._compile_kernel(stager, sig) + + argdtypes = tuple(to_dtype(t) for t in devfnsig.args) + resdtype = to_dtype(return_type) + self.kernelmap[tuple(argdtypes)] = resdtype, kernel + + def build_ufunc(self): + raise NotImplementedError + + def _get_kernel_source(self, template, sig, funcname): + args = ['a%d' % i for i in range(len(sig.args))] + fmts = dict(name=funcname, + args=', '.join(args), + argitems=', '.join('%s[__tid__]' % i for i in args)) + return template.format(**fmts) + + def _compile_core(self, sig): + raise NotImplementedError + + def _get_globals(self, corefn): + raise NotImplementedError + + def _compile_kernel(self, fnobj, sig): + raise NotImplementedError + + +class DeviceGUFuncVectorize(_BaseUFuncBuilder): + def __init__(self, func, sig, identity=None, cache=False, targetoptions={}, + writable_args=()): + if cache: + raise TypeError("caching is not supported") + if writable_args: + raise TypeError("writable_args are not supported") + + # Allow nopython flag to be set. + if not targetoptions.pop('nopython', True): + raise TypeError("nopython flag must be True") + # Are there any more target options? 
+ if targetoptions: + opts = ', '.join([repr(k) for k in targetoptions.keys()]) + fmt = "The following target options are not supported: {0}" + raise TypeError(fmt.format(opts)) + + self.py_func = func + self.identity = parse_identity(identity) + self.signature = sig + self.inputsig, self.outputsig = parse_signature(self.signature) + + # Maps from a tuple of input_dtypes to (output_dtypes, kernel) + self.kernelmap = OrderedDict() + + @property + def pyfunc(self): + return self.py_func + + def add(self, sig=None): + indims = [len(x) for x in self.inputsig] + outdims = [len(x) for x in self.outputsig] + args, return_type = sigutils.normalize_signature(sig) + + # It is only valid to specify types.none as a return type, or to not + # specify the return type (where the "Python None" is the return type) + valid_return_type = return_type in (types.none, None) + if not valid_return_type: + raise TypeError('guvectorized functions cannot return values: ' + f'signature {sig} specifies {return_type} return ' + 'type') + + funcname = self.py_func.__name__ + src = expand_gufunc_template(self._kernel_template, indims, + outdims, funcname, args) + + glbls = self._get_globals(sig) + + exec(src, glbls) + fnobj = glbls['__gufunc_{name}'.format(name=funcname)] + + outertys = list(_determine_gufunc_outer_types(args, indims + outdims)) + kernel = self._compile_kernel(fnobj, sig=tuple(outertys)) + + nout = len(outdims) + dtypes = [np.dtype(str(t.dtype)) for t in outertys] + indtypes = tuple(dtypes[:-nout]) + outdtypes = tuple(dtypes[-nout:]) + + self.kernelmap[indtypes] = outdtypes, kernel + + def _compile_kernel(self, fnobj, sig): + raise NotImplementedError + + def _get_globals(self, sig): + raise NotImplementedError + + +def _determine_gufunc_outer_types(argtys, dims): + for at, nd in zip(argtys, dims): + if isinstance(at, types.Array): + yield at.copy(ndim=nd + 1) + else: + if nd > 0: + raise ValueError("gufunc signature mismatch: ndim>0 for scalar") + yield types.Array(dtype=at, ndim=1, layout='A') + + +def expand_gufunc_template(template, indims, outdims, funcname, argtypes): + """Expand gufunc source template + """ + argdims = indims + outdims + argnames = ["arg{0}".format(i) for i in range(len(argdims))] + checkedarg = "min({0})".format(', '.join(["{0}.shape[0]".format(a) + for a in argnames])) + inputs = [_gen_src_for_indexing(aref, adims, atype) + for aref, adims, atype in zip(argnames, indims, argtypes)] + outputs = [_gen_src_for_indexing(aref, adims, atype) + for aref, adims, atype in zip(argnames[len(indims):], outdims, + argtypes[len(indims):])] + argitems = inputs + outputs + src = template.format(name=funcname, args=', '.join(argnames), + checkedarg=checkedarg, + argitems=', '.join(argitems)) + return src + + +def _gen_src_for_indexing(aref, adims, atype): + return "{aref}[{sliced}]".format(aref=aref, + sliced=_gen_src_index(adims, atype)) + + +def _gen_src_index(adims, atype): + if adims > 0: + return ','.join(['__tid__'] + [':'] * adims) + elif isinstance(atype, types.Array) and atype.ndim - 1 == adims: + # Special case for 0-nd in shape-signature but + # 1d array in type signature. + # Slice it so that the result has the same dimension. 
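            # Illustrative examples of the generated indexing (hypothetical
            # signature): for a gufunc declared "(m,n),(n)->(n)" the stager
            # indexes its arguments as "arg0[__tid__,:,:]", "arg1[__tid__,:]"
            # and "arg2[__tid__,:]"; a slot declared "()" in the shape
            # signature but typed as a 1-d array is indexed
            # "arg[__tid__:(__tid__ + 1)]" below, so the core function still
            # sees a one-element array rather than a scalar.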
+ return '__tid__:(__tid__ + 1)' + else: + return '__tid__' + + +class GUFuncEngine(object): + '''Determine how to broadcast and execute a gufunc + base on input shape and signature + ''' + + @classmethod + def from_signature(cls, signature): + return cls(*parse_signature(signature)) + + def __init__(self, inputsig, outputsig): + # signatures + self.sin = inputsig + self.sout = outputsig + # argument count + self.nin = len(self.sin) + self.nout = len(self.sout) + + def schedule(self, ishapes): + if len(ishapes) != self.nin: + raise TypeError('invalid number of input argument') + + # associate symbol values for input signature + symbolmap = {} + outer_shapes = [] + inner_shapes = [] + + for argn, (shape, symbols) in enumerate(zip(ishapes, self.sin)): + argn += 1 # start from 1 for human + inner_ndim = len(symbols) + if len(shape) < inner_ndim: + fmt = "arg #%d: insufficient inner dimension" + raise ValueError(fmt % (argn,)) + if inner_ndim: + inner_shape = shape[-inner_ndim:] + outer_shape = shape[:-inner_ndim] + else: + inner_shape = () + outer_shape = shape + + for axis, (dim, sym) in enumerate(zip(inner_shape, symbols)): + axis += len(outer_shape) + if sym in symbolmap: + if symbolmap[sym] != dim: + fmt = "arg #%d: shape[%d] mismatch argument" + raise ValueError(fmt % (argn, axis)) + symbolmap[sym] = dim + + outer_shapes.append(outer_shape) + inner_shapes.append(inner_shape) + + # solve output shape + oshapes = [] + for outsig in self.sout: + oshape = [] + for sym in outsig: + oshape.append(symbolmap[sym]) + oshapes.append(tuple(oshape)) + + # find the biggest outershape as looping dimension + sizes = [reduce(operator.mul, s, 1) for s in outer_shapes] + largest_i = np.argmax(sizes) + loopdims = outer_shapes[largest_i] + + pinned = [False] * self.nin # same argument for each iteration + for i, d in enumerate(outer_shapes): + if d != loopdims: + if d == (1,) or d == (): + pinned[i] = True + else: + fmt = "arg #%d: outer dimension mismatch" + raise ValueError(fmt % (i + 1,)) + + return GUFuncSchedule(self, inner_shapes, oshapes, loopdims, pinned) + + +class GUFuncSchedule(object): + def __init__(self, parent, ishapes, oshapes, loopdims, pinned): + self.parent = parent + # core shapes + self.ishapes = ishapes + self.oshapes = oshapes + # looping dimension + self.loopdims = loopdims + self.loopn = reduce(operator.mul, loopdims, 1) + # flags + self.pinned = pinned + + self.output_shapes = [loopdims + s for s in oshapes] + + def __str__(self): + import pprint + + attrs = 'ishapes', 'oshapes', 'loopdims', 'loopn', 'pinned' + values = [(k, getattr(self, k)) for k in attrs] + return pprint.pformat(dict(values)) + + +class GeneralizedUFunc(object): + def __init__(self, kernelmap, engine): + self.kernelmap = kernelmap + self.engine = engine + self.max_blocksize = 2 ** 30 + + def __call__(self, *args, **kws): + callsteps = self._call_steps(self.engine.nin, self.engine.nout, + args, kws) + indtypes, schedule, outdtypes, kernel = self._schedule( + callsteps.inputs, callsteps.outputs) + callsteps.adjust_input_types(indtypes) + + outputs = callsteps.prepare_outputs(schedule, outdtypes) + inputs = callsteps.prepare_inputs() + parameters = self._broadcast(schedule, inputs, outputs) + + callsteps.launch_kernel(kernel, schedule.loopn, parameters) + + return callsteps.post_process_outputs(outputs) + + def _schedule(self, inputs, outs): + input_shapes = [a.shape for a in inputs] + schedule = self.engine.schedule(input_shapes) + + # find kernel + indtypes = tuple(i.dtype for i in inputs) + try: + outdtypes, 
kernel = self.kernelmap[indtypes] + except KeyError: + # No exact match, then use the first compatible. + # This does not match the numpy dispatching exactly. + # Later, we may just jit a new version for the missing signature. + indtypes = self._search_matching_signature(indtypes) + # Select kernel + outdtypes, kernel = self.kernelmap[indtypes] + + # check output + for sched_shape, out in zip(schedule.output_shapes, outs): + if out is not None and sched_shape != out.shape: + raise ValueError('output shape mismatch') + + return indtypes, schedule, outdtypes, kernel + + def _search_matching_signature(self, idtypes): + """ + Given the input types in `idtypes`, return a compatible sequence of + types that is defined in `kernelmap`. + + Note: Ordering is guaranteed by `kernelmap` being a OrderedDict + """ + for sig in self.kernelmap.keys(): + if all(np.can_cast(actual, desired) + for actual, desired in zip(sig, idtypes)): + return sig + else: + raise TypeError("no matching signature") + + def _broadcast(self, schedule, params, retvals): + assert schedule.loopn > 0, "zero looping dimension" + + odim = 1 if not schedule.loopdims else schedule.loopn + newparams = [] + for p, cs in zip(params, schedule.ishapes): + if not cs and p.size == 1: + # Broadcast scalar input + devary = self._broadcast_scalar_input(p, odim) + newparams.append(devary) + else: + # Broadcast vector input + newparams.append(self._broadcast_array(p, odim, cs)) + + newretvals = [] + for retval, oshape in zip(retvals, schedule.oshapes): + newretvals.append(retval.reshape(odim, *oshape)) + return tuple(newparams) + tuple(newretvals) + + def _broadcast_array(self, ary, newdim, innerdim): + newshape = (newdim,) + innerdim + # No change in shape + if ary.shape == newshape: + return ary + + # Creating new dimension + elif len(ary.shape) < len(newshape): + assert newshape[-len(ary.shape):] == ary.shape, \ + "cannot add dim and reshape at the same time" + return self._broadcast_add_axis(ary, newshape) + + # Collapsing dimension + else: + return ary.reshape(*newshape) + + def _broadcast_add_axis(self, ary, newshape): + raise NotImplementedError("cannot add new axis") + + def _broadcast_scalar_input(self, ary, shape): + raise NotImplementedError + + +class GUFuncCallSteps(metaclass=ABCMeta): + """ + Implements memory management and kernel launch operations for GUFunc calls. + + One instance of this class is instantiated for each call, and the instance + is specific to the arguments given to the GUFunc call. + + The base class implements the overall logic; subclasses provide + target-specific implementations of individual functions. + """ + + # The base class uses these slots; subclasses may provide additional slots. + __slots__ = [ + 'outputs', + 'inputs', + '_copy_result_to_host', + ] + + @abstractmethod + def launch_kernel(self, kernel, nelem, args): + """Implement the kernel launch""" + + @abstractmethod + def is_device_array(self, obj): + """ + Return True if `obj` is a device array for this target, False + otherwise. + """ + + @abstractmethod + def as_device_array(self, obj): + """ + Return `obj` as a device array on this target. + + May return `obj` directly if it is already on the target. + """ + + @abstractmethod + def to_device(self, hostary): + """ + Copy `hostary` to the device and return the device array. + """ + + @abstractmethod + def allocate_device_array(self, shape, dtype): + """ + Allocate a new uninitialized device array with the given shape and + dtype. 
+ """ + + def __init__(self, nin, nout, args, kwargs): + outputs = kwargs.get('out') + + # Ensure the user has passed a correct number of arguments + if outputs is None and len(args) not in (nin, (nin + nout)): + def pos_argn(n): + return f'{n} positional argument{"s" * (n != 1)}' + + msg = (f'This gufunc accepts {pos_argn(nin)} (when providing ' + f'input only) or {pos_argn(nin + nout)} (when providing ' + f'input and output). Got {pos_argn(len(args))}.') + raise TypeError(msg) + + if outputs is not None and len(args) > nin: + raise ValueError("cannot specify argument 'out' as both positional " + "and keyword") + else: + # If the user did not pass outputs either in the out kwarg or as + # positional arguments, then we need to generate an initial list of + # "placeholder" outputs using None as a sentry value + outputs = [outputs] * nout + + # Ensure all output device arrays are Numba device arrays - for + # example, any output passed in that supports the CUDA Array Interface + # is converted to a Numba CUDA device array; others are left untouched. + all_user_outputs_are_host = True + self.outputs = [] + for output in outputs: + if self.is_device_array(output): + self.outputs.append(self.as_device_array(output)) + all_user_outputs_are_host = False + else: + self.outputs.append(output) + + all_host_arrays = not any([self.is_device_array(a) for a in args]) + + # - If any of the arguments are device arrays, we leave the output on + # the device. + self._copy_result_to_host = (all_host_arrays and + all_user_outputs_are_host) + + # Normalize arguments - ensure they are either device- or host-side + # arrays (as opposed to lists, tuples, etc). + def normalize_arg(a): + if self.is_device_array(a): + convert = self.as_device_array + else: + convert = np.asarray + + return convert(a) + + normalized_args = [normalize_arg(a) for a in args] + self.inputs = normalized_args[:nin] + + # Check if there are extra arguments for outputs. + unused_inputs = normalized_args[nin:] + if unused_inputs: + self.outputs = unused_inputs + + def adjust_input_types(self, indtypes): + """ + Attempt to cast the inputs to the required types if necessary + and if they are not device arrays. + + Side effect: Only affects the elements of `inputs` that require + a type cast. + """ + for i, (ity, val) in enumerate(zip(indtypes, self.inputs)): + if ity != val.dtype: + if not hasattr(val, 'astype'): + msg = ("compatible signature is possible by casting but " + "{0} does not support .astype()").format(type(val)) + raise TypeError(msg) + # Cast types + self.inputs[i] = val.astype(ity) + + def prepare_outputs(self, schedule, outdtypes): + """ + Returns a list of output parameters that all reside on the target + device. + + Outputs that were passed-in to the GUFunc are used if they reside on the + device; other outputs are allocated as necessary. + """ + outputs = [] + for shape, dtype, output in zip(schedule.output_shapes, outdtypes, + self.outputs): + if output is None or self._copy_result_to_host: + output = self.allocate_device_array(shape, dtype) + outputs.append(output) + + return outputs + + def prepare_inputs(self): + """ + Returns a list of input parameters that all reside on the target device. + """ + def ensure_device(parameter): + if self.is_device_array(parameter): + convert = self.as_device_array + else: + convert = self.to_device + + return convert(parameter) + + return [ensure_device(p) for p in self.inputs] + + def post_process_outputs(self, outputs): + """ + Moves the given output(s) to the host if necessary. 
+ + Returns a single value (e.g. an array) if there was one output, or a + tuple of arrays if there were multiple. Although this feels a little + jarring, it is consistent with the behavior of GUFuncs in general. + """ + if self._copy_result_to_host: + outputs = [self.to_host(output, self_output) + for output, self_output in zip(outputs, self.outputs)] + elif self.outputs[0] is not None: + outputs = self.outputs + + if len(outputs) == 1: + return outputs[0] + else: + return tuple(outputs) diff --git a/numba_cuda/numba/cuda/runtime/nrt.cu b/numba_cuda/numba/cuda/runtime/nrt.cu new file mode 100644 index 0000000..6ced244 --- /dev/null +++ b/numba_cuda/numba/cuda/runtime/nrt.cu @@ -0,0 +1,107 @@ +#ifndef _NRT_H +#define _NRT_H + +#include + +typedef __device__ void (*NRT_dtor_function)(void* ptr, size_t size, void* info); +typedef __device__ void (*NRT_dealloc_func)(void* ptr, void* dealloc_info); + +typedef struct MemInfo NRT_MemInfo; + +extern "C" { +struct MemInfo { + cuda::atomic refct; + NRT_dtor_function dtor; + void* dtor_info; + void* data; + size_t size; +}; +} + +// Globally needed variables +struct NRT_MemSys { + struct { + bool enabled; + cuda::atomic alloc; + cuda::atomic free; + cuda::atomic mi_alloc; + cuda::atomic mi_free; + } stats; +}; + +/* The Memory System object */ +__device__ NRT_MemSys* TheMSys; + +extern "C" __device__ void* NRT_Allocate(size_t size) +{ + void* ptr = NULL; + ptr = malloc(size); +// if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; } + return ptr; +} + +extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi, + void* data, + size_t size, + NRT_dtor_function dtor, + void* dtor_info) +// NRT_MemSys* TheMSys) +{ + mi->refct = 1; /* starts with 1 refct */ + mi->dtor = dtor; + mi->dtor_info = dtor_info; + mi->data = data; + mi->size = size; +// if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; } +} + +__device__ NRT_MemInfo* NRT_MemInfo_new( + void* data, size_t size, NRT_dtor_function dtor, void* dtor_info) +{ + NRT_MemInfo* mi = (NRT_MemInfo*)NRT_Allocate(sizeof(NRT_MemInfo)); + if (mi != NULL) { NRT_MemInfo_init(mi, data, size, dtor, dtor_info); } + return mi; +} + +extern "C" __device__ void NRT_Free(void* ptr) +{ + free(ptr); + //if (TheMSys->stats.enabled) { TheMSys->stats.free++; } +} + +extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi) +{ + NRT_Free(mi); +} + +extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi) +{ + NRT_dealloc(mi); + //if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; } +} +extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi) +{ + if (mi->dtor) /* We have a destructor */ + mi->dtor(mi->data, mi->size, NULL); + /* Clear and release MemInfo */ + NRT_MemInfo_destroy(mi); +} + +/* + c++ version of the NRT_decref function that usually is added to + the final kernel link in PTX form by numba. This version may be + used by c++ APIs that accept ownership of live objects and must + manage them going forward. +*/ +extern "C" __device__ void NRT_decref(NRT_MemInfo* mi) +{ + mi->refct--; + if (mi->refct == 0) { NRT_MemInfo_call_dtor(mi); } +} + +#endif + +extern "C" __device__ void NRT_incref(NRT_MemInfo* mi) +{ + mi->refct++; +} diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py b/numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py new file mode 100644 index 0000000..adabaa7 --- /dev/null +++ b/numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py @@ -0,0 +1,4 @@ +# Dummy arrays are not implemented in the simulator. 
This file allows the dummy +# array tests to be imported, but they are skipped on the simulator. + +Array = None diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index 6402ff3..f43b099 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -67,6 +67,7 @@ def resolve_value_type(self, val): class CUDATargetContext(BaseContext): implement_powi_as_math_call = True strict_alignment = True + enable_nrt = True def __init__(self, typingctx, target='cuda'): super().__init__(typingctx, target) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py index 2a240ef..fb8de32 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py @@ -1,4 +1,4 @@ -from numba.np.ufunc.deviceufunc import GUFuncEngine +from numba.cuda.deviceufunc import GUFuncEngine import unittest diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py b/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py index e88bbc9..657e9a1 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py @@ -145,7 +145,8 @@ def check_ipc_handle_serialization(self, index_arg=None, foreign=False): if driver.USE_NV_BINDING: self.assertEqual(ipch_recon.handle.reserved, ipch.handle.reserved) else: - self.assertEqual(tuple(ipch_recon.handle), tuple(ipch.handle)) + self.assertEqual(ipch_recon.handle.reserved[:], + ipch.handle.reserved[:]) # spawn new process for testing ctx = mp.get_context('spawn') @@ -262,7 +263,10 @@ def test_staged(self): if driver.USE_NV_BINDING: self.assertEqual(ipch_recon.handle.reserved, ipch.handle.reserved) else: - self.assertEqual(tuple(ipch_recon.handle), tuple(ipch.handle)) + self.assertEqual( + ipch_recon.handle.reserved[:], + ipch.handle.reserved[:] + ) self.assertEqual(ipch_recon.size, ipch.size) # Test on every CUDA devices diff --git a/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py b/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py new file mode 100644 index 0000000..e4ad7d0 --- /dev/null +++ b/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py @@ -0,0 +1,359 @@ +import unittest +import itertools +import numpy as np +from numba.cuda.cudadrv.dummyarray import Array +from numba.cuda.testing import skip_on_cudasim + + +@skip_on_cudasim("Tests internals of the CUDA driver device array") +class TestSlicing(unittest.TestCase): + + def assertSameContig(self, arr, nparr): + attrs = 'C_CONTIGUOUS', 'F_CONTIGUOUS' + for attr in attrs: + if arr.flags[attr] != nparr.flags[attr]: + if arr.size == 0 and nparr.size == 0: + # numpy <=1.7 bug that some empty array are contiguous and + # some are not + pass + else: + self.fail("contiguous flag mismatch:\ngot=%s\nexpect=%s" % + (arr.flags, nparr.flags)) + + #### 1D + + def test_slice0_1d(self): + nparr = np.empty(4) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + self.assertSameContig(arr, nparr) + xx = -2, -1, 0, 1, 2 + for x in xx: + expect = nparr[x:] + got = arr[x:] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_slice1_1d(self): + nparr = np.empty(4) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, -1, 0, 1, 2 + for x in xx: + expect = nparr[:x] + got = arr[:x] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) 
+ self.assertEqual(got.strides, expect.strides) + + def test_slice2_1d(self): + nparr = np.empty(4) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, -1, 0, 1, 2 + for x, y in itertools.product(xx, xx): + expect = nparr[x:y] + got = arr[x:y] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + #### 2D + + def test_slice0_2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, 0, 1, 2 + for x in xx: + expect = nparr[x:] + got = arr[x:] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + for x, y in itertools.product(xx, xx): + expect = nparr[x:, y:] + got = arr[x:, y:] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_slice1_2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, 0, 2 + for x in xx: + expect = nparr[:x] + got = arr[:x] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + self.assertSameContig(got, expect) + + for x, y in itertools.product(xx, xx): + expect = nparr[:x, :y] + got = arr[:x, :y] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + self.assertSameContig(got, expect) + + def test_slice2_2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, 0, 2 + for s, t, u, v in itertools.product(xx, xx, xx, xx): + expect = nparr[s:t, u:v] + got = arr[s:t, u:v] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + for x, y in itertools.product(xx, xx): + expect = nparr[s:t, u:v] + got = arr[s:t, u:v] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + #### Strided + + def test_strided_1d(self): + nparr = np.empty(4) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, -1, 1, 2 + for x in xx: + expect = nparr[::x] + got = arr[::x] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_strided_2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, -1, 1, 2 + for a, b in itertools.product(xx, xx): + expect = nparr[::a, ::b] + got = arr[::a, ::b] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_strided_3d(self): + nparr = np.empty((4, 5, 6)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, -1, 1, 2 + for a, b, c in itertools.product(xx, xx, xx): + expect = nparr[::a, ::b, ::c] + got = arr[::a, ::b, ::c] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_issue_2766(self): + z = np.empty((1, 2, 3)) + z = np.transpose(z, axes=(2, 0, 1)) + arr = Array.from_desc(0, z.shape, z.strides, z.itemsize) + self.assertEqual(z.flags['C_CONTIGUOUS'], arr.flags['C_CONTIGUOUS']) + self.assertEqual(z.flags['F_CONTIGUOUS'], arr.flags['F_CONTIGUOUS']) + 
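The slicing tests above all follow one pattern: build a dummy Array from a NumPy array's shape, strides and itemsize, apply the same slice to both objects, and check that the resulting shape and strides agree. A minimal sketch of that pattern, for orientation only and not part of the patch, assuming the Array.from_desc API exercised above:

import numpy as np
from numba.cuda.cudadrv.dummyarray import Array

nparr = np.empty((4, 5))
# from_desc builds an Array from a plain layout description:
# (offset, shape, strides, itemsize).
arr = Array.from_desc(0, nparr.shape, nparr.strides, nparr.dtype.itemsize)

# Slicing the dummy array should track NumPy's metadata exactly.
got, expect = arr[1:, 2:], nparr[1:, 2:]
assert got.shape == expect.shape
assert got.strides == expect.strides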
+ +@skip_on_cudasim("Tests internals of the CUDA driver device array") +class TestReshape(unittest.TestCase): + def test_reshape_2d2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(5, 4) + got = arr.reshape(5, 4)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_2d1d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(5 * 4) + got = arr.reshape(5 * 4)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_3d3d(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(5, 3, 4) + got = arr.reshape(5, 3, 4)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_3d2d(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(3 * 4, 5) + got = arr.reshape(3 * 4, 5)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_3d1d(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(3 * 4 * 5) + got = arr.reshape(3 * 4 * 5)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_infer2d2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(-1, 4) + got = arr.reshape(-1, 4)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_infer2d1d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(-1) + got = arr.reshape(-1)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_infer3d3d(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(5, -1, 4) + got = arr.reshape(5, -1, 4)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_infer3d2d(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(3, -1) + got = arr.reshape(3, -1)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_infer3d1d(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(-1) + got = arr.reshape(-1)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_infer_two_unknowns(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + + with self.assertRaises(ValueError) as raises: + arr.reshape(-1, -1, 3) + self.assertIn('can only specify one unknown dimension', + str(raises.exception)) + + def test_reshape_infer_invalid_shape(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + 
nparr.dtype.itemsize) + + with self.assertRaises(ValueError) as raises: + arr.reshape(-1, 7) + + expected_message = 'cannot infer valid shape for unknown dimension' + self.assertIn(expected_message, str(raises.exception)) + + +@skip_on_cudasim("Tests internals of the CUDA driver device array") +class TestSqueeze(unittest.TestCase): + def test_squeeze(self): + nparr = np.empty((1, 2, 1, 4, 1, 3)) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) + + def _assert_equal_shape_strides(arr1, arr2): + self.assertEqual(arr1.shape, arr2.shape) + self.assertEqual(arr1.strides, arr2.strides) + _assert_equal_shape_strides(arr, nparr) + _assert_equal_shape_strides(arr.squeeze()[0], nparr.squeeze()) + for axis in (0, 2, 4, (0, 2), (0, 4), (2, 4), (0, 2, 4)): + _assert_equal_shape_strides( + arr.squeeze(axis=axis)[0], nparr.squeeze(axis=axis) + ) + + def test_squeeze_invalid_axis(self): + nparr = np.empty((1, 2, 1, 4, 1, 3)) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) + with self.assertRaises(ValueError): + arr.squeeze(axis=1) + with self.assertRaises(ValueError): + arr.squeeze(axis=(2, 3)) + + +@skip_on_cudasim("Tests internals of the CUDA driver device array") +class TestExtent(unittest.TestCase): + def test_extent_1d(self): + nparr = np.empty(4) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + s, e = arr.extent + self.assertEqual(e - s, nparr.size * nparr.dtype.itemsize) + + def test_extent_2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + s, e = arr.extent + self.assertEqual(e - s, nparr.size * nparr.dtype.itemsize) + + def test_extent_iter_1d(self): + nparr = np.empty(4) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + [ext] = list(arr.iter_contiguous_extent()) + self.assertEqual(ext, arr.extent) + + def test_extent_iter_2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + [ext] = list(arr.iter_contiguous_extent()) + self.assertEqual(ext, arr.extent) + + self.assertEqual(len(list(arr[::2].iter_contiguous_extent())), 2) + + +@skip_on_cudasim("Tests internals of the CUDA driver device array") +class TestIterate(unittest.TestCase): + def test_for_loop(self): + # for #4201 + N = 5 + nparr = np.empty(N) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + + x = 0 # just a placeholder + # this loop should not raise AssertionError + for val in arr: + x = val # noqa: F841 + + +if __name__ == '__main__': + unittest.main() diff --git a/numba_cuda/numba/cuda/vectorizers.py b/numba_cuda/numba/cuda/vectorizers.py index 2565988..b4c6bcf 100644 --- a/numba_cuda/numba/cuda/vectorizers.py +++ b/numba_cuda/numba/cuda/vectorizers.py @@ -1,8 +1,8 @@ from numba import cuda from numpy import array as np_array -from numba.np.ufunc import deviceufunc -from numba.np.ufunc.deviceufunc import (UFuncMechanism, GeneralizedUFunc, - GUFuncCallSteps) +from numba.cuda import deviceufunc +from numba.cuda.deviceufunc import (UFuncMechanism, GeneralizedUFunc, + GUFuncCallSteps) class CUDAUFuncDispatcher(object): diff --git a/pyproject.toml b/pyproject.toml index 8bb38e4..4f2291b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,16 +1,14 @@ -[tool.scikit-build] -cmake.minimum-version = "3.26.4" -cmake.verbose = true -ninja.make-fallback = true -build-dir = "build/{wheel_tag}" -wheel.packages = ["numba_cuda"] +# Copyright (c) 2023-2024, 
NVIDIA CORPORATION. [build-system] -requires = ["scikit-build-core"] -build-backend = "scikit_build_core.build" +build-backend = "setuptools.build_meta" +requires = [ + "setuptools", + "wheel", +] [project] -name = "numba_cuda" +name = "numba-cuda" dynamic = ["version"] description = "CUDA target for Numba" readme = { file = "README.md", content-type = "text/markdown" } @@ -20,6 +18,7 @@ authors = [ ] license = { text = "BSD 2-clause" } requires-python = ">=3.9" +dependencies = ["numba>=0.59.1"] [project.urls] Homepage = "https://github.com/rapidsai/numba-cuda" @@ -27,7 +26,14 @@ Documentation = "https://github.com/rapidsai/numba-cuda/blob/main/README.md" Repository = "https://github.com/rapidsai/numba-cuda" License = "https://github.com/rapidsai/numba-cuda/blob/main/LICENSE" -[tool.scikit-build.metadata.version] -provider = "scikit_build_core.metadata.regex" -input = "numba_cuda/VERSION" -regex = "(?P.*)" +[tool.setuptools.dynamic] +version = {attr = "numba_cuda.__version__"} + +[tool.setuptools] +license-files = ["LICENSE"] + +[tool.setuptools.packages.find] +include = ["numba_cuda*"] + +[tool.setuptools.package-data] +"*" = ["*.cu", "*.h", "*.hpp", "*.ptx", "VERSION"] diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..98a1061 --- /dev/null +++ b/setup.py @@ -0,0 +1,70 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import pathlib + +from setuptools import setup +from setuptools.command.build_py import build_py +from setuptools.command.editable_wheel import editable_wheel, _TopLevelFinder + +REDIRECTOR_PTH = "_numba_cuda_redirector.pth" +REDIRECTOR_PY = "_numba_cuda_redirector.py" +SITE_PACKAGES = pathlib.Path("site-packages") + + +# Adapted from https://stackoverflow.com/a/71137790 +class build_py_with_redirector(build_py): # noqa: N801 + """Include the redirector files in the generated wheel.""" + + def copy_redirector_file(self, source, destination="."): + destination = pathlib.Path(self.build_lib) / destination + self.copy_file(str(source), str(destination), preserve_mode=0) + + def run(self): + super().run() + self.copy_redirector_file(SITE_PACKAGES / REDIRECTOR_PTH) + self.copy_redirector_file(SITE_PACKAGES / REDIRECTOR_PY) + + def get_source_files(self): + src = super().get_source_files() + src.extend([ + str(SITE_PACKAGES / REDIRECTOR_PTH), + str(SITE_PACKAGES / REDIRECTOR_PY), + ]) + return src + + def get_output_mapping(self): + mapping = super().get_output_mapping() + build_lib = pathlib.Path(self.build_lib) + mapping[str(build_lib / REDIRECTOR_PTH)] = REDIRECTOR_PTH + mapping[str(build_lib / REDIRECTOR_PY)] = REDIRECTOR_PY + return mapping + + +class TopLevelFinderWithRedirector(_TopLevelFinder): + """Include the redirector files in the editable wheel.""" + + def get_implementation(self): + for item in super().get_implementation(): + yield item + + with open(SITE_PACKAGES / REDIRECTOR_PTH) as f: + yield (REDIRECTOR_PTH, f.read()) + + with open(SITE_PACKAGES / REDIRECTOR_PY) as f: + yield (REDIRECTOR_PY, f.read()) + + +class editable_wheel_with_redirector(editable_wheel): + def _select_strategy(self, name, tag, build_lib): + # The default mode is "lenient" - others are "strict" and "compat". + # "compat" is deprecated. "strict" creates a tree of links to files in + # the repo. It could be implemented, but we only handle the default + # case for now. + if self.mode is not None and self.mode != "lenient": + raise RuntimeError("Only lenient mode is supported for editable " + f"install. 
Current mode is {self.mode}") + + return TopLevelFinderWithRedirector(self.distribution, name) + + +setup(cmdclass={"build_py": build_py_with_redirector, + "editable_wheel": editable_wheel_with_redirector}) diff --git a/site-packages/_numba_cuda_redirector.py b/site-packages/_numba_cuda_redirector.py index 93e020a..3765fd1 100644 --- a/site-packages/_numba_cuda_redirector.py +++ b/site-packages/_numba_cuda_redirector.py @@ -47,8 +47,10 @@ def ensure_initialized(self): return False self.numba_path = numba_search_locations[0] - self.numba_cuda_path = str((pathlib.Path(numba_cuda_search_locations[0]) / - 'numba')) + + location = numba_cuda_search_locations[0] + self.numba_cuda_path = str((pathlib.Path(location) / 'numba')) + self.initialized = True return True From 60d4ca7a2e29708be90028308c2d7a66d5d176c6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Sun, 4 Aug 2024 10:59:30 -0700 Subject: [PATCH 08/52] style and other fixes --- numba_cuda/numba/cuda/cudadrv/driver.py | 36 ++++++++++++++----- .../numba/cuda/cudadrv/linkable_code.py | 1 + 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index d9d1ded..70a8c7a 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -36,6 +36,7 @@ from .drvapi import API_PROTOTYPES from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid from .mappings import FILE_EXTENSION_MAP +from .linkable_code import LinkableCode from numba.cuda.cudadrv import enums, drvapi, nvrtc USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING @@ -56,6 +57,12 @@ _py_decref.argtypes = [ctypes.py_object] _py_incref.argtypes = [ctypes.py_object] +pynvjitlink_import_err = None +try: + from pynvjitlink.api import NvJitLinker, NvJitLinkError +except ImportError as err: + pynvjitlink_import_err = err + def make_logger(): logger = logging.getLogger(__name__) @@ -80,6 +87,7 @@ def make_logger(): logger.addHandler(logging.NullHandler()) return logger + class DeadMemoryError(RuntimeError): pass @@ -2550,14 +2558,24 @@ class Linker(metaclass=ABCMeta): """Abstract base class for linkers""" @classmethod - def new(cls, max_registers=0, lineinfo=False, cc=None, lto=None, additional_flags=None): + def new(cls, + max_registers=0, + lineinfo=False, + cc=None, + lto=None, + additional_flags=None + ): if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: # TODO: circular from . 
import runtime - driver_ver, runtime_ver = driver.get_version(), runtime.get_version() + driver_ver, runtime_ver = ( + driver.get_version(), runtime.get_version() + ) if driver_ver >= (12, 0) and runtime_ver > driver_ver: # runs once - return PyNvJitLinker(max_registers, lineinfo, cc, lto, additional_flags) + return PyNvJitLinker( + max_registers, lineinfo, cc, lto, additional_flags + ) else: return MVCLinker(max_registers, lineinfo, cc) @@ -2916,6 +2934,7 @@ def complete(self): cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char)) return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,))) + class PyNvJitLinker(Linker): def __init__( self, @@ -2925,11 +2944,8 @@ def __init__( lto=False, additional_flags=None, ): - try: - from pynvjitlink.api import NvJitLinker, NvJitLinkError - except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE) from err - + if pynvjitlink_import_err is not None: + raise ImportError(_MVC_ERROR_MESSAGE) if cc is None: raise RuntimeError("PyNvJitLinker requires CC to be specified") if not any(isinstance(cc, t) for t in [list, tuple]): @@ -2997,7 +3013,9 @@ def add_file_guess_ext(self, path_or_code): if path_or_code.kind == "cu": self.add_cu(path_or_code.data, path_or_code.name) else: - self.add_data(path_or_code.data, path_or_code.kind, path_or_code.name) + self.add_data( + path_or_code.data, path_or_code.kind, path_or_code.name + ) def add_file(self, path, kind): try: diff --git a/numba_cuda/numba/cuda/cudadrv/linkable_code.py b/numba_cuda/numba/cuda/cudadrv/linkable_code.py index 5280829..1c1ab4d 100644 --- a/numba_cuda/numba/cuda/cudadrv/linkable_code.py +++ b/numba_cuda/numba/cuda/cudadrv/linkable_code.py @@ -1,5 +1,6 @@ from .mappings import FILE_EXTENSION_MAP + class LinkableCode: """An object that can be passed in the `link` list argument to `@cuda.jit` kernels to supply code to be linked from memory.""" From 56db9c83d46e84a0aa7b36f6e56a94e018d8adb7 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 8 Aug 2024 07:50:10 -0700 Subject: [PATCH 09/52] cleanup --- numba_cuda/numba/cuda/runtime/nrt.cu | 107 --------------------------- numba_cuda/numba/cuda/target.py | 1 - 2 files changed, 108 deletions(-) delete mode 100644 numba_cuda/numba/cuda/runtime/nrt.cu diff --git a/numba_cuda/numba/cuda/runtime/nrt.cu b/numba_cuda/numba/cuda/runtime/nrt.cu deleted file mode 100644 index 6ced244..0000000 --- a/numba_cuda/numba/cuda/runtime/nrt.cu +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef _NRT_H -#define _NRT_H - -#include - -typedef __device__ void (*NRT_dtor_function)(void* ptr, size_t size, void* info); -typedef __device__ void (*NRT_dealloc_func)(void* ptr, void* dealloc_info); - -typedef struct MemInfo NRT_MemInfo; - -extern "C" { -struct MemInfo { - cuda::atomic refct; - NRT_dtor_function dtor; - void* dtor_info; - void* data; - size_t size; -}; -} - -// Globally needed variables -struct NRT_MemSys { - struct { - bool enabled; - cuda::atomic alloc; - cuda::atomic free; - cuda::atomic mi_alloc; - cuda::atomic mi_free; - } stats; -}; - -/* The Memory System object */ -__device__ NRT_MemSys* TheMSys; - -extern "C" __device__ void* NRT_Allocate(size_t size) -{ - void* ptr = NULL; - ptr = malloc(size); -// if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; } - return ptr; -} - -extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi, - void* data, - size_t size, - NRT_dtor_function dtor, - void* dtor_info) -// NRT_MemSys* TheMSys) -{ - mi->refct = 1; /* starts with 1 refct */ - mi->dtor = dtor; - mi->dtor_info = dtor_info; - mi->data = 
data; - mi->size = size; -// if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; } -} - -__device__ NRT_MemInfo* NRT_MemInfo_new( - void* data, size_t size, NRT_dtor_function dtor, void* dtor_info) -{ - NRT_MemInfo* mi = (NRT_MemInfo*)NRT_Allocate(sizeof(NRT_MemInfo)); - if (mi != NULL) { NRT_MemInfo_init(mi, data, size, dtor, dtor_info); } - return mi; -} - -extern "C" __device__ void NRT_Free(void* ptr) -{ - free(ptr); - //if (TheMSys->stats.enabled) { TheMSys->stats.free++; } -} - -extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi) -{ - NRT_Free(mi); -} - -extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi) -{ - NRT_dealloc(mi); - //if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; } -} -extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi) -{ - if (mi->dtor) /* We have a destructor */ - mi->dtor(mi->data, mi->size, NULL); - /* Clear and release MemInfo */ - NRT_MemInfo_destroy(mi); -} - -/* - c++ version of the NRT_decref function that usually is added to - the final kernel link in PTX form by numba. This version may be - used by c++ APIs that accept ownership of live objects and must - manage them going forward. -*/ -extern "C" __device__ void NRT_decref(NRT_MemInfo* mi) -{ - mi->refct--; - if (mi->refct == 0) { NRT_MemInfo_call_dtor(mi); } -} - -#endif - -extern "C" __device__ void NRT_incref(NRT_MemInfo* mi) -{ - mi->refct++; -} diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index f43b099..6402ff3 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -67,7 +67,6 @@ def resolve_value_type(self, val): class CUDATargetContext(BaseContext): implement_powi_as_math_call = True strict_alignment = True - enable_nrt = True def __init__(self, typingctx, target='cuda'): super().__init__(typingctx, target) From c57053c7b94b2b7b9f551eee226c7408c318d534 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 8 Aug 2024 07:51:59 -0700 Subject: [PATCH 10/52] bifurcate error messages --- numba_cuda/numba/cuda/cudadrv/driver.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 70a8c7a..b6048a3 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2654,11 +2654,15 @@ def complete(self): """ -_MVC_ERROR_MESSAGE = ( +_MVC_ERROR_MESSAGE_CU11 = ( "Minor version compatibility requires ptxcompiler and cubinlinker packages " "to be available" ) +_MVC_ERROR_MESSAGE_CU12 = ( + "Minor version compatibility requires pynvjitlink package to be available" +) + class MVCLinker(Linker): """ @@ -2669,7 +2673,7 @@ def __init__(self, max_registers=None, lineinfo=False, cc=None): try: from cubinlinker import CubinLinker except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE) from err + raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err if cc is None: raise RuntimeError("MVCLinker requires Compute Capability to be " @@ -2701,7 +2705,7 @@ def add_ptx(self, ptx, name=''): from ptxcompiler import compile_ptx from cubinlinker import CubinLinkerError except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE) from err + raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err compile_result = compile_ptx(ptx.decode(), self.ptx_compile_options) try: self._linker.add_cubin(compile_result.compiled_program, name) @@ -2712,7 +2716,7 @@ def add_file(self, path, kind): try: from cubinlinker import CubinLinkerError except ImportError as err: - 
raise ImportError(_MVC_ERROR_MESSAGE) from err + raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err try: with open(path, 'rb') as f: @@ -2741,7 +2745,7 @@ def complete(self): try: from cubinlinker import CubinLinkerError except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE) from err + raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err try: return self._linker.complete() @@ -2945,7 +2949,7 @@ def __init__( additional_flags=None, ): if pynvjitlink_import_err is not None: - raise ImportError(_MVC_ERROR_MESSAGE) + raise ImportError(_MVC_ERROR_MESSAGE_CU12) if cc is None: raise RuntimeError("PyNvJitLinker requires CC to be specified") if not any(isinstance(cc, t) for t in [list, tuple]): From 363b86dd4e5be5982cfcd85aa9b9858fa3bc5bf2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 12 Aug 2024 06:05:13 -0700 Subject: [PATCH 11/52] partially address reviews --- numba_cuda/numba/cuda/cudadrv/driver.py | 64 +++++++++++++++++-------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index b6048a3..0748931 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -19,6 +19,7 @@ import warnings import logging import threading +import traceback import asyncio import pathlib from itertools import product @@ -64,6 +65,38 @@ pynvjitlink_import_err = err +def _readenv(name, ctor, default): + value = os.environ.get(name) + if value is None: + return default() if callable(default) else default + try: + return ctor(value) + except Exception: + warnings.warn( + f"Environment variable '{name}' is defined but its associated " + f"value '{value}' could not be parsed.\n" + "The parse failed with exception:\n" + f"{traceback.format_exc()}", + RuntimeWarning + ) + return default + + +config.ENABLE_PYNVJITLINK = False +if _readenv("ENABLE_PYNVJITLINK", bool, False): + config.ENABLE_PYNVJITLINK = True + + +_MVC_ERROR_MESSAGE_CU11 = ( + "Minor version compatibility requires ptxcompiler and cubinlinker packages " + "to be available" +) + +_MVC_ERROR_MESSAGE_CU12 = ( + "Minor version compatibility requires pynvjitlink package to be available" +) + + def make_logger(): logger = logging.getLogger(__name__) # is logging configured? @@ -440,7 +473,7 @@ def get_active_context(self): def get_version(self): """ - Returns the CUDA Runtime version as a tuple (major, minor). + Returns the CUDA Driver version as a tuple (major, minor). """ if USE_NV_BINDING: version = driver.cuDriverGetVersion() @@ -2566,23 +2599,26 @@ def new(cls, additional_flags=None ): if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: - # TODO: circular - from . 
import runtime + from numba.cuda.cudadrv import runtime driver_ver, runtime_ver = ( driver.get_version(), runtime.get_version() ) if driver_ver >= (12, 0) and runtime_ver > driver_ver: # runs once - return PyNvJitLinker( - max_registers, lineinfo, cc, lto, additional_flags - ) + linker = PyNvJitLinker else: - return MVCLinker(max_registers, lineinfo, cc) + linker = MVCLinker elif USE_NV_BINDING: - return CudaPythonLinker(max_registers, lineinfo, cc) + linker = CudaPythonLinker + else: + linker = CtypesLinker + if linker is PyNvJitLinker: + return linker(max_registers, lineinfo, cc, lto, additional_flags) + elif additional_flags or lto: + raise ValueError("LTO and additional flags require PyNvJitLinker") else: - return CtypesLinker(max_registers, lineinfo, cc) + return linker(max_registers, lineinfo, cc) @abstractmethod def __init__(self, max_registers, lineinfo, cc): @@ -2654,16 +2690,6 @@ def complete(self): """ -_MVC_ERROR_MESSAGE_CU11 = ( - "Minor version compatibility requires ptxcompiler and cubinlinker packages " - "to be available" -) - -_MVC_ERROR_MESSAGE_CU12 = ( - "Minor version compatibility requires pynvjitlink package to be available" -) - - class MVCLinker(Linker): """ Linker supporting Minor Version Compatibility, backed by the cubinlinker From 32164e92e775173abcb077fff97716cc63ec5548 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 12 Aug 2024 07:05:43 -0700 Subject: [PATCH 12/52] move add_file_guess_ext logic to Linker base class --- numba_cuda/numba/cuda/cudadrv/driver.py | 73 ++++++++++------------- numba_cuda/numba/cuda/cudadrv/enums.py | 5 +- numba_cuda/numba/cuda/cudadrv/mappings.py | 1 + 3 files changed, 36 insertions(+), 43 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 0748931..fe46630 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2667,19 +2667,38 @@ def add_cu_file(self, path): cu = f.read() self.add_cu(cu, os.path.basename(path)) - def add_file_guess_ext(self, path): + def add_file_guess_ext(self, path_or_code): """Add a file to the link, guessing its type from its extension.""" - ext = os.path.splitext(path)[1][1:] - if ext == '': - raise RuntimeError("Don't know how to link file with no extension") - elif ext == 'cu': - self.add_cu_file(path) - else: - kind = FILE_EXTENSION_MAP.get(ext, None) - if kind is None: - raise RuntimeError("Don't know how to link file with extension " - f".{ext}") - self.add_file(path, kind) + if isinstance(path_or_code, str): + ext = pathlib.Path(path_or_code).suffix + if ext == '': + raise RuntimeError( + "Don't know how to link file with no extension" + ) + elif ext == '.cu': + self.add_cu_file(path_or_code) + else: + kind = FILE_EXTENSION_MAP.get(ext, None) + if kind is None: + raise RuntimeError( + "Don't know how to link file with extension " + f".{ext}" + ) + self.add_file(path_or_code, kind) + return + else: + # Otherwise, we should have been given a LinkableCode object + if not isinstance(path_or_code, LinkableCode): + raise TypeError( + "Expected path to file or a LinkableCode object" + ) + + if path_or_code.kind == "cu": + self.add_cu(path_or_code.data, path_or_code.name) + else: + self.add_data( + path_or_code.data, path_or_code.kind, path_or_code.name + ) @abstractmethod def complete(self): @@ -3017,36 +3036,6 @@ def add_ltoir(self, ltoir, name=""): def add_object(self, obj, name=""): self._linker.add_object(obj, name) - def add_file_guess_ext(self, path_or_code): - # Numba's 
add_file_guess_ext expects to always be passed a path to a - # file that it will load from the filesystem to link. We augment it - # here with the ability to provide a file from memory. - - # To maintain compatibility with the original interface, all strings - # are treated as paths in the filesystem. - if isinstance(path_or_code, str): - # Upstream numba does not yet recognize LTOIR, so handle that - # separately here. - extension = pathlib.Path(path_or_code).suffix - if extension == ".ltoir": - self.add_file(path_or_code, "ltoir") - else: - # Use Numba's logic for non-LTOIR - super().add_file_guess_ext(path_or_code) - - return - - # Otherwise, we should have been given a LinkableCode object - if not isinstance(path_or_code, LinkableCode): - raise TypeError("Expected path to file or a LinkableCode object") - - if path_or_code.kind == "cu": - self.add_cu(path_or_code.data, path_or_code.name) - else: - self.add_data( - path_or_code.data, path_or_code.kind, path_or_code.name - ) - def add_file(self, path, kind): try: with open(path, "rb") as f: diff --git a/numba_cuda/numba/cuda/cudadrv/enums.py b/numba_cuda/numba/cuda/cudadrv/enums.py index 3431cf7..917dbb1 100644 --- a/numba_cuda/numba/cuda/cudadrv/enums.py +++ b/numba_cuda/numba/cuda/cudadrv/enums.py @@ -309,7 +309,10 @@ # Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY CU_JIT_INPUT_LIBRARY = 4 -CU_JIT_NUM_INPUT_TYPES = 6 +# LTO IR +CU_JIT_INPUT_LTO_IR = 5 + +CU_JIT_NUM_INPUT_TYPES = 7 # Online compiler and linker options diff --git a/numba_cuda/numba/cuda/cudadrv/mappings.py b/numba_cuda/numba/cuda/cudadrv/mappings.py index 3750324..15fe79b 100644 --- a/numba_cuda/numba/cuda/cudadrv/mappings.py +++ b/numba_cuda/numba/cuda/cudadrv/mappings.py @@ -19,4 +19,5 @@ 'lib': enums.CU_JIT_INPUT_LIBRARY, 'cubin': enums.CU_JIT_INPUT_CUBIN, 'fatbin': enums.CU_JIT_INPUT_FATBINARY, + 'ltoir': enums.CU_JIT_INPUT_LTO_IR, } From c3b908403dd9c99ef41b1b5b17cddb9819a7bc41 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 19 Aug 2024 06:50:26 -0700 Subject: [PATCH 13/52] refactor __new__ logic --- numba_cuda/numba/cuda/cudadrv/driver.py | 44 ++++++++++++++++++------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index fe46630..67603c4 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2598,21 +2598,41 @@ def new(cls, lto=None, additional_flags=None ): - if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: - from numba.cuda.cudadrv import runtime - driver_ver, runtime_ver = ( - driver.get_version(), runtime.get_version() + + driver_ver = driver.get_version() + if ( + config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY + and driver_ver >= (12, 0) + ): + raise ValueError( + "Use ENABLE_PYNVJITLINK for CUDA >= 12.0 MVC" + ) + if config.ENABLE_PYNVJITLINK and driver_ver < (12, 0): + raise ValueError( + "Use CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY " + "for CUDA < 12.0 MVC" + ) + if ( + config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY + and config.ENABLE_PYNVJITLINK + ): + raise ValueError( + "can't set both config.ENABLE_PYNVJITLINK " + "and config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY " + "at the same time" ) - if driver_ver >= (12, 0) and runtime_ver > driver_ver: - # runs once - linker = PyNvJitLinker - else: - linker = MVCLinker - elif USE_NV_BINDING: - linker = CudaPythonLinker + if config.ENABLE_PYNVJITLINK: + linker = PyNvJitLinker + + elif config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: 
+ linker = MVCLinker else: - linker = CtypesLinker + if USE_NV_BINDING: + linker = CudaPythonLinker + else: + linker = CtypesLinker + if linker is PyNvJitLinker: return linker(max_registers, lineinfo, cc, lto, additional_flags) elif additional_flags or lto: From 2c940ee07c41f2baf6803256bcc14969a8663f4b Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 19 Aug 2024 06:59:30 -0700 Subject: [PATCH 14/52] address reviews --- numba_cuda/numba/cuda/cudadrv/driver.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 67603c4..9315343 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -82,7 +82,6 @@ def _readenv(name, ctor, default): return default -config.ENABLE_PYNVJITLINK = False if _readenv("ENABLE_PYNVJITLINK", bool, False): config.ENABLE_PYNVJITLINK = True @@ -93,7 +92,7 @@ def _readenv(name, ctor, default): ) _MVC_ERROR_MESSAGE_CU12 = ( - "Minor version compatibility requires pynvjitlink package to be available" + "Using pynvjitlink requires the pynvjitlink package to be available" ) From a8c38b6613985e9ca90d4a11c299094b18e8e82b Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 21 Aug 2024 12:33:27 -0700 Subject: [PATCH 15/52] refactor config logic --- numba_cuda/numba/cuda/cudadrv/driver.py | 33 +++++++++++++++---------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 9315343..9efac41 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -58,11 +58,7 @@ _py_decref.argtypes = [ctypes.py_object] _py_incref.argtypes = [ctypes.py_object] -pynvjitlink_import_err = None -try: - from pynvjitlink.api import NvJitLinker, NvJitLinkError -except ImportError as err: - pynvjitlink_import_err = err +_MVC_ERROR_MESSAGE_CU12 = None def _readenv(name, ctor, default): @@ -82,10 +78,6 @@ def _readenv(name, ctor, default): return default -if _readenv("ENABLE_PYNVJITLINK", bool, False): - config.ENABLE_PYNVJITLINK = True - - _MVC_ERROR_MESSAGE_CU11 = ( "Minor version compatibility requires ptxcompiler and cubinlinker packages " "to be available" @@ -95,6 +87,23 @@ def _readenv(name, ctor, default): "Using pynvjitlink requires the pynvjitlink package to be available" ) +ENABLE_PYNVJITLINK = ( + _readenv("ENABLE_PYNVJITLINK", bool, False) + or config.ENABLE_PYNVJITLINK +) + +if ENABLE_PYNVJITLINK: + try: + from pynvjitlink.api import NvJitLinker, NvJitLinkError + except ImportError: + raise ImportError(_MVC_ERROR_MESSAGE_CU12) + + if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: + raise ValueError( + "Can't set CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY and " + "ENABLE_PYNVJITLINK at the same time" + ) + def make_logger(): logger = logging.getLogger(__name__) @@ -2608,8 +2617,7 @@ def new(cls, ) if config.ENABLE_PYNVJITLINK and driver_ver < (12, 0): raise ValueError( - "Use CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY " - "for CUDA < 12.0 MVC" + "Enabling pynvjitlink requires CUDA 12." 
) if ( config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY @@ -3012,8 +3020,7 @@ def __init__( lto=False, additional_flags=None, ): - if pynvjitlink_import_err is not None: - raise ImportError(_MVC_ERROR_MESSAGE_CU12) + if cc is None: raise RuntimeError("PyNvJitLinker requires CC to be specified") if not any(isinstance(cc, t) for t in [list, tuple]): From 421fdfbefd01acc86022aa15344cab7f90ed38b0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 22 Aug 2024 02:16:52 -0700 Subject: [PATCH 16/52] continue addressing reviews --- numba_cuda/numba/cuda/cudadrv/driver.py | 24 +++++++----------------- numba_cuda/numba/cuda/cudadrv/enums.py | 2 +- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 9efac41..6a16403 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2695,7 +2695,11 @@ def add_cu_file(self, path): self.add_cu(cu, os.path.basename(path)) def add_file_guess_ext(self, path_or_code): - """Add a file to the link, guessing its type from its extension.""" + """ + Add a file or LinkableCode object to the link. If a file is + passed, the type will be inferred from the extension. A LinkableCode + object represents a file already in memory. + """ if isinstance(path_or_code, str): ext = pathlib.Path(path_or_code).suffix if ext == '': @@ -2704,6 +2708,8 @@ def add_file_guess_ext(self, path_or_code): ) elif ext == '.cu': self.add_cu_file(path_or_code) + elif ext == ".ltoir": + self.add_file(path_or_code, "ltoir") else: kind = FILE_EXTENSION_MAP.get(ext, None) if kind is None: @@ -3093,22 +3099,6 @@ def add_data(self, data, kind, name): except NvJitLinkError as e: raise LinkerError from e - def add_cu(self, cu, name): - with driver.get_active_context() as ac: - dev = driver.get_device(ac.devnum) - cc = dev.compute_capability - - ptx, log = nvrtc.compile(cu, name, cc) - - if config.DUMP_ASSEMBLY: - print(("ASSEMBLY %s" % name).center(80, "-")) - print(ptx) - print("=" * 80) - - # Link the program's PTX using the normal linker mechanism - ptx_name = os.path.splitext(name)[0] + ".ptx" - self.add_ptx(ptx.encode(), ptx_name) - def complete(self): try: cubin = self._linker.get_linked_cubin() diff --git a/numba_cuda/numba/cuda/cudadrv/enums.py b/numba_cuda/numba/cuda/cudadrv/enums.py index 917dbb1..25bbbe1 100644 --- a/numba_cuda/numba/cuda/cudadrv/enums.py +++ b/numba_cuda/numba/cuda/cudadrv/enums.py @@ -312,7 +312,7 @@ # LTO IR CU_JIT_INPUT_LTO_IR = 5 -CU_JIT_NUM_INPUT_TYPES = 7 +CU_JIT_NUM_INPUT_TYPES = 6 # Online compiler and linker options From 16314a76cb3366f8f215521c56efaa29fe27b8f4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 22 Aug 2024 02:19:28 -0700 Subject: [PATCH 17/52] rename errors --- numba_cuda/numba/cuda/cudadrv/driver.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 6a16403..a1a5c0c 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -58,7 +58,7 @@ _py_decref.argtypes = [ctypes.py_object] _py_incref.argtypes = [ctypes.py_object] -_MVC_ERROR_MESSAGE_CU12 = None +_PYNVJITLINK_ERR_MSG = None def _readenv(name, ctor, default): @@ -78,12 +78,12 @@ def _readenv(name, ctor, default): return default -_MVC_ERROR_MESSAGE_CU11 = ( +_MVC_ERROR_MESSAGE = ( "Minor version compatibility requires ptxcompiler and cubinlinker packages " "to be available" ) 
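These two messages guard the two opt-in linker paths in this series: cubinlinker/ptxcompiler for CUDA 11 minor version compatibility, and pynvjitlink for CUDA 12. For reference, a hedged sketch of how the pynvjitlink path is switched on, based on the _readenv("ENABLE_PYNVJITLINK", bool, False) and config.ENABLE_PYNVJITLINK handling added earlier in the series; illustrative only, not part of the patch:

# Option 1: environment variable, read once when the driver module is imported:
#   ENABLE_PYNVJITLINK=1 python app.py
# Note that, as written, the value is parsed with bool(), so any non-empty
# string (including "0") counts as enabled.
# Option 2: set the config attribute before any kernel is compiled, as the
# new nvjitlink tests do.
from numba import config
config.ENABLE_PYNVJITLINK = True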
-_MVC_ERROR_MESSAGE_CU12 = ( +_PYNVJITLINK_ERR_MSG = ( "Using pynvjitlink requires the pynvjitlink package to be available" ) @@ -96,7 +96,7 @@ def _readenv(name, ctor, default): try: from pynvjitlink.api import NvJitLinker, NvJitLinkError except ImportError: - raise ImportError(_MVC_ERROR_MESSAGE_CU12) + raise ImportError(_PYNVJITLINK_ERR_MSG) if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: raise ValueError( @@ -2751,7 +2751,7 @@ def __init__(self, max_registers=None, lineinfo=False, cc=None): try: from cubinlinker import CubinLinker except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err + raise ImportError(_MVC_ERROR_MESSAGE) from err if cc is None: raise RuntimeError("MVCLinker requires Compute Capability to be " @@ -2783,7 +2783,7 @@ def add_ptx(self, ptx, name=''): from ptxcompiler import compile_ptx from cubinlinker import CubinLinkerError except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err + raise ImportError(_MVC_ERROR_MESSAGE) from err compile_result = compile_ptx(ptx.decode(), self.ptx_compile_options) try: self._linker.add_cubin(compile_result.compiled_program, name) @@ -2794,7 +2794,7 @@ def add_file(self, path, kind): try: from cubinlinker import CubinLinkerError except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err + raise ImportError(_MVC_ERROR_MESSAGE) from err try: with open(path, 'rb') as f: @@ -2823,7 +2823,7 @@ def complete(self): try: from cubinlinker import CubinLinkerError except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err + raise ImportError(_MVC_ERROR_MESSAGE) from err try: return self._linker.complete() From 41d85a9254076887ec14ea6e0ea7edc943956601 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 22 Aug 2024 03:31:25 -0700 Subject: [PATCH 18/52] minor cleanup --- numba_cuda/numba/cuda/cudadrv/driver.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index a1a5c0c..483df13 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -58,8 +58,6 @@ _py_decref.argtypes = [ctypes.py_object] _py_incref.argtypes = [ctypes.py_object] -_PYNVJITLINK_ERR_MSG = None - def _readenv(name, ctor, default): value = os.environ.get(name) @@ -83,7 +81,7 @@ def _readenv(name, ctor, default): "to be available" ) -_PYNVJITLINK_ERR_MSG = ( +_PYNVJITLINK_ERR_MESSAGE = ( "Using pynvjitlink requires the pynvjitlink package to be available" ) @@ -96,7 +94,7 @@ def _readenv(name, ctor, default): try: from pynvjitlink.api import NvJitLinker, NvJitLinkError except ImportError: - raise ImportError(_PYNVJITLINK_ERR_MSG) + raise ImportError(_PYNVJITLINK_ERR_MESSAGE) if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: raise ValueError( From f7939b66a49a1c39b3ad3ac707ee745d68c13b99 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 22 Aug 2024 07:10:30 -0500 Subject: [PATCH 19/52] Apply suggestions from code review Co-authored-by: Graham Markall <535640+gmarkall@users.noreply.github.com> --- numba_cuda/numba/cuda/cudadrv/driver.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 483df13..45f3c72 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2640,7 +2640,7 @@ def new(cls, if linker is PyNvJitLinker: return 
linker(max_registers, lineinfo, cc, lto, additional_flags) - elif additional_flags or lto: + elif additional_flags is not None or lto is not None: raise ValueError("LTO and additional flags require PyNvJitLinker") else: return linker(max_registers, lineinfo, cc) @@ -2706,8 +2706,6 @@ def add_file_guess_ext(self, path_or_code): ) elif ext == '.cu': self.add_cu_file(path_or_code) - elif ext == ".ltoir": - self.add_file(path_or_code, "ltoir") else: kind = FILE_EXTENSION_MAP.get(ext, None) if kind is None: @@ -3087,7 +3085,7 @@ def add_data(self, data, kind, name): return self.add_ptx(data, name) elif kind == FILE_EXTENSION_MAP["o"]: fn = self._linker.add_object - elif kind == "ltoir": + elif kind == FILE_EXTENSION_MAP["ltoir"]: fn = self._linker.add_ltoir else: raise LinkerError(f"Don't know how to link {kind}") From 91f06a89f15161eaa2c0f4fcf563a53fcc4dada6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 22 Aug 2024 05:40:49 -0700 Subject: [PATCH 20/52] address reviews --- numba_cuda/numba/cuda/cudadrv/driver.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 45f3c72..1a633db 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -81,20 +81,22 @@ def _readenv(name, ctor, default): "to be available" ) -_PYNVJITLINK_ERR_MESSAGE = ( - "Using pynvjitlink requires the pynvjitlink package to be available" -) ENABLE_PYNVJITLINK = ( _readenv("ENABLE_PYNVJITLINK", bool, False) - or config.ENABLE_PYNVJITLINK + or getattr(config, "ENABLE_PYNVJITLINK", None) ) +if not hasattr(config, "ENABLE_PYNVJITLINK"): + config.ENABLE_PYNVJITLINK = ENABLE_PYNVJITLINK + if ENABLE_PYNVJITLINK: try: from pynvjitlink.api import NvJitLinker, NvJitLinkError except ImportError: - raise ImportError(_PYNVJITLINK_ERR_MESSAGE) + raise ImportError( + "Using pynvjitlink requires the pynvjitlink package to be available" + ) if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: raise ValueError( @@ -2617,16 +2619,6 @@ def new(cls, raise ValueError( "Enabling pynvjitlink requires CUDA 12." ) - if ( - config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY - and config.ENABLE_PYNVJITLINK - ): - raise ValueError( - "can't set both config.ENABLE_PYNVJITLINK " - "and config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY " - "at the same time" - ) - if config.ENABLE_PYNVJITLINK: linker = PyNvJitLinker From 0541dcfb919f141ed6f8697f7173a0e3f0d87270 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 22 Aug 2024 05:55:11 -0700 Subject: [PATCH 21/52] bug fixes and map ltoir to CU_JIT_INPUT_NVVM --- numba_cuda/numba/cuda/cudadrv/driver.py | 2 +- numba_cuda/numba/cuda/cudadrv/mappings.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 1a633db..108b92b 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2691,7 +2691,7 @@ def add_file_guess_ext(self, path_or_code): object represents a file already in memory. 
""" if isinstance(path_or_code, str): - ext = pathlib.Path(path_or_code).suffix + ext = pathlib.Path(path_or_code).suffix.lstrip('.') if ext == '': raise RuntimeError( "Don't know how to link file with no extension" diff --git a/numba_cuda/numba/cuda/cudadrv/mappings.py b/numba_cuda/numba/cuda/cudadrv/mappings.py index 15fe79b..a552af1 100644 --- a/numba_cuda/numba/cuda/cudadrv/mappings.py +++ b/numba_cuda/numba/cuda/cudadrv/mappings.py @@ -10,6 +10,7 @@ 'lib': jitty.CU_JIT_INPUT_LIBRARY, 'cubin': jitty.CU_JIT_INPUT_CUBIN, 'fatbin': jitty.CU_JIT_INPUT_FATBINARY, + 'ltoir': jitty.CU_JIT_INPUT_NVVM, } else: FILE_EXTENSION_MAP = { From 710f8cb16395d6fe8c566091658dfd65c5a1cb80 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 4 Sep 2024 05:02:44 -0700 Subject: [PATCH 22/52] CU_JIT_INPUT_LTO_IR -> CU_JIT_INPUT_NVVM --- numba_cuda/numba/cuda/cudadrv/enums.py | 2 +- numba_cuda/numba/cuda/cudadrv/mappings.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/enums.py b/numba_cuda/numba/cuda/cudadrv/enums.py index 25bbbe1..0b1b155 100644 --- a/numba_cuda/numba/cuda/cudadrv/enums.py +++ b/numba_cuda/numba/cuda/cudadrv/enums.py @@ -310,7 +310,7 @@ CU_JIT_INPUT_LIBRARY = 4 # LTO IR -CU_JIT_INPUT_LTO_IR = 5 +CU_JIT_INPUT_NVVM = 5 CU_JIT_NUM_INPUT_TYPES = 6 diff --git a/numba_cuda/numba/cuda/cudadrv/mappings.py b/numba_cuda/numba/cuda/cudadrv/mappings.py index a552af1..541b04c 100644 --- a/numba_cuda/numba/cuda/cudadrv/mappings.py +++ b/numba_cuda/numba/cuda/cudadrv/mappings.py @@ -20,5 +20,5 @@ 'lib': enums.CU_JIT_INPUT_LIBRARY, 'cubin': enums.CU_JIT_INPUT_CUBIN, 'fatbin': enums.CU_JIT_INPUT_FATBINARY, - 'ltoir': enums.CU_JIT_INPUT_LTO_IR, + 'ltoir': enums.CU_JIT_INPUT_NVVM, } From a8cb6c223e07cea44333f7a99d022c488e0fca71 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 4 Sep 2024 05:03:37 -0700 Subject: [PATCH 23/52] only use cuda if CUDA_USE_NVIDIA_BINDING --- numba_cuda/numba/cuda/cudadrv/mappings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cudadrv/mappings.py b/numba_cuda/numba/cuda/cudadrv/mappings.py index 541b04c..95d369e 100644 --- a/numba_cuda/numba/cuda/cudadrv/mappings.py +++ b/numba_cuda/numba/cuda/cudadrv/mappings.py @@ -1,7 +1,7 @@ from numba import config -from cuda import cuda from . import enums if config.CUDA_USE_NVIDIA_BINDING: + from cuda import cuda jitty = cuda.CUjitInputType FILE_EXTENSION_MAP = { 'o': jitty.CU_JIT_INPUT_OBJECT, From b8b671f3a6016939f67e4f5a991666d2a8ec67ed Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 4 Sep 2024 06:21:04 -0700 Subject: [PATCH 24/52] fixes --- numba_cuda/numba/cuda/cudadrv/driver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 108b92b..225e0df 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2691,7 +2691,7 @@ def add_file_guess_ext(self, path_or_code): object represents a file already in memory. 
""" if isinstance(path_or_code, str): - ext = pathlib.Path(path_or_code).suffix.lstrip('.') + ext = pathlib.Path(path_or_code).suffix if ext == '': raise RuntimeError( "Don't know how to link file with no extension" @@ -2699,11 +2699,11 @@ def add_file_guess_ext(self, path_or_code): elif ext == '.cu': self.add_cu_file(path_or_code) else: - kind = FILE_EXTENSION_MAP.get(ext, None) + kind = FILE_EXTENSION_MAP.get(ext.lstrip('.'), None) if kind is None: raise RuntimeError( "Don't know how to link file with extension " - f".{ext}" + f"{ext}" ) self.add_file(path_or_code, kind) return From 7c384a3b202f8e2e93514edb48d5c372b70a5f74 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 26 Sep 2024 09:27:36 -0700 Subject: [PATCH 25/52] tests --- .gitignore | 2 + numba_cuda/numba/cuda/codegen.py | 18 +- numba_cuda/numba/cuda/dispatcher.py | 4 +- numba_cuda/numba/cuda/target.py | 6 +- .../numba/cuda/tests/cudadrv/test_linker.py | 5 - .../cuda/tests/cudadrv/test_nvjitlink.py | 181 ++++++++++++++++++ .../tests/test_binary_generation/Makefile | 54 ++++++ .../generate_raw_ltoir.py | 163 ++++++++++++++++ .../test_device_functions.cu | 19 ++ .../undefined_extern.cu | 3 + 10 files changed, 442 insertions(+), 13 deletions(-) create mode 100644 numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py create mode 100644 numba_cuda/numba/cuda/tests/test_binary_generation/Makefile create mode 100644 numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py create mode 100644 numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu create mode 100644 numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu diff --git a/.gitignore b/.gitignore index 55b4f8b..99271f3 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ __pycache__ build .*.swp *.so +numba_cuda/numba/cuda/tests/cudadrv/test_device_functions.* +numba_cuda/numba/cuda/tests/cudadrv/undefined_extern.* diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py index 6009dcb..0b25444 100644 --- a/numba_cuda/numba/cuda/codegen.py +++ b/numba_cuda/numba/cuda/codegen.py @@ -59,11 +59,18 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary): get_cufunc), which may be of different compute capabilities. """ - def __init__(self, codegen, name, entry_name=None, max_registers=None, - nvvm_options=None): + def __init__( + self, + codegen, + name, + entry_name=None, + max_registers=None, + lto=False, + nvvm_options=None + ): """ codegen: - Codegen object. + Codegen object. name: Name of the function in the source. 
entry_name: @@ -103,6 +110,7 @@ def __init__(self, codegen, name, entry_name=None, max_registers=None, self._cufunc_cache = {} self._max_registers = max_registers + self._lto = lto if nvvm_options is None: nvvm_options = {} self._nvvm_options = nvvm_options @@ -178,7 +186,9 @@ def get_cubin(self, cc=None): if cubin: return cubin - linker = driver.Linker.new(max_registers=self._max_registers, cc=cc) + linker = driver.Linker.new( + max_registers=self._max_registers, cc=cc, lto=self._lto + ) if linker.lto: ltoir = self.get_ltoir(cc=cc) diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 16680a2..332fe15 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -46,7 +46,7 @@ class _Kernel(serialize.ReduceMixin): @global_compiler_lock def __init__(self, py_func, argtypes, link=None, debug=False, lineinfo=False, inline=False, fastmath=False, extensions=None, - max_registers=None, opt=True, device=False): + max_registers=None, lto=False, opt=True, device=False): if device: raise RuntimeError('Cannot compile a device function as a kernel') @@ -94,7 +94,7 @@ def __init__(self, py_func, argtypes, link=None, debug=False, lib, kernel = tgt_ctx.prepare_cuda_kernel(cres.library, cres.fndesc, debug, lineinfo, nvvm_options, filename, linenum, - max_registers) + max_registers, lto) if not link: link = [] diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index 6402ff3..98d4311 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -148,7 +148,7 @@ def mangler(self, name, argtypes, *, abi_tags=(), uid=None): def prepare_cuda_kernel(self, codelib, fndesc, debug, lineinfo, nvvm_options, filename, linenum, - max_registers=None): + max_registers=None, lto=False): """ Adapt a code library ``codelib`` with the numba compiled CUDA kernel with name ``fname`` and arguments ``argtypes`` for NVVM. 
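The lto flag threaded through _Kernel, prepare_cuda_kernel and CUDACodeLibrary in this patch ends up at driver.Linker.new(..., lto=self._lto) in get_cubin, where the pynvjitlink-backed linker turns it into an extra "-lto" link option. A hedged sketch of that observable effect, assuming pynvjitlink is installed and enabled; it simply mirrors the assertions in the test_nvjitlink_ptx_compile_options test added later in this patch:

from numba.cuda.cudadrv.driver import PyNvJitLinker

# With lto=True the nvJitLink options gain "-lto" alongside the
# architecture flag derived from the compute capability.
linker = PyNvJitLinker(cc=(7, 5), lto=True)
assert "-arch=sm_75" in linker.options
assert "-lto" in linker.options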
@@ -175,7 +175,9 @@ def prepare_cuda_kernel(self, codelib, fndesc, debug, lineinfo, library = self.codegen().create_library(f'{codelib.name}_kernel_', entry_name=kernel_name, nvvm_options=nvvm_options, - max_registers=max_registers) + max_registers=max_registers, + lto=lto + ) library.add_linking_library(codelib) wrapper = self.generate_kernel_wrapper(library, fndesc, kernel_name, debug, lineinfo, filename, diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py index 22e2ee8..31d142d 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py @@ -1,6 +1,5 @@ import numpy as np import warnings -from numba.cuda.testing import unittest from numba.cuda.testing import (skip_on_cudasim, skip_if_cuda_includes_missing) from numba.cuda.testing import CUDATestCase, test_data_dir from numba.cuda.cudadrv.driver import (CudaAPIError, Linker, @@ -311,7 +310,3 @@ def test_get_local_mem_per_specialized(self): local_mem_size = compiled_specialized.get_local_mem_per_thread() calc_size = np.dtype(np.float64).itemsize * LMEM_SIZE self.assertGreaterEqual(local_mem_size, calc_size) - - -if __name__ == '__main__': - unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py new file mode 100644 index 0000000..e528e7f --- /dev/null +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -0,0 +1,181 @@ +from numba.cuda.testing import unittest +from numba.cuda.testing import skip_on_cudasim +from numba.cuda.testing import CUDATestCase +from numba.cuda.cudadrv.driver import PyNvJitLinker + +import itertools +from pynvjitlink.api import NvJitLinkError +from numba.cuda import get_current_device +from numba import cuda +from numba import config + +HAVE_PYNVJITLINK = False +try: + import pynvjitlink # noqa: F401 + + HAVE_PYNVJITLINK = True +except ImportError: + pass + + +@unittest.skipIf(not HAVE_PYNVJITLINK, "pynvjitlink not available") +@skip_on_cudasim("Linking unsupported in the simulator") +class TestLinker(CUDATestCase): + _NUMBA_NVIDIA_BINDING_0_ENV = {"NUMBA_CUDA_USE_NVIDIA_BINDING": "0"} + + def test_nvjitlink_create(self): + patched_linker = PyNvJitLinker(cc=(7, 5)) + assert "-arch=sm_75" in patched_linker.options + + def test_nvjitlink_create_no_cc_error(self): + # nvJitLink expects at least the architecture to be specified. 
+ with self.assertRaisesRegex( + RuntimeError, "PyNvJitLinker requires CC to be specified" + ): + PyNvJitLinker() + + def test_nvjitlink_invalid_arch_error(self): + # CC 0.0 is not a valid compute capability + with self.assertRaisesRegex( + NvJitLinkError, "NVJITLINK_ERROR_UNRECOGNIZED_OPTION error" + ): + PyNvJitLinker(cc=(0, 0)) + + def test_nvjitlink_invalid_cc_type_error(self): + with self.assertRaisesRegex( + TypeError, "`cc` must be a list or tuple of length 2" + ): + PyNvJitLinker(cc=0) + + def test_nvjitlink_ptx_compile_options(self): + + max_registers = (None, 32) + lineinfo = (False, True) + lto = (False, True) + additional_flags = (None, ("-g",), ("-g", "-time")) + for ( + max_registers_i, + line_info_i, + lto_i, + additional_flags_i, + ) in itertools.product(max_registers, lineinfo, lto, additional_flags): + with self.subTest( + max_registers=max_registers_i, + lineinfo=line_info_i, + lto=lto_i, + additional_flags=additional_flags_i, + ): + patched_linker = PyNvJitLinker( + cc=(7, 5), + max_registers=max_registers_i, + lineinfo=line_info_i, + lto=lto_i, + additional_flags=additional_flags_i, + ) + assert "-arch=sm_75" in patched_linker.options + + if max_registers_i: + assert ( + f"-maxrregcount={max_registers_i}" + in patched_linker.options + ) + else: + assert "-maxrregcount" not in patched_linker.options + + if line_info_i: + assert "-lineinfo" in patched_linker.options + else: + assert "-lineinfo" not in patched_linker.options + + if lto_i: + assert "-lto" in patched_linker.options + else: + assert "-lto" not in patched_linker.options + + if additional_flags_i: + for flag in additional_flags_i: + assert flag in patched_linker.options + + def test_nvjitlink_add_file_guess_ext_linkable_code(self): + files = ( + "test_device_functions.a", + "test_device_functions.cubin", + "test_device_functions.cu", + "test_device_functions.fatbin", + "test_device_functions.o", + "test_device_functions.ptx", + ) + for file in files: + with self.subTest(file=file): + patched_linker = PyNvJitLinker( + cc=get_current_device().compute_capability + ) + patched_linker.add_file_guess_ext(file) + + def test_nvjitlink_test_add_file_guess_ext_invalid_input(self): + with open("test_device_functions.cubin", "rb") as f: + content = f.read() + + patched_linker = PyNvJitLinker( + cc=get_current_device().compute_capability + ) + with self.assertRaisesRegex( + TypeError, "Expected path to file or a LinkableCode" + ): + # Feeding raw data as bytes to add_file_guess_ext should raise, + # because there's no way to know what kind of file to treat it as + patched_linker.add_file_guess_ext(content) + + @unittest.skipIf(not HAVE_PYNVJITLINK, "pynvjitlink not available") + def test_nvjitlink_jit_with_linkable_code(self): + files = ( + "test_device_functions.a", + "test_device_functions.cubin", + "test_device_functions.cu", + "test_device_functions.fatbin", + "test_device_functions.o", + "test_device_functions.ptx", + ) + for file in files: + with self.subTest(file=file): + # TODO: unsafe teardown if test errors + config.ENABLE_PYNVJITLINK = True + sig = "uint32(uint32, uint32)" + add_from_numba = cuda.declare_device("add_from_numba", sig) + + @cuda.jit(link=[file]) + def kernel(result): + result[0] = add_from_numba(1, 2) + + result = cuda.device_array(1) + kernel[1, 1](result) + assert result[0] == 3 + + def test_nvjitlink_jit_with_linkable_code_lto(self): + file = "test_device_functions.ltoir" + + sig = "uint32(uint32, uint32)" + add_from_numba = cuda.declare_device("add_from_numba", sig) + + @cuda.jit(link=[file], 
lto=True) + def kernel(result): + result[0] = add_from_numba(1, 2) + + result = cuda.device_array(1) + kernel[1, 1](result) + assert result[0] == 3 + + def test_nvjitlink_jit_with_invalid_linkable_code(self): + with open("test_device_functions.cubin", "rb") as f: + content = f.read() + with self.assertRaisesRegex( + TypeError, "Expected path to file or a LinkableCode" + ): + + @cuda.jit("void()", link=[content]) + def kernel(): + pass + + +if __name__ == "__main__": + unittest.main() diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile new file mode 100644 index 0000000..adde5e3 --- /dev/null +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile @@ -0,0 +1,54 @@ +# Generates the input files used by the pynvjitlink binding test suite + +# Test binaries are built taking into account the CC of the GPU in the test machine +GPU_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv | grep -v compute_cap | head -n 1 | sed 's/\.//') +GPU_CC ?= 75 + +# Use CC 7.0 as an alternative in fatbin testing, unless CC is 7.x +ifeq ($(shell echo "$(GPU_CC)" | cut -c1),7) + ALT_CC := 80 +else + ALT_CC := 70 +endif + +# Gencode flags suitable for most tests +GENCODE := -gencode arch=compute_$(GPU_CC),code=sm_$(GPU_CC) + +# Fatbin tests need to generate code for an additional compute capability +FATBIN_GENCODE := $(GENCODE) -gencode arch=compute_$(ALT_CC),code=sm_$(ALT_CC) + +# LTO-IR tests need to generate for the LTO "architecture" instead +LTOIR_GENCODE := -gencode arch=lto_$(GPU_CC),code=lto_$(GPU_CC) + +# Compile with optimization; use relocatable device code to preserve device +# functions in the final output +NVCC_FLAGS := -O3 -rdc true + +# Flags specific to output type +CUBIN_FLAGS := $(GENCODE) --cubin +PTX_FLAGS := $(GENCODE) -ptx +OBJECT_FLAGS := $(GENCODE) -dc +LIBRARY_FLAGS := $(GENCODE) -lib +FATBIN_FLAGS := $(FATBIN_GENCODE) --fatbin +LTOIR_FLAGS := $(LTOIR_GENCODE) -dc + +OUTPUT_DIR := ../cudadrv + +all: + @echo "GPU CC: $(GPU_CC)" + @echo "Alternative CC: $(ALT_CC)" + # Compile all test objects + nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/undefined_extern.cubin undefined_extern.cu + nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.cubin test_device_functions.cu + nvcc $(NVCC_FLAGS) $(FATBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.fatbin test_device_functions.cu + nvcc $(NVCC_FLAGS) $(PTX_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.ptx test_device_functions.cu + nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.o test_device_functions.cu + nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.a test_device_functions.cu + + # Generate LTO-IR wrapped in a fatbin + nvcc $(NVCC_FLAGS) $(LTOIR_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.ltoir.o test_device_functions.cu + # Generate LTO-IR in a "raw" LTO-IR container + python generate_raw_ltoir.py --arch sm_$(GPU_CC) -o $(OUTPUT_DIR)/test_device_functions.ltoir test_device_functions.cu + # We also want to test linking a .cu file; this needs no compilation, + # so copy it instead + cp test_device_functions.cu $(OUTPUT_DIR) diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py b/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py new file mode 100644 index 0000000..8e1d6ec --- /dev/null +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py @@ -0,0 +1,163 @@ +# Copyright (c) 2024, NVIDIA 
CORPORATION. + +import argparse +import pathlib +import subprocess +import sys + +from cuda import nvrtc + +# Magic number found at the start of an LTO-IR file +LTOIR_MAGIC = 0x7F4E43ED + + +def check(args): + """ + Abort and print an error message in the presence of an error result. + + Otherwise: + - Return None if there were no more arguments, + - Return the singular argument if there was only one further argument, + - Return the tuple of arguments if multiple followed. + """ + + result, *args = args + value = result.value + + if value: + error_string = check(nvrtc.nvrtcGetErrorString(result)).decode() + msg = f"NVRTC error, code {value}: {error_string}" + print(msg, file=sys.stderr) + sys.exit(1) + + if len(args) == 0: + return None + elif len(args) == 1: + return args[0] + else: + return args + + +def determine_include_flags(): + # Inspired by the logic in FindCUDAToolkit.cmake. We need the CUDA include + # paths because NVRTC doesn't add them by default, and we can compile a + # much broader set of test files if the CUDA includes are available. + + # We invoke NVCC in verbose mode ("-v") and give a dummy filename, without + # which it won't produce output. + + cmd = ["nvcc", "-v", "__dummy"] + cp = subprocess.run(cmd, capture_output=True) + + # Since the dummy file doesn't actually exist, NVCC is expected to exit + # with an error code of 1. + rc = cp.returncode + if rc != 1: + print(f"Unexpected return code ({rc}) from `nvcc -v`. Expected 1.") + return None + + output = cp.stderr.decode() + lines = output.splitlines() + + includes_lines = [line for line in lines if line.startswith("#$ INCLUDES=")] + if len(includes_lines) != 1: + print(f"Expected exactly one INCLUDES line. Got {len(includes_lines)}.") + return None + + # Parse out the arguments following "INCLUDES=" - these are a space + # separated list of strings that are potentially quoted. + + quoted_flags = includes_lines[0].split("INCLUDES=")[1].strip().split() + include_flags = [flag.strip('"') for flag in quoted_flags] + print(f"Using CUDA include flags: {include_flags}") + + return include_flags + + +def get_ltoir(source, name, arch): + """Given a CUDA C/C++ source, compile it and return the LTO-IR.""" + + program = check( + nvrtc.nvrtcCreateProgram(source.encode(), name.encode(), 0, [], []) + ) + + cuda_include_flags = determine_include_flags() + if cuda_include_flags is None: + print("Error determining CUDA include flags. Exiting.", file=sys.stderr) + sys.exit(1) + + options = [ + f"--gpu-architecture={arch}", + "-dlto", + "-rdc", + "true", + *cuda_include_flags, + ] + options = [o.encode() for o in options] + + result = nvrtc.nvrtcCompileProgram(program, len(options), options) + + # Report compilation errors back to the user + if result[0] == nvrtc.nvrtcResult.NVRTC_ERROR_COMPILATION: + log_size = check(nvrtc.nvrtcGetProgramLogSize(program)) + log = b" " * log_size + check(nvrtc.nvrtcGetProgramLog(program, log)) + print("NVRTC compilation error:\n", file=sys.stderr) + print(log.decode(), file=sys.stderr) + sys.exit(1) + + # Handle other errors in the standard way + check(result) + + ltoir_size = check(nvrtc.nvrtcGetLTOIRSize(program)) + ltoir = b" " * ltoir_size + check(nvrtc.nvrtcGetLTOIR(program, ltoir)) + + # Check that the output looks like an LTO-IR container + header = int.from_bytes(ltoir[:4], byteorder="little") + if header != LTOIR_MAGIC: + print( + f"Unexpected header value 0x{header:X}.\n" + f"Expected LTO-IR magic number 0x{LTOIR_MAGIC:X}." 
+ "\nExiting.", + file=sys.stderr, + ) + sys.exit(1) + + return ltoir + + +def main(sourcepath, outputpath, arch): + with open(sourcepath) as f: + source = f.read() + + name = pathlib.Path(sourcepath).name + ltoir = get_ltoir(source, name, arch) + + print(f"Writing {outputpath}...") + + with open(outputpath, "wb") as f: + f.write(ltoir) + + +if __name__ == "__main__": + description = "Compiles CUDA C/C++ to LTO-IR using NVRTC." + parser = argparse.ArgumentParser(description=description) + parser.add_argument("sourcepath", help="path to source file") + parser.add_argument( + "-o", "--output", help="path to output file", default=None + ) + parser.add_argument( + "-a", + "--arch", + help="compute arch to target (e.g. sm_87). " "Defaults to sm_50.", + default="sm_50", + ) + + args = parser.parse_args() + outputpath = args.output + + if outputpath is None: + outputpath = pathlib.Path(args.sourcepath).with_suffix(".ltoir") + + main(args.sourcepath, outputpath, args.arch) diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu b/numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu new file mode 100644 index 0000000..f1499dc --- /dev/null +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu @@ -0,0 +1,19 @@ +#include + +extern __device__ bool __heq(__half arg1, __half arg2); + +__device__ __half test_add_fp16(__half arg1, __half arg2) { + return __hadd(arg1, arg2); +} + +__device__ bool test_cmp_fp16(__half arg1, __half arg2) { + return __heq(arg1, arg2); +} + +typedef unsigned int uint32_t; + +extern "C" __device__ int add_from_numba(uint32_t *result, uint32_t a, + uint32_t b) { + *result = a + b; + return 0; +} diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu b/numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu new file mode 100644 index 0000000..9e0a0ca --- /dev/null +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu @@ -0,0 +1,3 @@ +extern __device__ float undef(float a, float b); + +__global__ void f(float *r, float *a, float *b) { r[0] = undef(a[0], b[0]); } From 6327ec2752412667019df0c6da2dd99d7a2f9293 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 30 Sep 2024 12:07:27 -0700 Subject: [PATCH 26/52] fix bug --- numba_cuda/numba/cuda/cudadrv/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 225e0df..d33c716 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2632,7 +2632,7 @@ def new(cls, if linker is PyNvJitLinker: return linker(max_registers, lineinfo, cc, lto, additional_flags) - elif additional_flags is not None or lto is not None: + elif additional_flags is not None or lto is True: raise ValueError("LTO and additional flags require PyNvJitLinker") else: return linker(max_registers, lineinfo, cc) From c97767c7a0ac21341e981ef50165aeaa87dd23a7 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 30 Sep 2024 12:31:24 -0700 Subject: [PATCH 27/52] add a new ci job for testing with pynvjitlink --- .github/workflows/pr.yaml | 25 +++++++++++++++++++++++-- ci/test_conda.sh | 9 ++++++++- ci/test_wheel.sh | 9 ++++++++- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7498eb1..d4c54d5 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -54,9 +54,20 @@ jobs: uses: 
./.github/workflows/conda-python-tests.yaml with: build_type: pull-request - script: "ci/test_conda.sh" + script: "ci/test_conda.sh false" run_codecov: false matrix_filter: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} + test-conda-pynvjitlink: + needs: + - build-conda + - compute-matrix + uses: ./.github/workflows/conda-python-tests.yaml + with: + build_type: pull-request + script: "ci/test_conda.sh true" + run_codecov: false + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build-wheels: needs: - compute-matrix @@ -72,8 +83,18 @@ jobs: uses: ./.github/workflows/wheels-test.yaml with: build_type: pull-request - script: "ci/test_wheel.sh" + script: "ci/test_wheel.sh false" matrix_filter: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} + test-wheels-pynvjitlink: + needs: + - build-wheels + - compute-matrix + uses: ./.github/workflows/wheels-test.yaml + with: + build_type: pull-request + script: "ci/test_wheel.sh true" + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build-docs: needs: - build-conda diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 5861128..a932630 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -3,6 +3,8 @@ set -euo pipefail +USE_PYNVJITLINK = $1 + . /opt/conda/etc/profile.d/conda.sh rapids-logger "Install testing dependencies" @@ -42,8 +44,13 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e +if [ "$USE_PYNVJITLINK" == true ]; then + rapids-logger "Install pynvjitlink" + conda install -c rapidsai pynvjitlink +fi + rapids-logger "Run Tests" -python -m numba.runtests numba.cuda.tests -v +ENABLE_PYNVJITLINK=$USE_PYNVJITLINK python -m numba.runtests numba.cuda.tests -v popd diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 5ffd014..7121517 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -3,6 +3,8 @@ set -euo pipefail +USE_PYNVJITLINK = $1 + rapids-logger "Install testing dependencies" # TODO: Replace with rapids-dependency-file-generator python -m pip install \ @@ -10,6 +12,11 @@ python -m pip install \ cuda-python \ pytest +if [ "$USE_PYNVJITLINK" == true ]; then + rapids-logger "Install pynvjitlink" + python -m pip install pynvjitlink +fi + rapids-logger "Install wheel" package=$(realpath wheel/numba_cuda*.whl) echo "Package path: $package" @@ -26,6 +33,6 @@ rapids-logger "Show Numba system info" python -m numba --sysinfo rapids-logger "Run Tests" -python -m numba.runtests numba.cuda.tests -v +ENABLE_PYNVJITLINK=$USE_PYNVJITLINK python -m numba.runtests numba.cuda.tests -v popd From aa3aaf7af3668e71615ed58119c9d94480398430 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 30 Sep 2024 12:34:54 -0700 Subject: [PATCH 28/52] fixes --- .github/workflows/pr.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index d4c54d5..9b806ce 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -19,8 +19,10 @@ jobs: - compute-matrix - build-conda - test-conda + - test-conda-pynvjitlink - build-wheels - test-wheels + - test-wheels-pynvjitlink - build-docs secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 From 
f01c0d6bd630f08065f7f4d7c642d0c76ee5ab60 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 2 Oct 2024 10:36:28 -0700 Subject: [PATCH 29/52] more small fixes --- ci/test_conda.sh | 2 +- ci/test_wheel.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index a932630..eb423ab 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -3,7 +3,7 @@ set -euo pipefail -USE_PYNVJITLINK = $1 +USE_PYNVJITLINK=$1 . /opt/conda/etc/profile.d/conda.sh diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 7121517..734cedb 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -3,7 +3,7 @@ set -euo pipefail -USE_PYNVJITLINK = $1 +USE_PYNVJITLINK=$1 rapids-logger "Install testing dependencies" # TODO: Replace with rapids-dependency-file-generator From 519f0c15255286985033c7db78c32d7521a82595 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 06:27:56 -0700 Subject: [PATCH 30/52] update matrix filter --- .github/workflows/pr.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 5b7f790..820d51c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -69,7 +69,7 @@ jobs: script: "ci/test_conda.sh true" run_codecov: false # This selects "ARCH=amd64 + the latest supported Python + CUDA". - matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER|split(".")|map(tonumber)|.[0]) == 12)) | max_by(.PY_VER|split(".")|map(tonumber)) build-wheels: needs: - compute-matrix @@ -95,7 +95,7 @@ jobs: build_type: pull-request script: "ci/test_wheel.sh true" # This selects "ARCH=amd64 + the latest supported Python + CUDA". - matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER|split(".")|map(tonumber)|.[0]) == 12)) | max_by(.PY_VER|split(".")|map(tonumber)) build-docs: needs: - build-conda From 1201f1f670d98fc366c9943cab303e36c6487d88 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 06:57:39 -0700 Subject: [PATCH 31/52] simple filter --- .github/workflows/pr.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 820d51c..bbdba68 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -69,7 +69,8 @@ jobs: script: "ci/test_conda.sh true" run_codecov: false # This selects "ARCH=amd64 + the latest supported Python + CUDA". - matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER|split(".")|map(tonumber)|.[0]) == 12)) | max_by(.PY_VER|split(".")|map(tonumber)) + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5" and .PY_VER == "3.11")) + build-wheels: needs: - compute-matrix @@ -95,7 +96,8 @@ jobs: build_type: pull-request script: "ci/test_wheel.sh true" # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
- matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER|split(".")|map(tonumber)|.[0]) == 12)) | max_by(.PY_VER|split(".")|map(tonumber)) + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5" and .PY_VER == "3.11")) + build-docs: needs: - build-conda From 883d81741eb992de559dafa781906e7ecf583bf2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 06:57:57 -0700 Subject: [PATCH 32/52] clean --- .github/workflows/pr.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index bbdba68..1672742 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -70,7 +70,6 @@ jobs: run_codecov: false # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5" and .PY_VER == "3.11")) - build-wheels: needs: - compute-matrix @@ -97,7 +96,6 @@ jobs: script: "ci/test_wheel.sh true" # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5" and .PY_VER == "3.11")) - build-docs: needs: - build-conda From f41b931302aa43a99b59fd035b889e032fa22aa9 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 07:08:41 -0700 Subject: [PATCH 33/52] .1 --- .github/workflows/pr.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1672742..5afa741 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -69,7 +69,7 @@ jobs: script: "ci/test_conda.sh true" run_codecov: false # This selects "ARCH=amd64 + the latest supported Python + CUDA". - matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5" and .PY_VER == "3.11")) + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.11")) build-wheels: needs: - compute-matrix @@ -95,7 +95,7 @@ jobs: build_type: pull-request script: "ci/test_wheel.sh true" # This selects "ARCH=amd64 + the latest supported Python + CUDA". - matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5" and .PY_VER == "3.11")) + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.11")) build-docs: needs: - build-conda From e5aa41e9feb230a0d472edf1508eb11743cc3354 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 07:23:35 -0700 Subject: [PATCH 34/52] revert --- .github/workflows/pr.yaml | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 5afa741..a8a6d0a 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -19,10 +19,8 @@ jobs: - compute-matrix - build-conda - test-conda - - test-conda-pynvjitlink - build-wheels - test-wheels - - test-wheels-pynvjitlink - build-docs secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 @@ -59,17 +57,6 @@ jobs: script: "ci/test_conda.sh false" run_codecov: false matrix_filter: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} - test-conda-pynvjitlink: - needs: - - build-conda - - compute-matrix - uses: ./.github/workflows/conda-python-tests.yaml - with: - build_type: pull-request - script: "ci/test_conda.sh true" - run_codecov: false - # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
- matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.11")) build-wheels: needs: - compute-matrix @@ -85,17 +72,6 @@ jobs: with: build_type: pull-request script: "ci/test_wheel.sh false" - matrix_filter: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} - test-wheels-pynvjitlink: - needs: - - build-wheels - - compute-matrix - uses: ./.github/workflows/wheels-test.yaml - with: - build_type: pull-request - script: "ci/test_wheel.sh true" - # This selects "ARCH=amd64 + the latest supported Python + CUDA". - matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.11")) build-docs: needs: - build-conda From a0ea97d9618cda957cda0afad7f65ffbfcb5eb0f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 07:51:42 -0700 Subject: [PATCH 35/52] refactor --- .github/workflows/pr.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a8a6d0a..39066be 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -19,8 +19,10 @@ jobs: - compute-matrix - build-conda - test-conda + - test-conda-pynvjitlink - build-wheels - test-wheels + - test-wheels-pynvjitlink - build-docs secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 @@ -57,6 +59,16 @@ jobs: script: "ci/test_conda.sh false" run_codecov: false matrix_filter: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} + test-conda-pynvjitlink: + needs: + - build-conda + - compute-matrix + uses: ./.github/workflows/conda-python-tests.yaml + with: + build_type: pull-request + script: "ci/test_conda.sh true" + run_codecov: false + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12")) build-wheels: needs: - compute-matrix @@ -72,6 +84,14 @@ jobs: with: build_type: pull-request script: "ci/test_wheel.sh false" + test-wheels-pynvjitlink: + needs: + - build-wheels + uses: ./.github/workflows/wheels-test.yaml + with: + build_type: pull-request + script: "ci/test_wheel.sh true" + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12")) build-docs: needs: - build-conda From b979054b62a1e3d8165212330d09f1ca8b3f2b24 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 08:02:53 -0700 Subject: [PATCH 36/52] try and fix conda workflow --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 39066be..03650aa 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -68,7 +68,7 @@ jobs: build_type: pull-request script: "ci/test_conda.sh true" run_codecov: false - matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12")) + matrix_filter: echo ${{ needs.compute-matrix.outputs.TEST_MATRIX }} | jq 'map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12"))' build-wheels: needs: - compute-matrix From beb330125d18126ee1c9176052208e55df48fef1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 08:51:47 -0700 Subject: [PATCH 37/52] readenv boolify string values --- numba_cuda/numba/cuda/cudadrv/driver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index d33c716..44cdbee 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -64,6 +64,8 @@ def 
_readenv(name, ctor, default): if value is None: return default() if callable(default) else default try: + if ctor is bool: + return bool(value.lower() in {'1', "True"}) return ctor(value) except Exception: warnings.warn( @@ -81,7 +83,6 @@ def _readenv(name, ctor, default): "to be available" ) - ENABLE_PYNVJITLINK = ( _readenv("ENABLE_PYNVJITLINK", bool, False) or getattr(config, "ENABLE_PYNVJITLINK", None) From 3f5a86535083d64bf7cdfba745966fd132e8488e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 13:18:03 -0700 Subject: [PATCH 38/52] fix imports --- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py | 5 +++++ numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py index 31d142d..22e2ee8 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py @@ -1,5 +1,6 @@ import numpy as np import warnings +from numba.cuda.testing import unittest from numba.cuda.testing import (skip_on_cudasim, skip_if_cuda_includes_missing) from numba.cuda.testing import CUDATestCase, test_data_dir from numba.cuda.cudadrv.driver import (CudaAPIError, Linker, @@ -310,3 +311,7 @@ def test_get_local_mem_per_specialized(self): local_mem_size = compiled_specialized.get_local_mem_per_thread() calc_size = np.dtype(np.float64).itemsize * LMEM_SIZE self.assertGreaterEqual(local_mem_size, calc_size) + + +if __name__ == '__main__': + unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index e528e7f..4145e31 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -4,7 +4,6 @@ from numba.cuda.cudadrv.driver import PyNvJitLinker import itertools -from pynvjitlink.api import NvJitLinkError from numba.cuda import get_current_device from numba import cuda from numba import config @@ -12,6 +11,7 @@ HAVE_PYNVJITLINK = False try: import pynvjitlink # noqa: F401 + from pynvjitlink.api import NvJitLinkError HAVE_PYNVJITLINK = True except ImportError: From 4770c409ef81c768148c14ac55c48bc7d5e6444c Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 17:08:55 -0700 Subject: [PATCH 39/52] update --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 03650aa..39066be 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -68,7 +68,7 @@ jobs: build_type: pull-request script: "ci/test_conda.sh true" run_codecov: false - matrix_filter: echo ${{ needs.compute-matrix.outputs.TEST_MATRIX }} | jq 'map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12"))' + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12")) build-wheels: needs: - compute-matrix From 2820ee6e313f76ead0222feb6e57bd1f0a7971ae Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 17:42:21 -0700 Subject: [PATCH 40/52] use local workflow matrix --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 39066be..0d6f948 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -68,7 +68,7 @@ jobs: build_type: pull-request script: "ci/test_conda.sh true" 
run_codecov: false - matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12")) + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.11")) build-wheels: needs: - compute-matrix From dc20cced941311967a803f62d450969837143917 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 4 Oct 2024 10:58:58 -0700 Subject: [PATCH 41/52] cu12 suffix --- ci/test_wheel.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 734cedb..afdc8ec 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -14,7 +14,7 @@ python -m pip install \ if [ "$USE_PYNVJITLINK" == true ]; then rapids-logger "Install pynvjitlink" - python -m pip install pynvjitlink + python -m pip install pynvjitlink-cu12 fi rapids-logger "Install wheel" From ff18c5c77662a09f7713b3eeb263863098e3e226 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Mon, 7 Oct 2024 09:56:44 -0500 Subject: [PATCH 42/52] Update ci/test_conda.sh Co-authored-by: Graham Markall <535640+gmarkall@users.noreply.github.com> --- ci/test_conda.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 0a23461..ccdff4e 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -51,7 +51,9 @@ set +e if [ "$USE_PYNVJITLINK" == true ]; then rapids-logger "Install pynvjitlink" + set +u conda install -c rapidsai pynvjitlink + set -u fi rapids-logger "Run Tests" From b2f4245e99d3fc84114f06539ca984a6dde93f24 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Oct 2024 08:37:55 -0700 Subject: [PATCH 43/52] small updates --- numba_cuda/numba/cuda/cudadrv/driver.py | 4 +--- .../numba/cuda/tests/cudadrv/test_nvjitlink.py | 16 +++------------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 44cdbee..5a3a55a 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -10,7 +10,6 @@ system to freeze in some cases. 
""" - import sys import os import ctypes @@ -85,12 +84,11 @@ def _readenv(name, ctor, default): ENABLE_PYNVJITLINK = ( _readenv("ENABLE_PYNVJITLINK", bool, False) - or getattr(config, "ENABLE_PYNVJITLINK", None) + or getattr(config, "ENABLE_PYNVJITLINK", False) ) if not hasattr(config, "ENABLE_PYNVJITLINK"): config.ENABLE_PYNVJITLINK = ENABLE_PYNVJITLINK - if ENABLE_PYNVJITLINK: try: from pynvjitlink.api import NvJitLinker, NvJitLinkError diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index 4145e31..7aa1b03 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -8,17 +8,8 @@ from numba import cuda from numba import config -HAVE_PYNVJITLINK = False -try: - import pynvjitlink # noqa: F401 - from pynvjitlink.api import NvJitLinkError - HAVE_PYNVJITLINK = True -except ImportError: - pass - - -@unittest.skipIf(not HAVE_PYNVJITLINK, "pynvjitlink not available") +@unittest.skipIf(config.ENABLE_PYNVJITLINK, "pynvjitlink not enabled") @skip_on_cudasim("Linking unsupported in the simulator") class TestLinker(CUDATestCase): _NUMBA_NVIDIA_BINDING_0_ENV = {"NUMBA_CUDA_USE_NVIDIA_BINDING": "0"} @@ -35,6 +26,8 @@ def test_nvjitlink_create_no_cc_error(self): PyNvJitLinker() def test_nvjitlink_invalid_arch_error(self): + from pynvjitlink.api import NvJitLinkError + # CC 0.0 is not a valid compute capability with self.assertRaisesRegex( NvJitLinkError, "NVJITLINK_ERROR_UNRECOGNIZED_OPTION error" @@ -126,7 +119,6 @@ def test_nvjitlink_test_add_file_guess_ext_invalid_input(self): # because there's no way to know what kind of file to treat it as patched_linker.add_file_guess_ext(content) - @unittest.skipIf(not HAVE_PYNVJITLINK, "pynvjitlink not available") def test_nvjitlink_jit_with_linkable_code(self): files = ( "test_device_functions.a", @@ -138,8 +130,6 @@ def test_nvjitlink_jit_with_linkable_code(self): ) for file in files: with self.subTest(file=file): - # TODO: unsafe teardown if test errors - config.ENABLE_PYNVJITLINK = True sig = "uint32(uint32, uint32)" add_from_numba = cuda.declare_device("add_from_numba", sig) From f40d3ed5a82b2a0aa51d79d5f17414c34a68379a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Oct 2024 13:22:28 -0700 Subject: [PATCH 44/52] fix logic :) --- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index 7aa1b03..79b3e71 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -9,7 +9,7 @@ from numba import config -@unittest.skipIf(config.ENABLE_PYNVJITLINK, "pynvjitlink not enabled") +@unittest.skipIf(not config.ENABLE_PYNVJITLINK, "pynvjitlink not enabled") @skip_on_cudasim("Linking unsupported in the simulator") class TestLinker(CUDATestCase): _NUMBA_NVIDIA_BINDING_0_ENV = {"NUMBA_CUDA_USE_NVIDIA_BINDING": "0"} From c24ec67a62bd8ef4f68b6563db3c5fe81d09c69b Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Oct 2024 14:28:33 -0700 Subject: [PATCH 45/52] try hardcoding ENABLE_PYNVJITLINK --- ci/test_conda.sh | 2 +- ci/test_wheel.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index ccdff4e..83ca848 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -57,7 +57,7 @@ if [ "$USE_PYNVJITLINK" == true ]; 
then fi rapids-logger "Run Tests" -ENABLE_PYNVJITLINK=$USE_PYNVJITLINK python -m numba.runtests numba.cuda.tests -v +ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v popd diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index afdc8ec..a9f346c 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -33,6 +33,6 @@ rapids-logger "Show Numba system info" python -m numba --sysinfo rapids-logger "Run Tests" -ENABLE_PYNVJITLINK=$USE_PYNVJITLINK python -m numba.runtests numba.cuda.tests -v +ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v popd From dccd6dbe58ae8e78abccab03e0d17a065a8de810 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Oct 2024 18:24:14 -0700 Subject: [PATCH 46/52] fix passing of ENABLE_PYNVJITLINK --- ci/test_conda.sh | 6 +++++- ci/test_wheel.sh | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 83ca848..4002e67 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -3,7 +3,11 @@ set -euo pipefail -USE_PYNVJITLINK=$1 +if [ "$1" == "true" ]; then + USE_PYNVJITLINK=true +else + USE_PYNVJITLINK=false +fi . /opt/conda/etc/profile.d/conda.sh diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index a9f346c..b6ee2d1 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -3,7 +3,11 @@ set -euo pipefail -USE_PYNVJITLINK=$1 +if [ "$1" == "true" ]; then + USE_PYNVJITLINK=true +else + USE_PYNVJITLINK=false +fi rapids-logger "Install testing dependencies" # TODO: Replace with rapids-dependency-file-generator From 9aaa21f80e0bf2f29709e2c7f9ea247c1dd89080 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Oct 2024 08:23:41 -0700 Subject: [PATCH 47/52] ship makefile, find and build tests --- ci/build_tests.sh | 14 ++++++++++++++ ci/test_conda.sh | 1 + ci/test_wheel.sh | 1 + pyproject.toml | 2 +- 4 files changed, 17 insertions(+), 1 deletion(-) create mode 100755 ci/build_tests.sh diff --git a/ci/build_tests.sh b/ci/build_tests.sh new file mode 100755 index 0000000..8bf6fe3 --- /dev/null +++ b/ci/build_tests.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION + +PY_SCRIPT=" +import numba_cuda +root = numba_cuda.__file__.rstrip('__init__.py') +test_dir = root + numba/cuda/tests/test_binary_generation/ +print(test_dir) +" + +TEST_DIR=$(python -c "$PY_SCRIPT") +pushd $TEST_DIR +make +popd diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 4002e67..200e91f 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -58,6 +58,7 @@ if [ "$USE_PYNVJITLINK" == true ]; then set +u conda install -c rapidsai pynvjitlink set -u + sh build_tests.sh fi rapids-logger "Run Tests" diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index b6ee2d1..a69b801 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -19,6 +19,7 @@ python -m pip install \ if [ "$USE_PYNVJITLINK" == true ]; then rapids-logger "Install pynvjitlink" python -m pip install pynvjitlink-cu12 + sh build_tests.sh fi rapids-logger "Install wheel" diff --git a/pyproject.toml b/pyproject.toml index 4f2291b..e7aefdb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,4 +36,4 @@ license-files = ["LICENSE"] include = ["numba_cuda*"] [tool.setuptools.package-data] -"*" = ["*.cu", "*.h", "*.hpp", "*.ptx", "VERSION"] +"*" = ["*.cu", "*.h", "*.hpp", "*.ptx", "VERSION", "Makefile"] From d3ca53ccd381b25d60ac072fa91865fc5d3564cc Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Oct 2024 08:33:01 -0700 Subject: [PATCH 48/52] minor fix --- ci/build_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/ci/build_tests.sh b/ci/build_tests.sh index 8bf6fe3..9d0f65d 100755 --- a/ci/build_tests.sh +++ b/ci/build_tests.sh @@ -4,7 +4,7 @@ PY_SCRIPT=" import numba_cuda root = numba_cuda.__file__.rstrip('__init__.py') -test_dir = root + numba/cuda/tests/test_binary_generation/ +test_dir = root + \"numba/cuda/tests/test_binary_generation/\" print(test_dir) " From 4ce95a73038868e75e8215bbd82d81a4390d23e2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Oct 2024 11:14:03 -0700 Subject: [PATCH 49/52] bifurcate pynvjitlink test scripts --- .github/workflows/pr.yaml | 6 +-- ci/build_tests.sh | 14 ------- ci/test_conda.sh | 16 +------- ci/test_conda_pynvjitlink.sh | 77 ++++++++++++++++++++++++++++++++++++ ci/test_wheel.sh | 14 +------ ci/test_wheel_pynvjitlink.sh | 47 ++++++++++++++++++++++ 6 files changed, 129 insertions(+), 45 deletions(-) delete mode 100755 ci/build_tests.sh create mode 100755 ci/test_conda_pynvjitlink.sh create mode 100755 ci/test_wheel_pynvjitlink.sh diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 0d6f948..8ef0c4e 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -56,7 +56,7 @@ jobs: uses: ./.github/workflows/conda-python-tests.yaml with: build_type: pull-request - script: "ci/test_conda.sh false" + script: "ci/test_conda.sh" run_codecov: false matrix_filter: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} test-conda-pynvjitlink: @@ -66,7 +66,7 @@ jobs: uses: ./.github/workflows/conda-python-tests.yaml with: build_type: pull-request - script: "ci/test_conda.sh true" + script: "ci/test_conda_pynvjitlink.sh" run_codecov: false matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.11")) build-wheels: @@ -90,7 +90,7 @@ jobs: uses: ./.github/workflows/wheels-test.yaml with: build_type: pull-request - script: "ci/test_wheel.sh true" + script: "ci/test_wheel_pynvjitlink.sh" matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12")) build-docs: needs: diff --git a/ci/build_tests.sh b/ci/build_tests.sh deleted file mode 100755 index 9d0f65d..0000000 --- a/ci/build_tests.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION - -PY_SCRIPT=" -import numba_cuda -root = numba_cuda.__file__.rstrip('__init__.py') -test_dir = root + \"numba/cuda/tests/test_binary_generation/\" -print(test_dir) -" - -TEST_DIR=$(python -c "$PY_SCRIPT") -pushd $TEST_DIR -make -popd diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 200e91f..d6e8bc0 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -3,12 +3,6 @@ set -euo pipefail -if [ "$1" == "true" ]; then - USE_PYNVJITLINK=true -else - USE_PYNVJITLINK=false -fi - . /opt/conda/etc/profile.d/conda.sh if [ "${CUDA_VER%.*.*}" = "11" ]; then @@ -53,16 +47,8 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e -if [ "$USE_PYNVJITLINK" == true ]; then - rapids-logger "Install pynvjitlink" - set +u - conda install -c rapidsai pynvjitlink - set -u - sh build_tests.sh -fi - rapids-logger "Run Tests" -ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v +python -m numba.runtests numba.cuda.tests -v popd diff --git a/ci/test_conda_pynvjitlink.sh b/ci/test_conda_pynvjitlink.sh new file mode 100755 index 0000000..a0ea90f --- /dev/null +++ b/ci/test_conda_pynvjitlink.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION + +set -euo pipefail + +. 
/opt/conda/etc/profile.d/conda.sh + +if [ "${CUDA_VER%.*.*}" = "11" ]; then + CTK_PACKAGES="cudatoolkit" +else + CTK_PACKAGES="cuda-nvcc-impl cuda-nvrtc" +fi + +rapids-logger "Install testing dependencies" +# TODO: Replace with rapids-dependency-file-generator +rapids-mamba-retry create -n test \ + c-compiler \ + cxx-compiler \ + ${CTK_PACKAGES} \ + cuda-python \ + cuda-version=${CUDA_VER%.*} \ + make \ + psutil \ + pytest \ + python=${RAPIDS_PY_VERSION} + +# Temporarily allow unbound variables for conda activation. +set +u +conda activate test +set -u + +rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda + +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ +mkdir -p "${RAPIDS_TESTS_DIR}" +pushd "${RAPIDS_TESTS_DIR}" + +rapids-print-env + +rapids-logger "Check GPU usage" +nvidia-smi + +rapids-logger "Show Numba system info" +python -m numba --sysinfo + +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + + +rapids-logger "Install pynvjitlink" +set +u +conda install -c rapidsai pynvjitlink +set -u + +rapids_logger "Build tests" + +PY_SCRIPT=" +import numba_cuda +root = numba_cuda.__file__.rstrip('__init__.py') +test_dir = root + \"numba/cuda/tests/test_binary_generation/\" +print(test_dir) +" + +TEST_DIR=$(python -c "$PY_SCRIPT") +pushd $TEST_DIR +make +popd + + +rapids-logger "Run Tests" +ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v + +popd + +rapids-logger "Test script exiting with value: $EXITCODE" +exit ${EXITCODE} diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index a69b801..5ffd014 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -3,12 +3,6 @@ set -euo pipefail -if [ "$1" == "true" ]; then - USE_PYNVJITLINK=true -else - USE_PYNVJITLINK=false -fi - rapids-logger "Install testing dependencies" # TODO: Replace with rapids-dependency-file-generator python -m pip install \ @@ -16,12 +10,6 @@ python -m pip install \ cuda-python \ pytest -if [ "$USE_PYNVJITLINK" == true ]; then - rapids-logger "Install pynvjitlink" - python -m pip install pynvjitlink-cu12 - sh build_tests.sh -fi - rapids-logger "Install wheel" package=$(realpath wheel/numba_cuda*.whl) echo "Package path: $package" @@ -38,6 +26,6 @@ rapids-logger "Show Numba system info" python -m numba --sysinfo rapids-logger "Run Tests" -ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v +python -m numba.runtests numba.cuda.tests -v popd diff --git a/ci/test_wheel_pynvjitlink.sh b/ci/test_wheel_pynvjitlink.sh new file mode 100755 index 0000000..4d3bb57 --- /dev/null +++ b/ci/test_wheel_pynvjitlink.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright (c) 2023-2024, NVIDIA CORPORATION + +set -euo pipefail + +rapids-logger "Install testing dependencies" +# TODO: Replace with rapids-dependency-file-generator +python -m pip install \ + psutil \ + cuda-python \ + pytest + +rapids-logger "Install pynvjitlink" +python -m pip install pynvjitlink-cu12 + +rapids-logger "Build tests" +PY_SCRIPT=" +import numba_cuda +root = numba_cuda.__file__.rstrip('__init__.py') +test_dir = root + \"numba/cuda/tests/test_binary_generation/\" +print(test_dir) +" + +TEST_DIR=$(python -c "$PY_SCRIPT") +pushd $TEST_DIR +make +popd + +rapids-logger "Install wheel" +package=$(realpath wheel/numba_cuda*.whl) +echo "Package path: $package" +python -m pip install $package + +rapids-logger "Check GPU usage" +nvidia-smi + +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ +mkdir -p "${RAPIDS_TESTS_DIR}" +pushd "${RAPIDS_TESTS_DIR}" + +rapids-logger "Show Numba system info" +python -m numba --sysinfo + +rapids-logger 
"Run Tests" +ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v + +popd From d65f80dcbb9942eb1c4d2faee7ec4661c72d3fd2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Oct 2024 11:49:05 -0700 Subject: [PATCH 50/52] pass test bin dir as an env var --- ci/test_conda_pynvjitlink.sh | 8 +-- ci/test_wheel_pynvjitlink.sh | 6 +- .../cuda/tests/cudadrv/test_nvjitlink.py | 61 +++++++++++++------ .../tests/test_binary_generation/Makefile | 2 +- 4 files changed, 52 insertions(+), 25 deletions(-) diff --git a/ci/test_conda_pynvjitlink.sh b/ci/test_conda_pynvjitlink.sh index a0ea90f..ca3bef2 100755 --- a/ci/test_conda_pynvjitlink.sh +++ b/ci/test_conda_pynvjitlink.sh @@ -53,7 +53,7 @@ set +u conda install -c rapidsai pynvjitlink set -u -rapids_logger "Build tests" +rapids-logger "Build tests" PY_SCRIPT=" import numba_cuda @@ -62,14 +62,14 @@ test_dir = root + \"numba/cuda/tests/test_binary_generation/\" print(test_dir) " -TEST_DIR=$(python -c "$PY_SCRIPT") -pushd $TEST_DIR +NUMBA_CUDA_TEST_BIN_DIR=$(python -c "$PY_SCRIPT") +pushd $NUMBA_CUDA_TEST_BIN_DIR make popd rapids-logger "Run Tests" -ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v +ENABLE_PYNVJITLINK=1 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m numba.runtests numba.cuda.tests -v popd diff --git a/ci/test_wheel_pynvjitlink.sh b/ci/test_wheel_pynvjitlink.sh index 4d3bb57..0838bda 100755 --- a/ci/test_wheel_pynvjitlink.sh +++ b/ci/test_wheel_pynvjitlink.sh @@ -21,8 +21,8 @@ test_dir = root + \"numba/cuda/tests/test_binary_generation/\" print(test_dir) " -TEST_DIR=$(python -c "$PY_SCRIPT") -pushd $TEST_DIR +NUMBA_CUDA_TEST_BIN_DIR=$(python -c "$PY_SCRIPT") +pushd $NUMBA_CUDA_TEST_BIN_DIR make popd @@ -42,6 +42,6 @@ rapids-logger "Show Numba system info" python -m numba --sysinfo rapids-logger "Run Tests" -ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v +ENABLE_PYNVJITLINK=1 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m numba.runtests numba.cuda.tests -v popd diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index 79b3e71..36c216a 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -4,12 +4,39 @@ from numba.cuda.cudadrv.driver import PyNvJitLinker import itertools +import os from numba.cuda import get_current_device from numba import cuda from numba import config - -@unittest.skipIf(not config.ENABLE_PYNVJITLINK, "pynvjitlink not enabled") +TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR") +test_device_functions_a = os.path.join( + TEST_BIN_DIR, "test_device_functions.a" +) +test_device_functions_cubin = os.path.join( + TEST_BIN_DIR, "test_device_functions.cubin" +) +test_device_functions_cu = os.path.join( + TEST_BIN_DIR, "test_device_functions.cu" +) +test_device_functions_fatbin = os.path.join( + TEST_BIN_DIR, "test_device_functions.fatbin" +) +test_device_functions_o = os.path.join( + TEST_BIN_DIR, "test_device_functions.o" +) +test_device_functions_ptx = os.path.join( + TEST_BIN_DIR, "test_device_functions.ptx" +) +test_device_functions_ltoir = os.path.join( + TEST_BIN_DIR, "test_device_functions.ltoir" +) + + +@unittest.skipIf( + not config.ENABLE_PYNVJITLINK or not TEST_BIN_DIR, + "pynvjitlink not enabled" +) @skip_on_cudasim("Linking unsupported in the simulator") class TestLinker(CUDATestCase): _NUMBA_NVIDIA_BINDING_0_ENV = {"NUMBA_CUDA_USE_NVIDIA_BINDING": "0"} @@ -91,12 +118,12 @@ def 
test_nvjitlink_ptx_compile_options(self): def test_nvjitlink_add_file_guess_ext_linkable_code(self): files = ( - "test_device_functions.a", - "test_device_functions.cubin", - "test_device_functions.cu", - "test_device_functions.fatbin", - "test_device_functions.o", - "test_device_functions.ptx", + test_device_functions_a, + test_device_functions_cubin, + test_device_functions_cu, + test_device_functions_fatbin, + test_device_functions_o, + test_device_functions_ptx, ) for file in files: with self.subTest(file=file): @@ -106,7 +133,7 @@ def test_nvjitlink_add_file_guess_ext_linkable_code(self): patched_linker.add_file_guess_ext(file) def test_nvjitlink_test_add_file_guess_ext_invalid_input(self): - with open("test_device_functions.cubin", "rb") as f: + with open(test_device_functions_cubin, "rb") as f: content = f.read() patched_linker = PyNvJitLinker( @@ -121,12 +148,12 @@ def test_nvjitlink_test_add_file_guess_ext_invalid_input(self): def test_nvjitlink_jit_with_linkable_code(self): files = ( - "test_device_functions.a", - "test_device_functions.cubin", - "test_device_functions.cu", - "test_device_functions.fatbin", - "test_device_functions.o", - "test_device_functions.ptx", + test_device_functions_a, + test_device_functions_cubin, + test_device_functions_cu, + test_device_functions_fatbin, + test_device_functions_o, + test_device_functions_ptx, ) for file in files: with self.subTest(file=file): @@ -142,7 +169,7 @@ def kernel(result): assert result[0] == 3 def test_nvjitlink_jit_with_linkable_code_lto(self): - file = "test_device_functions.ltoir" + file = test_device_functions_ltoir sig = "uint32(uint32, uint32)" add_from_numba = cuda.declare_device("add_from_numba", sig) @@ -156,7 +183,7 @@ def kernel(result): assert result[0] == 3 def test_nvjitlink_jit_with_invalid_linkable_code(self): - with open("test_device_functions.cubin", "rb") as f: + with open(test_device_functions_cubin, "rb") as f: content = f.read() with self.assertRaisesRegex( TypeError, "Expected path to file or a LinkableCode" diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile index adde5e3..31a91b2 100644 --- a/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile @@ -32,7 +32,7 @@ LIBRARY_FLAGS := $(GENCODE) -lib FATBIN_FLAGS := $(FATBIN_GENCODE) --fatbin LTOIR_FLAGS := $(LTOIR_GENCODE) -dc -OUTPUT_DIR := ../cudadrv +OUTPUT_DIR := ./ all: @echo "GPU CC: $(GPU_CC)" From 5097bcf58715d30bbf3de77c858f34ef96f5bd3f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Oct 2024 12:13:29 -0700 Subject: [PATCH 51/52] more minor fixes --- .../cuda/tests/cudadrv/test_nvjitlink.py | 43 ++++++++++--------- .../tests/test_binary_generation/Makefile | 4 +- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index 36c216a..e9ff671 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -10,27 +10,28 @@ from numba import config TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR") -test_device_functions_a = os.path.join( - TEST_BIN_DIR, "test_device_functions.a" -) -test_device_functions_cubin = os.path.join( - TEST_BIN_DIR, "test_device_functions.cubin" -) -test_device_functions_cu = os.path.join( - TEST_BIN_DIR, "test_device_functions.cu" -) -test_device_functions_fatbin = 
os.path.join( - TEST_BIN_DIR, "test_device_functions.fatbin" -) -test_device_functions_o = os.path.join( - TEST_BIN_DIR, "test_device_functions.o" -) -test_device_functions_ptx = os.path.join( - TEST_BIN_DIR, "test_device_functions.ptx" -) -test_device_functions_ltoir = os.path.join( - TEST_BIN_DIR, "test_device_functions.ltoir" -) +if TEST_BIN_DIR: + test_device_functions_a = os.path.join( + TEST_BIN_DIR, "test_device_functions.a" + ) + test_device_functions_cubin = os.path.join( + TEST_BIN_DIR, "test_device_functions.cubin" + ) + test_device_functions_cu = os.path.join( + TEST_BIN_DIR, "test_device_functions.cu" + ) + test_device_functions_fatbin = os.path.join( + TEST_BIN_DIR, "test_device_functions.fatbin" + ) + test_device_functions_o = os.path.join( + TEST_BIN_DIR, "test_device_functions.o" + ) + test_device_functions_ptx = os.path.join( + TEST_BIN_DIR, "test_device_functions.ptx" + ) + test_device_functions_ltoir = os.path.join( + TEST_BIN_DIR, "test_device_functions.ltoir" + ) @unittest.skipIf( diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile index 31a91b2..3985b4e 100644 --- a/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile @@ -49,6 +49,4 @@ all: nvcc $(NVCC_FLAGS) $(LTOIR_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.ltoir.o test_device_functions.cu # Generate LTO-IR in a "raw" LTO-IR container python generate_raw_ltoir.py --arch sm_$(GPU_CC) -o $(OUTPUT_DIR)/test_device_functions.ltoir test_device_functions.cu - # We also want to test linking a .cu file; this needs no compilation, - # so copy it instead - cp test_device_functions.cu $(OUTPUT_DIR) + From e29744cbb834b6a85bbd582a594d396aca5e359d Mon Sep 17 00:00:00 2001 From: Graham Markall <535640+gmarkall@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:05:08 +0100 Subject: [PATCH 52/52] Retry installation of pynvjitlink --- ci/test_conda_pynvjitlink.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/test_conda_pynvjitlink.sh b/ci/test_conda_pynvjitlink.sh index ca3bef2..71e41e4 100755 --- a/ci/test_conda_pynvjitlink.sh +++ b/ci/test_conda_pynvjitlink.sh @@ -50,7 +50,7 @@ set +e rapids-logger "Install pynvjitlink" set +u -conda install -c rapidsai pynvjitlink +rapids-mamba-retry install -c rapidsai pynvjitlink set -u rapids-logger "Build tests"
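Taken together, the binary-generation Makefile and the CI scripts above define the environment the pynvjitlink tests expect: the test inputs must be built first, and both `ENABLE_PYNVJITLINK` and `NUMBA_CUDA_TEST_BIN_DIR` must be set when the suite runs. A hedged sketch of reproducing that flow locally, assuming a source checkout with pynvjitlink installed and nvcc on PATH (the checkout-relative path is illustrative; the CI scripts compute it from the installed package instead):

    # Build the test binaries (cubin/fatbin/ptx/object/library/LTO-IR inputs)
    cd numba_cuda/numba/cuda/tests/test_binary_generation
    make
    cd -

    # Point the tests at the generated binaries and enable the pynvjitlink path
    ENABLE_PYNVJITLINK=1 \
    NUMBA_CUDA_TEST_BIN_DIR="$PWD/numba_cuda/numba/cuda/tests/test_binary_generation" \
    python -m numba.runtests numba.cuda.tests -v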