From 521be20ef9cc94a256b985f8954c673fed2bf620 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 12 Jul 2024 11:47:11 -0700 Subject: [PATCH 01/52] off the ground --- numba_cuda/numba/cuda/codegen.py | 12 +++ numba_cuda/numba/cuda/cudadrv/nrt.py | 126 +++++++++++++++++++++++++++ numba_cuda/numba/cuda/runtime/nrt.cu | 123 ++++++++++++++++++++++++++ numba_cuda/numba/cuda/target.py | 1 + 4 files changed, 262 insertions(+) create mode 100644 numba_cuda/numba/cuda/cudadrv/nrt.py create mode 100644 numba_cuda/numba/cuda/runtime/nrt.cu diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py index 6009dcb..cd52d95 100644 --- a/numba_cuda/numba/cuda/codegen.py +++ b/numba_cuda/numba/cuda/codegen.py @@ -4,6 +4,7 @@ from numba.core.codegen import Codegen, CodeLibrary from .cudadrv import devices, driver, nvvm, runtime from numba.cuda.cudadrv.libs import get_cudalib +import functools import os import subprocess @@ -12,6 +13,14 @@ CUDA_TRIPLE = 'nvptx64-nvidia-cuda' +@functools.cache +def make_nrt(): + from numba.cuda import descriptor + from numba.cuda.cudadrv.nrt import compile_nrt_functions + + library = compile_nrt_functions(descriptor.cuda_target.target_context) + numba_cuda_runtime = library.get_asm_str().encode() + return numba_cuda_runtime def run_nvdisasm(cubin, flags): # nvdisasm only accepts input from a file, so we need to write out to a @@ -180,6 +189,9 @@ def get_cubin(self, cc=None): linker = driver.Linker.new(max_registers=self._max_registers, cc=cc) +# numba_cuda_runtime = make_nrt() + # linker.add_ptx(numba_cuda_runtime) + if linker.lto: ltoir = self.get_ltoir(cc=cc) linker.add_ltoir(ltoir) diff --git a/numba_cuda/numba/cuda/cudadrv/nrt.py b/numba_cuda/numba/cuda/cudadrv/nrt.py new file mode 100644 index 0000000..7aad320 --- /dev/null +++ b/numba_cuda/numba/cuda/cudadrv/nrt.py @@ -0,0 +1,126 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. + +from llvmlite import ir +from numba.core import cgutils, config +from numba.core.runtime.nrtdynmod import ( + _define_atomic_cas, + _define_atomic_inc_dec, + _define_nrt_incref, + _define_nrt_meminfo_data, + _pointer_type, + incref_decref_ty, +) + + + + +def _define_nrt_decref(module, atomic_decr): + """ + Implement NRT_decref in the module + """ + fn_decref = cgutils.get_or_insert_function( + module, incref_decref_ty, "NRT_decref" + ) + # Cannot inline this for refcount pruning to work + + fn_decref.attributes.add("noinline") + calldtor = ir.Function( + module, + ir.FunctionType(ir.VoidType(), [_pointer_type, _pointer_type]), + name="NRT_MemInfo_call_dtor", + ) + + builder = ir.IRBuilder(fn_decref.append_basic_block()) + [ptr] = fn_decref.args + is_null = builder.icmp_unsigned( + "==", ptr, cgutils.get_null_value(ptr.type) + ) + with cgutils.if_unlikely(builder, is_null): + builder.ret_void() + + # For memory fence usage, see https://llvm.org/docs/Atomics.html + + # A release fence is used before the relevant write operation. + # No-op on x86. On POWER, it lowers to lwsync. + # builder.fence("release") + + word_ptr = builder.bitcast(ptr, atomic_decr.args[0].type) + + if config.DEBUG_NRT: + cgutils.printf( + builder, "*** NRT_Decref %zu [%p]\n", builder.load(word_ptr), ptr + ) + newrefct = builder.call( + atomic_decr, [word_ptr] # nvvmutils.atomic_dec_int64, etc + ) + + refct_eq_0 = builder.icmp_unsigned( + "==", newrefct, ir.Constant(newrefct.type, 0) + ) + with cgutils.if_unlikely(builder, refct_eq_0): + # An acquire fence is used after the relevant read operation. + # No-op on x86. 
On POWER, it lowers to lwsync. + # builder.fence("acquire") + builder.call( + calldtor, + [ + ptr, + ir.Constant.inttoptr( + ir.Constant(ir.IntType(64), memsys), _pointer_type + ), + ], + ) + builder.ret_void() + + +def create_nrt_module(ctx): + """ + Create an IR module defining the LLVM NRT functions. + A (IR module, library) tuple is returned. + """ + codegen = ctx.codegen() + library = codegen.create_library("nrt") + + # Implement LLVM module with atomic ops + ir_mod = library.create_ir_module("nrt_module") + + atomic_inc = _define_atomic_inc_dec(ir_mod, "add", ordering="monotonic") + atomic_dec = _define_atomic_inc_dec(ir_mod, "sub", ordering="monotonic") + _define_atomic_cas(ir_mod, ordering="monotonic") + + _define_nrt_meminfo_data(ir_mod) + _define_nrt_incref(ir_mod, atomic_inc) + _define_nrt_decref(ir_mod, atomic_dec) + + # _define_nrt_unresolved_abort(ctx, ir_mod) + return ir_mod, library + + +def compile_nrt_functions(ctx): + """ + Compile all LLVM NRT functions and return a library containing them. + The library is created using the given target context. + """ + ir_mod, library = create_nrt_module(ctx) + + library.add_ir_module(ir_mod) + library.finalize() + breakpoint() + + return library + + +# compile NRT functions written in lowering into a PTX string +#library = compile_nrt_functions(descriptor.cuda_target.target_context) +#_cuda_nrt_library = compile_nrt_functions( +# descriptor.cuda_target.target_context +#) +#numba_cuda_runtime = _cuda_nrt_library._get_ptxes()[0].encode() + +def NRT_MemSys_new(): + # TODO + pass + + +memsys = NRT_MemSys_new() +numba_cuda_runtime=None diff --git a/numba_cuda/numba/cuda/runtime/nrt.cu b/numba_cuda/numba/cuda/runtime/nrt.cu new file mode 100644 index 0000000..f538d3e --- /dev/null +++ b/numba_cuda/numba/cuda/runtime/nrt.cu @@ -0,0 +1,123 @@ + +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef _NRT_H +#define _NRT_H + +#include + +typedef __device__ void (*NRT_dtor_function)(void* ptr, size_t size, void* info); +typedef __device__ void (*NRT_dealloc_func)(void* ptr, void* dealloc_info); + +typedef struct MemInfo NRT_MemInfo; + +extern "C" { +struct MemInfo { + cuda::atomic refct; + NRT_dtor_function dtor; + void* dtor_info; + void* data; + size_t size; +}; +} + +// Globally needed variables +struct NRT_MemSys { + struct { + bool enabled; + cuda::atomic alloc; + cuda::atomic free; + cuda::atomic mi_alloc; + cuda::atomic mi_free; + } stats; +}; + +/* The Memory System object */ +__device__ NRT_MemSys* TheMSys; + +extern "C" __device__ void* NRT_Allocate(size_t size) +{ + void* ptr = NULL; + ptr = malloc(size); +// if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; } + return ptr; +} + +extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi, + void* data, + size_t size, + NRT_dtor_function dtor, + void* dtor_info) +// NRT_MemSys* TheMSys) +{ + mi->refct = 1; /* starts with 1 refct */ + mi->dtor = dtor; + mi->dtor_info = dtor_info; + mi->data = data; + mi->size = size; +// if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; } +} + +__device__ NRT_MemInfo* NRT_MemInfo_new( + void* data, size_t size, NRT_dtor_function dtor, void* dtor_info) +{ + NRT_MemInfo* mi = (NRT_MemInfo*)NRT_Allocate(sizeof(NRT_MemInfo)); + if (mi != NULL) { NRT_MemInfo_init(mi, data, size, dtor, dtor_info); } + return mi; +} + +extern "C" __device__ void NRT_Free(void* ptr) +{ + free(ptr); + //if (TheMSys->stats.enabled) { TheMSys->stats.free++; } +} + +extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi) +{ + NRT_Free(mi); +} + +extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi) +{ + NRT_dealloc(mi); + //if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; } +} +extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi) +{ + if (mi->dtor) /* We have a destructor */ + mi->dtor(mi->data, mi->size, NULL); + /* Clear and release MemInfo */ + NRT_MemInfo_destroy(mi); +} + +/* + c++ version of the NRT_decref function that usually is added to + the final kernel link in PTX form by numba. This version may be + used by c++ APIs that accept ownership of live objects and must + manage them going forward. 
+*/ +extern "C" __device__ void NRT_decref(NRT_MemInfo* mi) +{ + mi->refct--; + if (mi->refct == 0) { NRT_MemInfo_call_dtor(mi); } +} + +#endif + +extern "C" __device__ void NRT_incref(NRT_MemInfo* mi) +{ + mi->refct++; +} diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index 6402ff3..f43b099 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -67,6 +67,7 @@ def resolve_value_type(self, val): class CUDATargetContext(BaseContext): implement_powi_as_math_call = True strict_alignment = True + enable_nrt = True def __init__(self, typingctx, target='cuda'): super().__init__(typingctx, target) From 0f9bc4a1810e671fac96915112107c9df10d03ef Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 15 Jul 2024 08:08:50 -0700 Subject: [PATCH 02/52] cleanup --- numba_cuda/numba/cuda/codegen.py | 12 --- numba_cuda/numba/cuda/cudadrv/nrt.py | 126 --------------------------- numba_cuda/numba/cuda/runtime/nrt.cu | 16 ---- 3 files changed, 154 deletions(-) delete mode 100644 numba_cuda/numba/cuda/cudadrv/nrt.py diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py index cd52d95..6009dcb 100644 --- a/numba_cuda/numba/cuda/codegen.py +++ b/numba_cuda/numba/cuda/codegen.py @@ -4,7 +4,6 @@ from numba.core.codegen import Codegen, CodeLibrary from .cudadrv import devices, driver, nvvm, runtime from numba.cuda.cudadrv.libs import get_cudalib -import functools import os import subprocess @@ -13,14 +12,6 @@ CUDA_TRIPLE = 'nvptx64-nvidia-cuda' -@functools.cache -def make_nrt(): - from numba.cuda import descriptor - from numba.cuda.cudadrv.nrt import compile_nrt_functions - - library = compile_nrt_functions(descriptor.cuda_target.target_context) - numba_cuda_runtime = library.get_asm_str().encode() - return numba_cuda_runtime def run_nvdisasm(cubin, flags): # nvdisasm only accepts input from a file, so we need to write out to a @@ -189,9 +180,6 @@ def get_cubin(self, cc=None): linker = driver.Linker.new(max_registers=self._max_registers, cc=cc) -# numba_cuda_runtime = make_nrt() - # linker.add_ptx(numba_cuda_runtime) - if linker.lto: ltoir = self.get_ltoir(cc=cc) linker.add_ltoir(ltoir) diff --git a/numba_cuda/numba/cuda/cudadrv/nrt.py b/numba_cuda/numba/cuda/cudadrv/nrt.py deleted file mode 100644 index 7aad320..0000000 --- a/numba_cuda/numba/cuda/cudadrv/nrt.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - -from llvmlite import ir -from numba.core import cgutils, config -from numba.core.runtime.nrtdynmod import ( - _define_atomic_cas, - _define_atomic_inc_dec, - _define_nrt_incref, - _define_nrt_meminfo_data, - _pointer_type, - incref_decref_ty, -) - - - - -def _define_nrt_decref(module, atomic_decr): - """ - Implement NRT_decref in the module - """ - fn_decref = cgutils.get_or_insert_function( - module, incref_decref_ty, "NRT_decref" - ) - # Cannot inline this for refcount pruning to work - - fn_decref.attributes.add("noinline") - calldtor = ir.Function( - module, - ir.FunctionType(ir.VoidType(), [_pointer_type, _pointer_type]), - name="NRT_MemInfo_call_dtor", - ) - - builder = ir.IRBuilder(fn_decref.append_basic_block()) - [ptr] = fn_decref.args - is_null = builder.icmp_unsigned( - "==", ptr, cgutils.get_null_value(ptr.type) - ) - with cgutils.if_unlikely(builder, is_null): - builder.ret_void() - - # For memory fence usage, see https://llvm.org/docs/Atomics.html - - # A release fence is used before the relevant write operation. - # No-op on x86. On POWER, it lowers to lwsync. 
- # builder.fence("release") - - word_ptr = builder.bitcast(ptr, atomic_decr.args[0].type) - - if config.DEBUG_NRT: - cgutils.printf( - builder, "*** NRT_Decref %zu [%p]\n", builder.load(word_ptr), ptr - ) - newrefct = builder.call( - atomic_decr, [word_ptr] # nvvmutils.atomic_dec_int64, etc - ) - - refct_eq_0 = builder.icmp_unsigned( - "==", newrefct, ir.Constant(newrefct.type, 0) - ) - with cgutils.if_unlikely(builder, refct_eq_0): - # An acquire fence is used after the relevant read operation. - # No-op on x86. On POWER, it lowers to lwsync. - # builder.fence("acquire") - builder.call( - calldtor, - [ - ptr, - ir.Constant.inttoptr( - ir.Constant(ir.IntType(64), memsys), _pointer_type - ), - ], - ) - builder.ret_void() - - -def create_nrt_module(ctx): - """ - Create an IR module defining the LLVM NRT functions. - A (IR module, library) tuple is returned. - """ - codegen = ctx.codegen() - library = codegen.create_library("nrt") - - # Implement LLVM module with atomic ops - ir_mod = library.create_ir_module("nrt_module") - - atomic_inc = _define_atomic_inc_dec(ir_mod, "add", ordering="monotonic") - atomic_dec = _define_atomic_inc_dec(ir_mod, "sub", ordering="monotonic") - _define_atomic_cas(ir_mod, ordering="monotonic") - - _define_nrt_meminfo_data(ir_mod) - _define_nrt_incref(ir_mod, atomic_inc) - _define_nrt_decref(ir_mod, atomic_dec) - - # _define_nrt_unresolved_abort(ctx, ir_mod) - return ir_mod, library - - -def compile_nrt_functions(ctx): - """ - Compile all LLVM NRT functions and return a library containing them. - The library is created using the given target context. - """ - ir_mod, library = create_nrt_module(ctx) - - library.add_ir_module(ir_mod) - library.finalize() - breakpoint() - - return library - - -# compile NRT functions written in lowering into a PTX string -#library = compile_nrt_functions(descriptor.cuda_target.target_context) -#_cuda_nrt_library = compile_nrt_functions( -# descriptor.cuda_target.target_context -#) -#numba_cuda_runtime = _cuda_nrt_library._get_ptxes()[0].encode() - -def NRT_MemSys_new(): - # TODO - pass - - -memsys = NRT_MemSys_new() -numba_cuda_runtime=None diff --git a/numba_cuda/numba/cuda/runtime/nrt.cu b/numba_cuda/numba/cuda/runtime/nrt.cu index f538d3e..6ced244 100644 --- a/numba_cuda/numba/cuda/runtime/nrt.cu +++ b/numba_cuda/numba/cuda/runtime/nrt.cu @@ -1,19 +1,3 @@ - -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ #ifndef _NRT_H #define _NRT_H From 1c3517f943c744f126bed148a806671dead5bc25 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 29 Jul 2024 06:31:15 -0700 Subject: [PATCH 03/52] enough to launch a kernel --- numba_cuda/numba/cuda/cudadrv/driver.py | 208 +++++++++++++++++++++++- 1 file changed, 207 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 875497e..7ec704d 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2596,7 +2596,14 @@ class Linker(metaclass=ABCMeta): @classmethod def new(cls, max_registers=0, lineinfo=False, cc=None): if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: - return MVCLinker(max_registers, lineinfo, cc) + # TODO: circular + from . import runtime + driver_ver, runtime_ver = driver.get_version(), runtime.get_version() + if driver_ver >= (12, 0) and runtime_ver > driver_ver: + return PatchedLinker(max_registers, lineinfo, cc) + else: + return MVCLinker(max_registers, lineinfo, cc) + elif USE_NV_BINDING: return CudaPythonLinker(max_registers, lineinfo, cc) else: @@ -2952,6 +2959,205 @@ def complete(self): cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char)) return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,))) +class LinkableCode: + """An object that can be passed in the `link` list argument to `@cuda.jit` + kernels to supply code to be linked from memory.""" + + def __init__(self, data, name=None): + self.data = data + self._name = name + + @property + def name(self): + return self._name or self.default_name + + +class PTXSource(LinkableCode): + """PTX Source code in memory""" + + kind = FILE_EXTENSION_MAP["ptx"] + default_name = "" + + +class CUSource(LinkableCode): + """CUDA C/C++ Source code in memory""" + + kind = "cu" + default_name = "" + + +class Fatbin(LinkableCode): + """A fatbin ELF in memory""" + + kind = FILE_EXTENSION_MAP["fatbin"] + default_name = "" + + +class Cubin(LinkableCode): + """A cubin ELF in memory""" + + kind = FILE_EXTENSION_MAP["cubin"] + default_name = "" + + +class Archive(LinkableCode): + """An archive of objects in memory""" + + kind = FILE_EXTENSION_MAP["a"] + default_name = "" + + +class Object(LinkableCode): + """An object file in memory""" + + kind = FILE_EXTENSION_MAP["o"] + default_name = "" + + +class LTOIR(LinkableCode): + """An LTOIR file in memory""" + + kind = "ltoir" + default_name = "" + + +class PatchedLinker(Linker): + def __init__( + self, + max_registers=None, + lineinfo=False, + cc=None, + lto=False, + additional_flags=None, + ): + try: + from pynvjitlink.api import NvJitLinker, NvJitLinkError + except ImportError as err: + raise ImportError(_MVC_ERROR_MESSAGE) from err + + if cc is None: + raise RuntimeError("PatchedLinker requires CC to be specified") + if not any(isinstance(cc, t) for t in [list, tuple]): + raise TypeError("`cc` must be a list or tuple of length 2") + + sm_ver = f"{cc[0] * 10 + cc[1]}" + arch = f"-arch=sm_{sm_ver}" + options = [arch] + if max_registers: + options.append(f"-maxrregcount={max_registers}") + if lineinfo: + options.append("-lineinfo") + if lto: + options.append("-lto") + if additional_flags is not None: + options.extend(additional_flags) + + self._linker = NvJitLinker(*options) + self.lto = lto + self.options = options + + @property + def info_log(self): + return self._linker.info_log + + @property + def error_log(self): + return self._linker.error_log + + def add_ptx(self, ptx, name=""): + self._linker.add_ptx(ptx, name) + 
+ def add_fatbin(self, fatbin, name=""): + self._linker.add_fatbin(fatbin, name) + + def add_ltoir(self, ltoir, name=""): + self._linker.add_ltoir(ltoir, name) + + def add_object(self, obj, name=""): + self._linker.add_object(obj, name) + + def add_file_guess_ext(self, path_or_code): + # Numba's add_file_guess_ext expects to always be passed a path to a + # file that it will load from the filesystem to link. We augment it + # here with the ability to provide a file from memory. + + # To maintain compatibility with the original interface, all strings + # are treated as paths in the filesystem. + if isinstance(path_or_code, str): + # Upstream numba does not yet recognize LTOIR, so handle that + # separately here. + extension = pathlib.Path(path_or_code).suffix + if extension == ".ltoir": + self.add_file(path_or_code, "ltoir") + else: + # Use Numba's logic for non-LTOIR + super().add_file_guess_ext(path_or_code) + + return + + # Otherwise, we should have been given a LinkableCode object + if not isinstance(path_or_code, LinkableCode): + raise TypeError("Expected path to file or a LinkableCode object") + + if path_or_code.kind == "cu": + self.add_cu(path_or_code.data, path_or_code.name) + else: + self.add_data(path_or_code.data, path_or_code.kind, path_or_code.name) + + def add_file(self, path, kind): + try: + with open(path, "rb") as f: + data = f.read() + except FileNotFoundError: + raise LinkerError(f"{path} not found") + + name = pathlib.Path(path).name + self.add_data(data, kind, name) + + def add_data(self, data, kind, name): + if kind == FILE_EXTENSION_MAP["cubin"]: + fn = self._linker.add_cubin + elif kind == FILE_EXTENSION_MAP["fatbin"]: + fn = self._linker.add_fatbin + elif kind == FILE_EXTENSION_MAP["a"]: + fn = self._linker.add_library + elif kind == FILE_EXTENSION_MAP["ptx"]: + return self.add_ptx(data, name) + elif kind == FILE_EXTENSION_MAP["o"]: + fn = self._linker.add_object + elif kind == "ltoir": + fn = self._linker.add_ltoir + else: + raise LinkerError(f"Don't know how to link {kind}") + + try: + fn(data, name) + except NvJitLinkError as e: + raise LinkerError from e + + def add_cu(self, cu, name): + with driver.get_active_context() as ac: + dev = driver.get_device(ac.devnum) + cc = dev.compute_capability + + ptx, log = nvrtc.compile(cu, name, cc) + + if config.DUMP_ASSEMBLY: + print(("ASSEMBLY %s" % name).center(80, "-")) + print(ptx) + print("=" * 80) + + # Link the program's PTX using the normal linker mechanism + ptx_name = os.path.splitext(name)[0] + ".ptx" + self.add_ptx(ptx.encode(), ptx_name) + + def complete(self): + try: + cubin = self._linker.get_linked_cubin() + self._linker._complete = True + return cubin + except NvJitLinkError as e: + raise LinkerError from e # ----------------------------------------------------------------------------- From cbcbbab982b5cae4940959429752f67d1f74ad7d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 29 Jul 2024 07:07:17 -0700 Subject: [PATCH 04/52] pass through kwargs --- numba_cuda/numba/cuda/cudadrv/driver.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 7ec704d..5205355 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -79,7 +79,6 @@ def make_logger(): logger.addHandler(logging.NullHandler()) return logger - class DeadMemoryError(RuntimeError): pass @@ -2594,13 +2593,13 @@ class Linker(metaclass=ABCMeta): """Abstract base class for linkers""" 
@classmethod - def new(cls, max_registers=0, lineinfo=False, cc=None): + def new(cls, max_registers=0, lineinfo=False, cc=None, **kwargs): if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: # TODO: circular from . import runtime driver_ver, runtime_ver = driver.get_version(), runtime.get_version() if driver_ver >= (12, 0) and runtime_ver > driver_ver: - return PatchedLinker(max_registers, lineinfo, cc) + return PatchedLinker(max_registers, lineinfo, cc, **kwargs) else: return MVCLinker(max_registers, lineinfo, cc) From 4406809eaf2c334ad4ce462d1d6b0426bae72c2d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 29 Jul 2024 07:08:31 -0700 Subject: [PATCH 05/52] patch_cuda once --- numba_cuda/numba/cuda/cudadrv/driver.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 5205355..cb1c89d 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2599,6 +2599,8 @@ def new(cls, max_registers=0, lineinfo=False, cc=None, **kwargs): from . import runtime driver_ver, runtime_ver = driver.get_version(), runtime.get_version() if driver_ver >= (12, 0) and runtime_ver > driver_ver: + # runs once + patch_cuda() return PatchedLinker(max_registers, lineinfo, cc, **kwargs) else: return MVCLinker(max_registers, lineinfo, cc) @@ -3020,6 +3022,18 @@ class LTOIR(LinkableCode): default_name = "" +@functools.lru_cache(maxsize=1) +def patch_cuda(): + from numba import cuda + cuda.Archive = Archive + cuda.CUSource = CUSource + cuda.Cubin = Cubin + cuda.Fatbin = Fatbin + cuda.Object = Object + cuda.PTXSource = PTXSource + cuda.LTOIR = LTOIR + + class PatchedLinker(Linker): def __init__( self, From dc887b63fbf58e8ded80e87a1bef1f87e3f8dff3 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 30 Jul 2024 08:11:26 -0700 Subject: [PATCH 06/52] refactor --- numba_cuda/numba/cuda/cudadrv/driver.py | 105 +----------------- .../numba/cuda/cudadrv/linkable_code.py | 62 +++++++++++ numba_cuda/numba/cuda/cudadrv/mappings.py | 22 ++++ numba_cuda/numba/cuda/device_init.py | 3 + 4 files changed, 92 insertions(+), 100 deletions(-) create mode 100644 numba_cuda/numba/cuda/cudadrv/linkable_code.py create mode 100644 numba_cuda/numba/cuda/cudadrv/mappings.py diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index cb1c89d..5194bba 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -35,6 +35,7 @@ from .error import CudaSupportError, CudaDriverError from .drvapi import API_PROTOTYPES from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid +from .mappings import FILE_EXTENSION_MAP from numba.cuda.cudadrv import enums, drvapi, nvrtc, _extras USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING @@ -2568,40 +2569,18 @@ def launch_kernel(cufunc_handle, extra) -if USE_NV_BINDING: - jitty = binding.CUjitInputType - FILE_EXTENSION_MAP = { - 'o': jitty.CU_JIT_INPUT_OBJECT, - 'ptx': jitty.CU_JIT_INPUT_PTX, - 'a': jitty.CU_JIT_INPUT_LIBRARY, - 'lib': jitty.CU_JIT_INPUT_LIBRARY, - 'cubin': jitty.CU_JIT_INPUT_CUBIN, - 'fatbin': jitty.CU_JIT_INPUT_FATBINARY, - } -else: - FILE_EXTENSION_MAP = { - 'o': enums.CU_JIT_INPUT_OBJECT, - 'ptx': enums.CU_JIT_INPUT_PTX, - 'a': enums.CU_JIT_INPUT_LIBRARY, - 'lib': enums.CU_JIT_INPUT_LIBRARY, - 'cubin': enums.CU_JIT_INPUT_CUBIN, - 'fatbin': enums.CU_JIT_INPUT_FATBINARY, - } - - class Linker(metaclass=ABCMeta): """Abstract base class for 
linkers""" @classmethod - def new(cls, max_registers=0, lineinfo=False, cc=None, **kwargs): + def new(cls, max_registers=0, lineinfo=False, cc=None, lto=None, additional_flags=None): if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: # TODO: circular from . import runtime driver_ver, runtime_ver = driver.get_version(), runtime.get_version() if driver_ver >= (12, 0) and runtime_ver > driver_ver: # runs once - patch_cuda() - return PatchedLinker(max_registers, lineinfo, cc, **kwargs) + return PyNvJitLinker(max_registers, lineinfo, cc, lto, additional_flags) else: return MVCLinker(max_registers, lineinfo, cc) @@ -2960,81 +2939,7 @@ def complete(self): cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char)) return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,))) -class LinkableCode: - """An object that can be passed in the `link` list argument to `@cuda.jit` - kernels to supply code to be linked from memory.""" - - def __init__(self, data, name=None): - self.data = data - self._name = name - - @property - def name(self): - return self._name or self.default_name - - -class PTXSource(LinkableCode): - """PTX Source code in memory""" - - kind = FILE_EXTENSION_MAP["ptx"] - default_name = "" - - -class CUSource(LinkableCode): - """CUDA C/C++ Source code in memory""" - - kind = "cu" - default_name = "" - - -class Fatbin(LinkableCode): - """A fatbin ELF in memory""" - - kind = FILE_EXTENSION_MAP["fatbin"] - default_name = "" - - -class Cubin(LinkableCode): - """A cubin ELF in memory""" - - kind = FILE_EXTENSION_MAP["cubin"] - default_name = "" - - -class Archive(LinkableCode): - """An archive of objects in memory""" - - kind = FILE_EXTENSION_MAP["a"] - default_name = "" - - -class Object(LinkableCode): - """An object file in memory""" - - kind = FILE_EXTENSION_MAP["o"] - default_name = "" - - -class LTOIR(LinkableCode): - """An LTOIR file in memory""" - - kind = "ltoir" - default_name = "" - - -@functools.lru_cache(maxsize=1) -def patch_cuda(): - from numba import cuda - cuda.Archive = Archive - cuda.CUSource = CUSource - cuda.Cubin = Cubin - cuda.Fatbin = Fatbin - cuda.Object = Object - cuda.PTXSource = PTXSource - cuda.LTOIR = LTOIR - - -class PatchedLinker(Linker): +class PyNvJitLinker(Linker): def __init__( self, max_registers=None, @@ -3049,7 +2954,7 @@ def __init__( raise ImportError(_MVC_ERROR_MESSAGE) from err if cc is None: - raise RuntimeError("PatchedLinker requires CC to be specified") + raise RuntimeError("PyNvJitLinker requires CC to be specified") if not any(isinstance(cc, t) for t in [list, tuple]): raise TypeError("`cc` must be a list or tuple of length 2") diff --git a/numba_cuda/numba/cuda/cudadrv/linkable_code.py b/numba_cuda/numba/cuda/cudadrv/linkable_code.py new file mode 100644 index 0000000..5280829 --- /dev/null +++ b/numba_cuda/numba/cuda/cudadrv/linkable_code.py @@ -0,0 +1,62 @@ +from .mappings import FILE_EXTENSION_MAP + +class LinkableCode: + """An object that can be passed in the `link` list argument to `@cuda.jit` + kernels to supply code to be linked from memory.""" + + def __init__(self, data, name=None): + self.data = data + self._name = name + + @property + def name(self): + return self._name or self.default_name + + +class PTXSource(LinkableCode): + """PTX Source code in memory""" + + kind = FILE_EXTENSION_MAP["ptx"] + default_name = "" + + +class CUSource(LinkableCode): + """CUDA C/C++ Source code in memory""" + + kind = "cu" + default_name = "" + + +class Fatbin(LinkableCode): + """A fatbin ELF in memory""" + + kind = FILE_EXTENSION_MAP["fatbin"] 
+ default_name = "" + + +class Cubin(LinkableCode): + """A cubin ELF in memory""" + + kind = FILE_EXTENSION_MAP["cubin"] + default_name = "" + + +class Archive(LinkableCode): + """An archive of objects in memory""" + + kind = FILE_EXTENSION_MAP["a"] + default_name = "" + + +class Object(LinkableCode): + """An object file in memory""" + + kind = FILE_EXTENSION_MAP["o"] + default_name = "" + + +class LTOIR(LinkableCode): + """An LTOIR file in memory""" + + kind = "ltoir" + default_name = "" diff --git a/numba_cuda/numba/cuda/cudadrv/mappings.py b/numba_cuda/numba/cuda/cudadrv/mappings.py new file mode 100644 index 0000000..3750324 --- /dev/null +++ b/numba_cuda/numba/cuda/cudadrv/mappings.py @@ -0,0 +1,22 @@ +from numba import config +from cuda import cuda +from . import enums +if config.CUDA_USE_NVIDIA_BINDING: + jitty = cuda.CUjitInputType + FILE_EXTENSION_MAP = { + 'o': jitty.CU_JIT_INPUT_OBJECT, + 'ptx': jitty.CU_JIT_INPUT_PTX, + 'a': jitty.CU_JIT_INPUT_LIBRARY, + 'lib': jitty.CU_JIT_INPUT_LIBRARY, + 'cubin': jitty.CU_JIT_INPUT_CUBIN, + 'fatbin': jitty.CU_JIT_INPUT_FATBINARY, + } +else: + FILE_EXTENSION_MAP = { + 'o': enums.CU_JIT_INPUT_OBJECT, + 'ptx': enums.CU_JIT_INPUT_PTX, + 'a': enums.CU_JIT_INPUT_LIBRARY, + 'lib': enums.CU_JIT_INPUT_LIBRARY, + 'cubin': enums.CU_JIT_INPUT_CUBIN, + 'fatbin': enums.CU_JIT_INPUT_FATBINARY, + } diff --git a/numba_cuda/numba/cuda/device_init.py b/numba_cuda/numba/cuda/device_init.py index 9df5ae9..e435290 100644 --- a/numba_cuda/numba/cuda/device_init.py +++ b/numba_cuda/numba/cuda/device_init.py @@ -31,6 +31,9 @@ shfl_xor_sync) from .kernels import reduction +from numba.cuda.cudadrv.linkable_code import ( + Archive, CUSource, Cubin, Fatbin, LinkableCode, LTOIR, Object, PTXSource +) reduce = Reduce = reduction.Reduce From b9898ec3a62f52602d66127cad68eec52a3cbd51 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Sun, 4 Aug 2024 10:48:52 -0700 Subject: [PATCH 07/52] merge latest/resolve conflicts --- .flake8 | 52 + .github/actions/compute-matrix/action.yaml | 7 - .github/workflows/build.yaml | 59 -- .github/workflows/conda-python-build.yaml | 125 +++ .github/workflows/conda-python-tests.yaml | 155 +++ .github/workflows/docs-build.yaml | 77 ++ .github/workflows/pr.yaml | 23 +- .github/workflows/publish.yaml | 44 + .github/workflows/wheels-build.yaml | 174 ++++ .github/workflows/wheels-test.yaml | 164 ++++ .pre-commit-config.yaml | 6 +- CMakeLists.txt | 18 - ci/build_conda.sh | 7 +- ci/build_docs.sh | 41 + ci/build_wheel.sh | 7 +- ci/test_conda.sh | 10 +- ci/test_wheel.sh | 8 +- ci/upload_conda.sh | 14 + .../{numba_cuda => numba-cuda}/meta.yaml | 10 +- numba_cuda/VERSION | 2 +- numba_cuda/numba/cuda/api.py | 3 +- numba_cuda/numba/cuda/cudadrv/_extras.c | 49 - numba_cuda/numba/cuda/cudadrv/devicearray.py | 3 +- numba_cuda/numba/cuda/cudadrv/driver.py | 31 +- numba_cuda/numba/cuda/cudadrv/drvapi.py | 12 +- numba_cuda/numba/cuda/cudadrv/dummyarray.py | 452 +++++++++ numba_cuda/numba/cuda/deviceufunc.py | 908 ++++++++++++++++++ numba_cuda/numba/cuda/runtime/nrt.cu | 107 +++ .../cuda/simulator/cudadrv/dummyarray.py | 4 + numba_cuda/numba/cuda/target.py | 1 + .../tests/cudapy/test_gufunc_scheduling.py | 2 +- .../numba/cuda/tests/cudapy/test_ipc.py | 8 +- .../cuda/tests/nocuda/test_dummyarray.py | 359 +++++++ numba_cuda/numba/cuda/vectorizers.py | 6 +- pyproject.toml | 32 +- setup.py | 70 ++ site-packages/_numba_cuda_redirector.py | 6 +- 37 files changed, 2832 insertions(+), 224 deletions(-) create mode 100644 .flake8 delete mode 100644 
.github/workflows/build.yaml create mode 100644 .github/workflows/conda-python-build.yaml create mode 100644 .github/workflows/conda-python-tests.yaml create mode 100644 .github/workflows/docs-build.yaml create mode 100644 .github/workflows/publish.yaml create mode 100644 .github/workflows/wheels-build.yaml create mode 100644 .github/workflows/wheels-test.yaml delete mode 100644 CMakeLists.txt create mode 100755 ci/build_docs.sh create mode 100755 ci/upload_conda.sh rename conda/recipes/{numba_cuda => numba-cuda}/meta.yaml (84%) delete mode 100644 numba_cuda/numba/cuda/cudadrv/_extras.c create mode 100644 numba_cuda/numba/cuda/cudadrv/dummyarray.py create mode 100644 numba_cuda/numba/cuda/deviceufunc.py create mode 100644 numba_cuda/numba/cuda/runtime/nrt.cu create mode 100644 numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py create mode 100644 numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py create mode 100644 setup.py diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..591e4a2 --- /dev/null +++ b/.flake8 @@ -0,0 +1,52 @@ +[flake8] +ignore = + # Extra space in brackets + E20, + # Multiple spaces around "," + E231,E241, + # Comments + E26, + # Assigning lambda expression + E731, + # Ambiguous variable names + E741, + # line break before binary operator + W503, + # line break after binary operator + W504, +max-line-length = 80 + +exclude = + __pycache__ + .git + *.pyc + *~ + *.o + *.so + *.cpp + *.c + *.h + +per-file-ignores = + # Slightly long line in the standard version file + numba_cuda/_version.py: E501 + # "Unused" imports / potentially undefined names in init files + numba_cuda/numba/cuda/__init__.py:F401,F403,F405 + numba_cuda/numba/cuda/simulator/__init__.py:F401,F403 + numba_cuda/numba/cuda/simulator/cudadrv/__init__.py:F401 + # Ignore star imports, unused imports, and "may be defined by star imports" + # errors in device_init because its purpose is to bring together a lot of + # the public API to be star-imported in numba.cuda.__init__ + numba_cuda/numba/cuda/device_init.py:F401,F403,F405 + # libdevice.py is an autogenerated file containing stubs for all the device + # functions. Some of the lines in docstrings are a little over-long, as they + # contain the URLs of the reference pages in the online libdevice + # documentation. 
+ numba_cuda/numba/cuda/libdevice.py:E501 + # Ignore too-long lines in the doc examples, prioritising readability + # in the docs over line length in the example source (especially given that + # the test code is already indented by 8 spaces) + numba_cuda/numba/cuda/tests/doc_examples/test_random.py:E501 + numba_cuda/numba/cuda/tests/doc_examples/test_cg.py:E501 + numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py:E501 + numba_cuda/numba/tests/doc_examples/test_interval_example.py:E501 diff --git a/.github/actions/compute-matrix/action.yaml b/.github/actions/compute-matrix/action.yaml index dab011d..3dd73bc 100644 --- a/.github/actions/compute-matrix/action.yaml +++ b/.github/actions/compute-matrix/action.yaml @@ -16,14 +16,7 @@ runs: set -eo pipefail export BUILD_MATRIX=" - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8' } - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'rockylinux8' } - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'rockylinux8' } - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'rockylinux8' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.9', LINUX_VER: 'rockylinux8' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.10', LINUX_VER: 'rockylinux8' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'rockylinux8' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'rockylinux8' } " export TEST_MATRIX=" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml deleted file mode 100644 index 00458da..0000000 --- a/.github/workflows/build.yaml +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -name: build - -on: - push: - tags: - - 'v*' - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-release - cancel-in-progress: true - -jobs: - compute-matrix: - runs-on: ubuntu-latest - outputs: - BUILD_MATRIX: ${{ steps.compute-matrix.outputs.BUILD_MATRIX }} - steps: - - uses: actions/checkout@v4 - - name: Compute Build Matrix - id: compute-matrix - uses: ./.github/actions/compute-matrix - build-wheels: - needs: - - compute-matrix - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 - with: - build_type: branch - script: "ci/build_wheel.sh" - matrix_filter: ${{ needs.compute-matrix.outputs.BUILD_MATRIX }} - build-conda: - needs: - - compute-matrix - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 - with: - build_type: branch - script: "ci/build_conda.sh" - matrix_filter: ${{ needs.compute-matrix.outputs.BUILD_MATRIX }} - publish-wheels: - needs: - - build-wheels - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 - with: - build_type: ${{ inputs.build_type || 'branch' }} - branch: ${{ inputs.branch }} - sha: ${{ inputs.sha }} - date: ${{ inputs.date }} - package-name: numba_cuda - publish-conda: - needs: - - build-conda - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 - with: - build_type: ${{ inputs.build_type || 'branch' }} - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} diff --git a/.github/workflows/conda-python-build.yaml b/.github/workflows/conda-python-build.yaml new file mode 100644 index 0000000..75e1028 --- /dev/null +++ b/.github/workflows/conda-python-build.yaml @@ -0,0 +1,125 @@ +on: + workflow_call: + inputs: + build_type: + required: true + type: string + branch: + 
type: string + date: + type: string + sha: + type: string + repo: + type: string + node_type: + type: string + default: "cpu8" + script: + type: string + default: "ci/build_python.sh" + matrix_filter: + type: string + default: "." + upload_to_anaconda: + type: boolean + required: false + +defaults: + run: + shell: bash + +permissions: + actions: read + checks: none + contents: read + deployments: none + discussions: none + id-token: write + issues: none + packages: read + pages: none + pull-requests: read + repository-projects: none + security-events: none + statuses: none + +jobs: + compute-matrix: + runs-on: ubuntu-latest + outputs: + MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} + steps: + - name: Compute Build Matrix + id: compute-matrix + run: | + set -eo pipefail + + # please keep the matrices sorted in ascending order by the following: + # + # [ARCH, PY_VER, CUDA_VER, LINUX_VER] + # + export MATRIX=" + # amd64 + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.2.2', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04' } + # arm64 + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.2.2', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04' } + " + + MATRIX="$( + yq -n -o json 'env(MATRIX)' | \ + jq -c '${{ inputs.matrix_filter }} | if (. 
| length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end' + )" + + echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" + build: + needs: compute-matrix + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} + runs-on: "linux-${{ matrix.ARCH }}-${{ inputs.node_type }}" + env: + RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts + container: + image: rapidsai/ci-conda:cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }} + env: + RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} + steps: + - uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 + - name: Standardize repository information + run: | + echo "RAPIDS_REPOSITORY=${{ inputs.repo || github.repository }}" >> "${GITHUB_ENV}" + echo "RAPIDS_SHA=$(git rev-parse HEAD)" >> "${GITHUB_ENV}" + echo "RAPIDS_REF_NAME=${{ inputs.branch || github.ref_name }}" >> "${GITHUB_ENV}" + echo "RAPIDS_NIGHTLY_DATE=${{ inputs.date }}" >> "${GITHUB_ENV}" + - name: Python build + run: ${{ inputs.script }} + env: + GH_TOKEN: ${{ github.token }} + - name: Upload conda repo + if: "!cancelled()" + uses: actions/upload-artifact@v4 + with: + name: conda-repo + path: "/tmp/conda-bld-output" + - name: Publish conda package + if: inputs.upload_to_anaconda + run: "ci/upload_conda.sh" + env: + CONDA_TOKEN: ${{ secrets.NVIDIA_CONDA_TOKEN }} diff --git a/.github/workflows/conda-python-tests.yaml b/.github/workflows/conda-python-tests.yaml new file mode 100644 index 0000000..ba9390e --- /dev/null +++ b/.github/workflows/conda-python-tests.yaml @@ -0,0 +1,155 @@ +on: + workflow_call: + inputs: + build_type: + required: true + type: string + branch: + type: string + date: + type: string + sha: + type: string + repo: + type: string + script: + type: string + default: "ci/test_python.sh" + run_codecov: + type: boolean + default: true + matrix_filter: + type: string + default: "." + container-options: + required: false + type: string + default: "-e _NOOP" + +defaults: + run: + shell: bash + +permissions: + actions: read + checks: none + contents: read + deployments: none + discussions: none + id-token: write + issues: none + packages: read + pages: none + pull-requests: read + repository-projects: none + security-events: none + statuses: none + +jobs: + compute-matrix: + runs-on: ubuntu-latest + env: + BUILD_TYPE: ${{ inputs.build_type }} + outputs: + MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} + steps: + - name: Validate Test Type + run: | + if [[ "$BUILD_TYPE" != "pull-request" ]] && [[ "$BUILD_TYPE" != "nightly" ]]; then + echo "Invalid build type! Must be 'nightly' or 'pull-request'." 
+ exit 1 + fi + - name: Compute Python Test Matrix + id: compute-matrix + run: | + set -eo pipefail + + # please keep the matrices sorted in ascending order by the following: + # + # [ARCH, PY_VER, CUDA_VER, LINUX_VER, GPU, DRIVER] + # + export MATRICES=" + pull-request: + # amd64 + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8', GPU: 'v100', DRIVER: 'earliest' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04', GPU: 'v100', DRIVER: 'latest' } + # arm64 + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.0.1', LINUX_VER: 'ubuntu20.04', GPU: 'a100', DRIVER: 'latest' } + nightly: + # amd64 + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.4.3', LINUX_VER: 'rockylinux8', GPU: 'v100', DRIVER: 'earliest' } + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.4.3', LINUX_VER: 'ubuntu20.04', GPU: 'v100', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04', GPU: 'v100', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.0.1', LINUX_VER: 'rockylinux8', GPU: 'v100', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04', GPU: 'v100', DRIVER: 'latest' } + # arm64 + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '11.4.3', LINUX_VER: 'ubuntu20.04', GPU: 'a100', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.0.1', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.2.2', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'latest' } + " + + TEST_MATRIX=$(yq -n 'env(MATRICES) | .[strenv(BUILD_TYPE)]') + export TEST_MATRIX + + MATRIX="$( + yq -n -o json 'env(TEST_MATRIX)' | \ + jq -c '${{ inputs.matrix_filter }} | if (. 
| length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end' + )" + + echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" + tests: + needs: compute-matrix + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} + runs-on: "linux-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-1" + env: + RAPIDS_COVERAGE_DIR: ${{ github.workspace }}/coverage-results + RAPIDS_TESTS_DIR: ${{ github.workspace }}/test-results + RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts + container: + image: rapidsai/ci-conda:cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }} + options: ${{ inputs.container-options }} + env: + RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + steps: + - uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 + - uses: actions/download-artifact@v4 + name: conda-repo + - name: Display structure of downloaded files + run: ls -R + - name: Standardize repository information + run: | + echo "RAPIDS_REPOSITORY=${{ inputs.repo || github.repository }}" >> "${GITHUB_ENV}" + echo "RAPIDS_SHA=$(git rev-parse HEAD)" >> "${GITHUB_ENV}" + echo "RAPIDS_REF_NAME=${{ inputs.branch || github.ref_name }}" >> "${GITHUB_ENV}" + echo "RAPIDS_NIGHTLY_DATE=${{ inputs.date }}" >> "${GITHUB_ENV}" + - name: Python tests + run: ${{ inputs.script }} + env: + GH_TOKEN: ${{ github.token }} + - name: Generate test report + uses: test-summary/action@v2.3 + with: + paths: "${{ env.RAPIDS_TESTS_DIR }}/*.xml" + if: always() + - name: Run codecov + if: inputs.run_codecov + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + run: | + codecovcli \ + -v \ + upload-process \ + -C ${{ github.sha }} \ + -s "${RAPIDS_COVERAGE_DIR}" \ + --handle-no-reports-found + - name: Upload additional artifacts + if: "!cancelled()" + run: rapids-upload-artifacts-dir cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch)_py${RAPIDS_PY_VERSION//.} diff --git a/.github/workflows/docs-build.yaml b/.github/workflows/docs-build.yaml new file mode 100644 index 0000000..476e5b2 --- /dev/null +++ b/.github/workflows/docs-build.yaml @@ -0,0 +1,77 @@ +on: + workflow_call: + inputs: + build_type: + required: true + type: string + branch: + type: string + date: + type: string + sha: + type: string + repo: + type: string + node_type: + type: string + default: "cpu8" + script: + type: string + default: "ci/build_docs.sh" + +defaults: + run: + shell: bash + +permissions: + actions: read + checks: none + contents: read + deployments: none + discussions: none + id-token: write + issues: none + packages: read + pages: none + pull-requests: read + repository-projects: none + security-events: none + statuses: none + +jobs: + build: + strategy: + fail-fast: false + runs-on: "linux-amd64-cpu4" + env: + RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts + container: + image: rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11 + env: + RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} + steps: + - uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 + - uses: actions/download-artifact@v4 + name: conda-repo + - name: Display structure of downloaded files + run: ls -R + - name: Standardize repository information + run: | + echo "RAPIDS_REPOSITORY=${{ inputs.repo || github.repository }}" >> "${GITHUB_ENV}" + echo "RAPIDS_SHA=$(git rev-parse HEAD)" >> "${GITHUB_ENV}" + echo "RAPIDS_REF_NAME=${{ inputs.branch || 
github.ref_name }}" >> "${GITHUB_ENV}" + echo "RAPIDS_NIGHTLY_DATE=${{ inputs.date }}" >> "${GITHUB_ENV}" + - name: Docs build + run: ${{ inputs.script }} + env: + GH_TOKEN: ${{ github.token }} + - name: Upload docs + if: "!cancelled()" + uses: actions/upload-artifact@v4 + with: + name: docs + path: ${{ github.workspace }}/docs/build/html diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 9adf087..7498eb1 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -1,4 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + name: pr on: @@ -19,11 +21,12 @@ jobs: - test-conda - build-wheels - test-wheels + - build-docs secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 with: enable_check_generated_files: false compute-matrix: @@ -39,7 +42,7 @@ jobs: build-conda: needs: - compute-matrix - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: ./.github/workflows/conda-python-build.yaml with: build_type: pull-request script: "ci/build_conda.sh" @@ -48,7 +51,7 @@ jobs: needs: - build-conda - compute-matrix - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: ./.github/workflows/conda-python-tests.yaml with: build_type: pull-request script: "ci/test_conda.sh" @@ -57,7 +60,7 @@ jobs: build-wheels: needs: - compute-matrix - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: ./.github/workflows/wheels-build.yaml with: build_type: pull-request script: "ci/build_wheel.sh" @@ -66,8 +69,14 @@ jobs: needs: - build-wheels - compute-matrix - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: ./.github/workflows/wheels-test.yaml with: build_type: pull-request script: "ci/test_wheel.sh" matrix_filter: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} + build-docs: + needs: + - build-conda + uses: ./.github/workflows/docs-build.yaml + with: + build_type: pull-request diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml new file mode 100644 index 0000000..986a693 --- /dev/null +++ b/.github/workflows/publish.yaml @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +name: Publish packages + +on: + push: + tags: + - 'v*' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-release + cancel-in-progress: true + + +jobs: + compute-matrix: + runs-on: ubuntu-latest + outputs: + BUILD_MATRIX: ${{ steps.compute-matrix.outputs.BUILD_MATRIX }} + TEST_MATRIX: ${{ steps.compute-matrix.outputs.TEST_MATRIX }} + steps: + - uses: actions/checkout@v4 + - name: Compute Build Matrix + id: compute-matrix + uses: ./.github/actions/compute-matrix + build-conda: + needs: compute-matrix + uses: ./.github/workflows/conda-python-build.yaml + secrets: inherit + with: + build_type: release + script: "ci/build_conda.sh" + matrix_filter: ${{ needs.compute-matrix.outputs.BUILD_MATRIX }} + upload_to_anaconda: true + build-wheels: + needs: compute-matrix + uses: ./.github/workflows/wheels-build.yaml + secrets: inherit + with: + build_type: release + script: "ci/build_wheel.sh" + matrix_filter: ${{ needs.compute-matrix.outputs.BUILD_MATRIX }} + upload_to_pypi: true diff --git a/.github/workflows/wheels-build.yaml b/.github/workflows/wheels-build.yaml new file mode 100644 index 0000000..c1d20ec --- /dev/null +++ b/.github/workflows/wheels-build.yaml @@ -0,0 +1,174 @@ +name: Build RAPIDS wheels + +on: + workflow_call: + inputs: + # repo and branch + repo: + type: string + branch: + type: string + date: + type: string + sha: + type: string + build_type: + required: true + type: string + script: + required: true + type: string + + # allow a bigger runner instance + node_type: + required: false + type: string + default: "cpu8" + + # general settings + matrix_filter: + type: string + default: "." + + # Extra repository that will be cloned into the project directory. + extra-repo: + required: false + type: string + default: '' + extra-repo-sha: + required: false + type: string + default: '' + # Note that this is the _name_ of a secret containing the key, not the key itself. 
+ extra-repo-deploy-key: + required: false + type: string + default: '' + + upload_to_pypi: + type: boolean + required: false + +defaults: + run: + shell: bash + +permissions: + actions: read + checks: none + contents: read + deployments: none + discussions: none + id-token: write + issues: none + packages: read + pages: none + pull-requests: read + repository-projects: none + security-events: none + statuses: none + +jobs: + + compute-matrix: + runs-on: ubuntu-latest + outputs: + MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} + steps: + - name: Compute Build Matrix + id: compute-matrix + run: | + set -eo pipefail + + # please keep the matrices sorted in ascending order by the following: + # + # [ARCH, PY_VER, CUDA_VER, LINUX_VER] + # + export MATRIX=" + # amd64 + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.5.1', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.5.1', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.5.1', LINUX_VER: 'rockylinux8' } + # arm64 + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '12.5.1', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.5.1', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.5.1', LINUX_VER: 'rockylinux8' } + " + + MATRIX="$( + yq -n -o json 'env(MATRIX)' | \ + jq -c '${{ inputs.matrix_filter }} | if (. 
| length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end' + )" + + echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" + build: + name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, ${{ matrix.ARCH }}, ${{ matrix.LINUX_VER }} + needs: [compute-matrix] + strategy: + matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} + runs-on: "linux-${{ matrix.ARCH }}-${{ inputs.node_type }}" + env: + RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts + container: + image: "rapidsai/ci-wheel:cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }}" + env: + RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} + + steps: + - name: checkout code repo + uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 # unshallow fetch for setuptools-scm + persist-credentials: false + + - name: Standardize repository information + uses: rapidsai/shared-actions/rapids-github-info@main + with: + repo: ${{ inputs.repo }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + + - name: Preprocess extra repos + id: preprocess-extras + if: ${{ inputs.extra-repo != '' }} + run: | + EXTRA_REPO_PATH=$(echo ${{ inputs.extra-repo }} | cut -d "/" -f 2) + echo "EXTRA_REPO_PATH=${EXTRA_REPO_PATH}" >> $GITHUB_OUTPUT + + - name: checkout extra repos + uses: actions/checkout@v4 + if: ${{ inputs.extra-repo != '' }} + with: + repository: ${{ inputs.extra-repo }} + ref: ${{ inputs.extra-repo-sha }} + path: "./${{ steps.preprocess-extras.outputs.EXTRA_REPO_PATH }}" + ssh-key: ${{ secrets[inputs.extra-repo-deploy-key] }} + persist-credentials: false + + - name: Build and repair the wheel + run: | + ${{ inputs.script }} + env: + GH_TOKEN: ${{ github.token }} + # Use a shell that loads the rc file so that we get the compiler settings + shell: bash -leo pipefail {0} + - name: Upload wheel + if: "!cancelled()" + uses: actions/upload-artifact@v4 + with: + name: wheel + path: ${{ env.package_path }} + - name: Publish wheel + if: inputs.upload_to_pypi + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.RAPIDSAI_PYPI_TOKEN }} diff --git a/.github/workflows/wheels-test.yaml b/.github/workflows/wheels-test.yaml new file mode 100644 index 0000000..10acf54 --- /dev/null +++ b/.github/workflows/wheels-test.yaml @@ -0,0 +1,164 @@ +name: Test RAPIDS wheels + +on: + workflow_call: + inputs: + # repo and branch + repo: + type: string + branch: + type: string + date: + type: string + sha: + type: string + build_type: + required: true + type: string + script: + type: string + default: "ci/test_wheel.sh" + matrix_filter: + type: string + default: "." + container-options: + required: false + type: string + default: "-e _NOOP" + test_summary_show: + required: false + type: string + default: "fail" + # the use of secrets in shared-workflows is discouraged, especially for public repositories. + # these values were added for situations where the use of secrets is unavoidable. 
+ secrets: + RAPIDS_AUX_SECRET_1: + required: false + +defaults: + run: + shell: bash + +permissions: + actions: read + checks: none + contents: read + deployments: none + discussions: none + id-token: write + issues: none + packages: read + pages: none + pull-requests: read + repository-projects: none + security-events: none + statuses: none + +jobs: + compute-matrix: + runs-on: ubuntu-latest + env: + BUILD_TYPE: ${{ inputs.build_type }} + outputs: + MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} + steps: + - name: Validate test type + run: | + if [[ "$BUILD_TYPE" != "pull-request" ]] && [[ "$BUILD_TYPE" != "nightly" ]]; then + echo "Invalid build type! Must be 'nightly' or 'pull-request'." + exit 1 + fi + - name: Compute test matrix + id: compute-matrix + run: | + set -eo pipefail + + # please keep the matrices sorted in ascending order by the following: + # + # [ARCH, PY_VER, CUDA_VER, LINUX_VER, GPU, DRIVER] + # + export MATRICES=" + pull-request: + # amd64 + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04', gpu: 'v100', driver: 'latest' } + # arm64 + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu20.04', gpu: 'a100', driver: 'latest' } + nightly: + # amd64 + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu20.04', gpu: 'v100', driver: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04', gpu: 'v100', driver: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04', gpu: 'v100', driver: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.0.1', LINUX_VER: 'ubuntu20.04', gpu: 'v100', driver: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.0.1', LINUX_VER: 'ubuntu20.04', gpu: 'v100', driver: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04', gpu: 'v100', driver: 'latest' } + # arm64 + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu20.04', gpu: 'a100', driver: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.0.1', LINUX_VER: 'ubuntu20.04', gpu: 'a100', driver: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.5.1', LINUX_VER: 'ubuntu22.04', gpu: 'a100', driver: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04', gpu: 'a100', driver: 'latest' } + " + + TEST_MATRIX=$(yq -n 'env(MATRICES) | .[strenv(BUILD_TYPE)]') + export TEST_MATRIX + + MATRIX="$( + yq -n -o json 'env(TEST_MATRIX)' | \ + jq -c '${{ inputs.matrix_filter }} | if (. 
| length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end' + )" + + echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" + test: + name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, ${{ matrix.ARCH }}, ${{ matrix.LINUX_VER }}, ${{ matrix.gpu }} + needs: compute-matrix + env: + RAPIDS_TESTS_DIR: ${{ github.workspace }}/test-results + RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} + runs-on: "linux-${{ matrix.ARCH }}-gpu-${{ matrix.gpu }}-${{ matrix.driver }}-1" + container: + image: "rapidsai/citestwheel:cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }}" + options: ${{ inputs.container-options }} + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} # GPU jobs must set this container env variable + RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} + steps: + - name: Run nvidia-smi to make sure GPU is working + run: nvidia-smi + + - name: checkout code repo + uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 # unshallow fetch for setuptools-scm + persist-credentials: false + - uses: actions/download-artifact@v4 + name: wheel + - name: Display structure of downloaded files + run: ls -R + - name: Standardize repository information + uses: rapidsai/shared-actions/rapids-github-info@main + with: + repo: ${{ inputs.repo }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + + - name: Run tests + run: ${{ inputs.script }} + env: + GH_TOKEN: ${{ github.token }} + RAPIDS_AUX_SECRET_1: ${{ secrets.RAPIDS_AUX_SECRET_1 }} + + - name: Generate test report + uses: test-summary/action@v2.3 + with: + paths: "${{ env.RAPIDS_TESTS_DIR }}/*.xml" + show: ${{ inputs.test_summary_show }} + if: always() + + - name: Upload additional artifacts + if: "!cancelled()" + run: rapids-upload-artifacts-dir cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch)_py${RAPIDS_PY_VERSION//.} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4e6a92c..0a114cd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1 +1,5 @@ -repos: [] +repos: +- repo: https://github.com/PyCQA/flake8 + rev: 7.1.0 + hooks: + - id: flake8 diff --git a/CMakeLists.txt b/CMakeLists.txt deleted file mode 100644 index acd6ac1..0000000 --- a/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) - -project( - numba_cuda - VERSION ${SKBUILD_PROJECT_VERSION} - LANGUAGES C -) - -find_package(Python COMPONENTS Interpreter Development REQUIRED) - -Python_add_library(_extras MODULE numba_cuda/numba/cuda/cudadrv/_extras.c WITH_SOABI) - -target_compile_options(_extras PRIVATE -Werror -Wall) - -install(TARGETS _extras LIBRARY DESTINATION numba_cuda/numba/cuda/cudadrv) -install(FILES site-packages/_numba_cuda_redirector.py - site-packages/_numba_cuda_redirector.pth - DESTINATION ${SKBUILD_PLATLIB_DIR}) diff --git a/ci/build_conda.sh b/ci/build_conda.sh index 6c170eb..c4b86be 100755 --- a/ci/build_conda.sh +++ b/ci/build_conda.sh @@ -9,12 +9,11 @@ source rapids-configure-sccache source rapids-date-string -export CMAKE_GENERATOR=Ninja - rapids-print-env rapids-logger "Begin py build" -rapids-conda-retry mambabuild conda/recipes/numba_cuda +rapids-conda-retry mambabuild conda/recipes/numba-cuda -rapids-upload-conda-to-s3 python +package_path=(/tmp/conda-bld-output/noarch/numba-cuda-*.tar.bz2) +echo "package_path=$package_path" >> $GITHUB_ENV diff 
--git a/ci/build_docs.sh b/ci/build_docs.sh new file mode 100755 index 0000000..9742c8b --- /dev/null +++ b/ci/build_docs.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION + +set -euo pipefail + +. /opt/conda/etc/profile.d/conda.sh + +rapids-logger "Install docs dependencies" +# TODO: Replace with rapids-dependency-file-generator +rapids-mamba-retry create -n docs \ + make \ + psutil \ + sphinx \ + sphinx_rtd_theme \ + numpydoc \ + python=${RAPIDS_PY_VERSION} + +# Temporarily allow unbound variables for conda activation. +set +u +conda activate docs +set -u + +rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda + +rapids-print-env + +rapids-logger "Show Numba system info" +python -m numba --sysinfo + +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + +rapids-logger "Build docs" +pushd docs +make html + +popd + +rapids-logger "Test script exiting with value: $EXITCODE" +exit ${EXITCODE} diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 688431e..34930f1 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -7,7 +7,6 @@ rapids-logger "Build wheel" mkdir -p ./dist python -m pip wheel . --wheel-dir=./dist -vvv --disable-pip-version-check --no-deps -python -m auditwheel repair -w ./final_dist ./dist/* - -rapids-logger "Upload Wheel" -RAPIDS_PY_WHEEL_NAME="numba_cuda" rapids-upload-wheels-to-s3 ./final_dist +package_path=$(realpath ./dist/numba_cuda-*.whl) +echo "Package path: $package_path" +echo "package_path=$package_path" >> $GITHUB_ENV diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 702cbcd..5861128 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -10,11 +10,10 @@ rapids-logger "Install testing dependencies" rapids-mamba-retry create -n test \ c-compiler \ cxx-compiler \ - cuda-nvcc \ + cuda-nvcc-impl \ cuda-nvrtc \ cuda-python \ cuda-version=${RAPIDS_CUDA_VERSION%.*} \ - "numba>=0.59.1" \ make \ psutil \ pytest \ @@ -25,17 +24,14 @@ set +u conda activate test set -u -PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) +rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda + RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" pushd "${RAPIDS_TESTS_DIR}" rapids-print-env -rapids-mamba-retry install \ - --channel "${PYTHON_CHANNEL}" \ - numba_cuda - rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 149455a..5ffd014 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -6,16 +6,14 @@ set -euo pipefail rapids-logger "Install testing dependencies" # TODO: Replace with rapids-dependency-file-generator python -m pip install \ - "numba>=0.59.1" \ psutil \ cuda-python \ pytest -rapids-logger "Download Wheel" -RAPIDS_PY_WHEEL_NAME="numba_cuda" rapids-download-wheels-from-s3 ./dist/ - rapids-logger "Install wheel" -python -m pip install $(echo ./dist/numba_cuda*.whl) +package=$(realpath wheel/numba_cuda*.whl) +echo "Package path: $package" +python -m pip install $package rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/upload_conda.sh b/ci/upload_conda.sh new file mode 100755 index 0000000..1ac266e --- /dev/null +++ b/ci/upload_conda.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +PKGS_TO_UPLOAD=$(rapids-find-anaconda-uploads.py $RAPIDS_CONDA_BLD_OUTPUT_DIR) + +rapids-retry anaconda \ + -t $CONDA_TOKEN \ + upload \ + --skip-existing \ + --no-progress \ + ${PKGS_TO_UPLOAD} diff --git a/conda/recipes/numba_cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml similarity index 84% rename from conda/recipes/numba_cuda/meta.yaml rename to conda/recipes/numba-cuda/meta.yaml index d510fae..a74ae34 100644 --- a/conda/recipes/numba_cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -9,26 +9,22 @@ {% set project_urls = project_data["urls"] %} package: - name: numba_cuda + name: numba-cuda version: {{ version }} source: path: ../../.. build: + noarch: python script: - {{ PYTHON }} -m pip install . -vv requirements: - build: - - {{ compiler('c') }} - - cmake >=3.26.4 - - ninja - - sysroot_{{ target_platform }} 2.17 host: - python - pip - - scikit-build-core + - setuptools run: - python - numba >=0.59.1 diff --git a/numba_cuda/VERSION b/numba_cuda/VERSION index 8acdd82..43b2961 100644 --- a/numba_cuda/VERSION +++ b/numba_cuda/VERSION @@ -1 +1 @@ -0.0.1 +0.0.13 diff --git a/numba_cuda/numba/cuda/api.py b/numba_cuda/numba/cuda/api.py index 4384749..5dfe7c4 100644 --- a/numba_cuda/numba/cuda/api.py +++ b/numba_cuda/numba/cuda/api.py @@ -239,7 +239,8 @@ def open_ipc_array(handle, shape, dtype, strides=None, offset=0): driver_handle = driver.binding.CUipcMemHandle() driver_handle.reserved = handle else: - driver_handle = driver.drvapi.cu_ipc_mem_handle(*handle) + driver_handle = driver.drvapi.cu_ipc_mem_handle() + driver_handle.reserved[:] = handle # use *IpcHandle* to open the IPC memory ipchandle = driver.IpcHandle(None, driver_handle, size, offset=offset) yield ipchandle.open_array(current_context(), shape=shape, diff --git a/numba_cuda/numba/cuda/cudadrv/_extras.c b/numba_cuda/numba/cuda/cudadrv/_extras.c deleted file mode 100644 index 8652083..0000000 --- a/numba_cuda/numba/cuda/cudadrv/_extras.c +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Helper binding to call some CUDA Runtime API that cannot be directly - * encoded using ctypes. 
- */ - -#include "Python.h" - -#define CUDA_IPC_HANDLE_SIZE 64 - -typedef int CUresult; -typedef void* CUdeviceptr; - -typedef struct CUipcMemHandle_st{ - char reserved[CUDA_IPC_HANDLE_SIZE]; -} CUipcMemHandle; - -typedef CUresult (*cuIpcOpenMemHandle_t)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int flags ); - -static -cuIpcOpenMemHandle_t cuIpcOpenMemHandle = 0; - -static -void set_cuIpcOpenMemHandle(void* fnptr) -{ - cuIpcOpenMemHandle = (cuIpcOpenMemHandle_t)fnptr; -} - -static -CUresult call_cuIpcOpenMemHandle(CUdeviceptr* pdptr, CUipcMemHandle* handle, unsigned int flags) -{ - return cuIpcOpenMemHandle(pdptr, *handle, flags); -} - - -PyMODINIT_FUNC PyInit__extras(void) { - static struct PyModuleDef moduledef = { - PyModuleDef_HEAD_INIT, "_extras", "No docs", -1, NULL, NULL, NULL, NULL, NULL - }; - - PyObject *m = PyModule_Create(&moduledef); - - if (m == NULL) - return NULL; - - PyModule_AddObject(m, "set_cuIpcOpenMemHandle", PyLong_FromVoidPtr(&set_cuIpcOpenMemHandle)); - PyModule_AddObject(m, "call_cuIpcOpenMemHandle", PyLong_FromVoidPtr(&call_cuIpcOpenMemHandle)); - PyModule_AddIntConstant(m, "CUDA_IPC_HANDLE_SIZE", CUDA_IPC_HANDLE_SIZE); - return m; -} diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index 5d9c4af..90f4070 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -14,12 +14,11 @@ import numba from numba import _devicearray -from numba.cuda.cudadrv import devices +from numba.cuda.cudadrv import devices, dummyarray from numba.cuda.cudadrv import driver as _driver from numba.core import types, config from numba.np.unsafe.ndarray import to_fixed_tuple from numba.np.numpy_support import numpy_version -from numba.misc import dummyarray from numba.np import numpy_support from numba.cuda.api_util import prepare_shape_strides_dtype from numba.core.errors import NumbaPerformanceWarning diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 5194bba..d9d1ded 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -36,7 +36,7 @@ from .drvapi import API_PROTOTYPES from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid from .mappings import FILE_EXTENSION_MAP -from numba.cuda.cudadrv import enums, drvapi, nvrtc, _extras +from numba.cuda.cudadrv import enums, drvapi, nvrtc USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING @@ -259,29 +259,6 @@ def ensure_initialized(self): else: self.pid = _getpid() - self._initialize_extras() - - def _initialize_extras(self): - if USE_NV_BINDING: - # The extras are only needed when using Numba's ctypes bindings - return - - # set pointer to original cuIpcOpenMemHandle - set_proto = ctypes.CFUNCTYPE(None, c_void_p) - set_cuIpcOpenMemHandle = set_proto(_extras.set_cuIpcOpenMemHandle) - set_cuIpcOpenMemHandle(self._find_api('cuIpcOpenMemHandle')) - # bind caller to cuIpcOpenMemHandle that fixes the ABI - call_proto = ctypes.CFUNCTYPE(c_int, - ctypes.POINTER(drvapi.cu_device_ptr), - ctypes.POINTER(drvapi.cu_ipc_mem_handle), - ctypes.c_uint) - call_cuIpcOpenMemHandle = call_proto(_extras.call_cuIpcOpenMemHandle) - call_cuIpcOpenMemHandle.__name__ = 'call_cuIpcOpenMemHandle' - safe_call = self._ctypes_wrap_fn('call_cuIpcOpenMemHandle', - call_cuIpcOpenMemHandle) - # override cuIpcOpenMemHandle - self.cuIpcOpenMemHandle = safe_call - @property def is_available(self): self.ensure_initialized() @@ -1878,7 +1855,7 @@ def 
__reduce__(self): if USE_NV_BINDING: preprocessed_handle = self.handle.reserved else: - preprocessed_handle = tuple(self.handle) + preprocessed_handle = tuple(self.handle.reserved) args = ( self.__class__, preprocessed_handle, @@ -1892,9 +1869,9 @@ def __reduce__(self): def _rebuild(cls, handle_ary, size, source_info, offset): if USE_NV_BINDING: handle = binding.CUipcMemHandle() - handle.reserved = handle_ary else: - handle = drvapi.cu_ipc_mem_handle(*handle_ary) + handle = drvapi.cu_ipc_mem_handle() + handle.reserved = handle_ary return cls(base=None, handle=handle, size=size, source_info=source_info, offset=offset) diff --git a/numba_cuda/numba/cuda/cudadrv/drvapi.py b/numba_cuda/numba/cuda/cudadrv/drvapi.py index cbbd792..7f6dfbb 100644 --- a/numba_cuda/numba/cuda/cudadrv/drvapi.py +++ b/numba_cuda/numba/cuda/cudadrv/drvapi.py @@ -1,7 +1,6 @@ from ctypes import (c_byte, c_char_p, c_float, c_int, c_size_t, c_uint, - c_uint8, c_void_p, py_object, CFUNCTYPE, POINTER) - -from numba.cuda.cudadrv import _extras + c_uint8, c_void_p, py_object, CFUNCTYPE, POINTER, + Structure) cu_device = c_int cu_device_attribute = c_int # enum @@ -15,13 +14,18 @@ cu_event = c_void_p cu_link_state = c_void_p cu_function_attribute = c_int -cu_ipc_mem_handle = (c_byte * _extras.CUDA_IPC_HANDLE_SIZE) # 64 bytes wide cu_uuid = (c_byte * 16) # Device UUID cu_stream_callback_pyobj = CFUNCTYPE(None, cu_stream, c_int, py_object) cu_occupancy_b2d_size = CFUNCTYPE(c_size_t, c_int) + +# Mirrors the definition of CUipcMemHandle in cuda.h +class cu_ipc_mem_handle(Structure): + _fields_ = [("reserved", c_uint8 * 64)] + + # See https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html CU_STREAM_DEFAULT = 0 CU_STREAM_LEGACY = 1 diff --git a/numba_cuda/numba/cuda/cudadrv/dummyarray.py b/numba_cuda/numba/cuda/cudadrv/dummyarray.py new file mode 100644 index 0000000..38e1b89 --- /dev/null +++ b/numba_cuda/numba/cuda/cudadrv/dummyarray.py @@ -0,0 +1,452 @@ +from collections import namedtuple +import itertools +import functools +import operator +import ctypes + +import numpy as np + +from numba import _helperlib + +Extent = namedtuple("Extent", ["begin", "end"]) + +attempt_nocopy_reshape = ctypes.CFUNCTYPE( + ctypes.c_int, + ctypes.c_long, # nd + np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # dims + np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # strides + ctypes.c_long, # newnd + np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # newdims + np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # newstrides + ctypes.c_long, # itemsize + ctypes.c_int, # is_f_order +)(_helperlib.c_helpers['attempt_nocopy_reshape']) + + +class Dim(object): + """A single dimension of the array + + Attributes + ---------- + start: + start offset + stop: + stop offset + size: + number of items + stride: + item stride + """ + __slots__ = 'start', 'stop', 'size', 'stride', 'single' + + def __init__(self, start, stop, size, stride, single): + self.start = start + self.stop = stop + self.size = size + self.stride = stride + self.single = single + assert not single or size == 1 + + def __getitem__(self, item): + if isinstance(item, slice): + start, stop, step = item.indices(self.size) + stride = step * self.stride + start = self.start + start * abs(self.stride) + stop = self.start + stop * abs(self.stride) + if stride == 0: + size = 1 + else: + size = _compute_size(start, stop, stride) + ret = Dim( + start=start, + stop=stop, + size=size, + stride=stride, + single=False + ) + return ret + else: + sliced = self[item:item + 1] if 
item != -1 else self[-1:] + if sliced.size != 1: + raise IndexError + return Dim( + start=sliced.start, + stop=sliced.stop, + size=sliced.size, + stride=sliced.stride, + single=True, + ) + + def get_offset(self, idx): + return self.start + idx * self.stride + + def __repr__(self): + strfmt = "Dim(start=%s, stop=%s, size=%s, stride=%s)" + return strfmt % (self.start, self.stop, self.size, self.stride) + + def normalize(self, base): + return Dim(start=self.start - base, stop=self.stop - base, + size=self.size, stride=self.stride, single=self.single) + + def copy(self, start=None, stop=None, size=None, stride=None, single=None): + if start is None: + start = self.start + if stop is None: + stop = self.stop + if size is None: + size = self.size + if stride is None: + stride = self.stride + if single is None: + single = self.single + return Dim(start, stop, size, stride, single) + + def is_contiguous(self, itemsize): + return self.stride == itemsize + + +def compute_index(indices, dims): + return sum(d.get_offset(i) for i, d in zip(indices, dims)) + + +class Element(object): + is_array = False + + def __init__(self, extent): + self.extent = extent + + def iter_contiguous_extent(self): + yield self.extent + + +class Array(object): + """A dummy numpy array-like object. Consider it an array without the + actual data, but offset from the base data pointer. + + Attributes + ---------- + dims: tuple of Dim + describing each dimension of the array + + ndim: int + number of dimension + + shape: tuple of int + size of each dimension + + strides: tuple of int + stride of each dimension + + itemsize: int + itemsize + + extent: (start, end) + start and end offset containing the memory region + """ + is_array = True + + @classmethod + def from_desc(cls, offset, shape, strides, itemsize): + dims = [] + for ashape, astride in zip(shape, strides): + dim = Dim(offset, offset + ashape * astride, ashape, astride, + single=False) + dims.append(dim) + offset = 0 # offset only applies to first dimension + return cls(dims, itemsize) + + def __init__(self, dims, itemsize): + self.dims = tuple(dims) + self.ndim = len(self.dims) + self.shape = tuple(dim.size for dim in self.dims) + self.strides = tuple(dim.stride for dim in self.dims) + self.itemsize = itemsize + self.size = functools.reduce(operator.mul, self.shape, 1) + self.extent = self._compute_extent() + self.flags = self._compute_layout() + + def _compute_layout(self): + # The logic here is based on that in _UpdateContiguousFlags from + # numpy/core/src/multiarray/flagsobject.c in NumPy v1.19.1 (commit + # 13661ac70). 
+ # https://github.com/numpy/numpy/blob/maintenance/1.19.x/numpy/core/src/multiarray/flagsobject.c#L123-L191 + + # Records have no dims, and we can treat them as contiguous + if not self.dims: + return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True} + + # If this is a broadcast array then it is not contiguous + if any([dim.stride == 0 for dim in self.dims]): + return {'C_CONTIGUOUS': False, 'F_CONTIGUOUS': False} + + flags = {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True} + + # Check C contiguity + sd = self.itemsize + for dim in reversed(self.dims): + if dim.size == 0: + # Contiguous by definition + return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True} + if dim.size != 1: + if dim.stride != sd: + flags['C_CONTIGUOUS'] = False + sd *= dim.size + + # Check F contiguity + sd = self.itemsize + for dim in self.dims: + if dim.size != 1: + if dim.stride != sd: + flags['F_CONTIGUOUS'] = False + return flags + sd *= dim.size + + return flags + + def _compute_extent(self): + firstidx = [0] * self.ndim + lastidx = [s - 1 for s in self.shape] + start = compute_index(firstidx, self.dims) + stop = compute_index(lastidx, self.dims) + self.itemsize + stop = max(stop, start) # ensure positive extent + return Extent(start, stop) + + def __repr__(self): + return '' % (self.dims, self.itemsize) + + def __getitem__(self, item): + if not isinstance(item, tuple): + item = [item] + else: + item = list(item) + + nitem = len(item) + ndim = len(self.dims) + if nitem > ndim: + raise IndexError("%d extra indices given" % (nitem - ndim,)) + + # Add empty slices for missing indices + while len(item) < ndim: + item.append(slice(None, None)) + + dims = [dim.__getitem__(it) for dim, it in zip(self.dims, item)] + newshape = [d.size for d in dims if not d.single] + + arr = Array(dims, self.itemsize) + if newshape: + return arr.reshape(*newshape)[0] + else: + return Element(arr.extent) + + @property + def is_c_contig(self): + return self.flags['C_CONTIGUOUS'] + + @property + def is_f_contig(self): + return self.flags['F_CONTIGUOUS'] + + def iter_contiguous_extent(self): + """ Generates extents + """ + if self.is_c_contig or self.is_f_contig: + yield self.extent + else: + if self.dims[0].stride < self.dims[-1].stride: + innerdim = self.dims[0] + outerdims = self.dims[1:] + outershape = self.shape[1:] + else: + innerdim = self.dims[-1] + outerdims = self.dims[:-1] + outershape = self.shape[:-1] + + if innerdim.is_contiguous(self.itemsize): + oslen = [range(s) for s in outershape] + for indices in itertools.product(*oslen): + base = compute_index(indices, outerdims) + yield base + innerdim.start, base + innerdim.stop + else: + oslen = [range(s) for s in self.shape] + for indices in itertools.product(*oslen): + offset = compute_index(indices, self.dims) + yield offset, offset + self.itemsize + + def reshape(self, *newdims, **kws): + oldnd = self.ndim + newnd = len(newdims) + + if newdims == self.shape: + return self, None + + order = kws.pop('order', 'C') + if kws: + raise TypeError('unknown keyword arguments %s' % kws.keys()) + if order not in 'CFA': + raise ValueError('order not C|F|A') + + # check for exactly one instance of -1 in newdims + # https://github.com/numpy/numpy/blob/623bc1fae1d47df24e7f1e29321d0c0ba2771ce0/numpy/core/src/multiarray/shape.c#L470-L515 # noqa: E501 + unknownidx = -1 + knownsize = 1 + for i, dim in enumerate(newdims): + if dim < 0: + if unknownidx == -1: + unknownidx = i + else: + raise ValueError("can only specify one unknown dimension") + else: + knownsize *= dim + + # compute the missing 
dimension + if unknownidx >= 0: + if knownsize == 0 or self.size % knownsize != 0: + raise ValueError("cannot infer valid shape " + "for unknown dimension") + else: + newdims = newdims[0:unknownidx] \ + + (self.size // knownsize,) \ + + newdims[unknownidx + 1:] + + newsize = functools.reduce(operator.mul, newdims, 1) + + if order == 'A': + order = 'F' if self.is_f_contig else 'C' + + if newsize != self.size: + raise ValueError("reshape changes the size of the array") + + if self.is_c_contig or self.is_f_contig: + if order == 'C': + newstrides = list(iter_strides_c_contig(self, newdims)) + elif order == 'F': + newstrides = list(iter_strides_f_contig(self, newdims)) + else: + raise AssertionError("unreachable") + else: + newstrides = np.empty(newnd, np.ctypeslib.c_intp) + + # need to keep these around in variables, not temporaries, so they + # don't get GC'ed before we call into the C code + olddims = np.array(self.shape, dtype=np.ctypeslib.c_intp) + oldstrides = np.array(self.strides, dtype=np.ctypeslib.c_intp) + newdims = np.array(newdims, dtype=np.ctypeslib.c_intp) + + if not attempt_nocopy_reshape( + oldnd, + olddims, + oldstrides, + newnd, + newdims, + newstrides, + self.itemsize, + order == 'F', + ): + raise NotImplementedError('reshape would require copy') + + ret = self.from_desc(self.extent.begin, shape=newdims, + strides=newstrides, itemsize=self.itemsize) + + return ret, list(self.iter_contiguous_extent()) + + def squeeze(self, axis=None): + newshape, newstrides = [], [] + if axis is None: + for length, stride in zip(self.shape, self.strides): + if length != 1: + newshape.append(length) + newstrides.append(stride) + else: + if not isinstance(axis, tuple): + axis = (axis,) + for ax in axis: + if self.shape[ax] != 1: + raise ValueError( + "cannot select an axis to squeeze out which has size " + "not equal to one" + ) + for i, (length, stride) in enumerate(zip(self.shape, self.strides)): + if i not in axis: + newshape.append(length) + newstrides.append(stride) + newarr = self.from_desc( + self.extent.begin, + shape=newshape, + strides=newstrides, + itemsize=self.itemsize, + ) + return newarr, list(self.iter_contiguous_extent()) + + def ravel(self, order='C'): + if order not in 'CFA': + raise ValueError('order not C|F|A') + + if (order in 'CA' and self.is_c_contig + or order in 'FA' and self.is_f_contig): + newshape = (self.size,) + newstrides = (self.itemsize,) + arr = self.from_desc(self.extent.begin, newshape, newstrides, + self.itemsize) + return arr, list(self.iter_contiguous_extent()) + + else: + raise NotImplementedError("ravel on non-contiguous array") + + +def iter_strides_f_contig(arr, shape=None): + """yields the f-contiguous strides + """ + shape = arr.shape if shape is None else shape + itemsize = arr.itemsize + yield itemsize + sum = 1 + for s in shape[:-1]: + sum *= s + yield sum * itemsize + + +def iter_strides_c_contig(arr, shape=None): + """yields the c-contiguous strides + """ + shape = arr.shape if shape is None else shape + itemsize = arr.itemsize + + def gen(): + yield itemsize + sum = 1 + for s in reversed(shape[1:]): + sum *= s + yield sum * itemsize + + for i in reversed(list(gen())): + yield i + + +def is_element_indexing(item, ndim): + if isinstance(item, slice): + return False + + elif isinstance(item, tuple): + if len(item) == ndim: + if not any(isinstance(it, slice) for it in item): + return True + + else: + return True + + return False + + +def _compute_size(start, stop, step): + """Algorithm adapted from cpython rangeobject.c + """ + if step > 0: + lo = 
start + hi = stop + else: + lo = stop + hi = start + step = -step + if lo >= hi: + return 0 + return (hi - lo - 1) // step + 1 diff --git a/numba_cuda/numba/cuda/deviceufunc.py b/numba_cuda/numba/cuda/deviceufunc.py new file mode 100644 index 0000000..c29335a --- /dev/null +++ b/numba_cuda/numba/cuda/deviceufunc.py @@ -0,0 +1,908 @@ +""" +Implements custom ufunc dispatch mechanism for non-CPU devices. +""" + +from abc import ABCMeta, abstractmethod +from collections import OrderedDict +import operator +import warnings +from functools import reduce + +import numpy as np + +from numba.np.ufunc.ufuncbuilder import _BaseUFuncBuilder, parse_identity +from numba.core import types, sigutils +from numba.core.typing import signature +from numba.np.ufunc.sigparse import parse_signature + + +def _broadcast_axis(a, b): + """ + Raises + ------ + ValueError if broadcast fails + """ + if a == b: + return a + elif a == 1: + return b + elif b == 1: + return a + else: + raise ValueError("failed to broadcast {0} and {1}".format(a, b)) + + +def _pairwise_broadcast(shape1, shape2): + """ + Raises + ------ + ValueError if broadcast fails + """ + shape1, shape2 = map(tuple, [shape1, shape2]) + + while len(shape1) < len(shape2): + shape1 = (1,) + shape1 + + while len(shape1) > len(shape2): + shape2 = (1,) + shape2 + + return tuple(_broadcast_axis(a, b) for a, b in zip(shape1, shape2)) + + +def _multi_broadcast(*shapelist): + """ + Raises + ------ + ValueError if broadcast fails + """ + assert shapelist + + result = shapelist[0] + others = shapelist[1:] + try: + for i, each in enumerate(others, start=1): + result = _pairwise_broadcast(result, each) + except ValueError: + raise ValueError("failed to broadcast argument #{0}".format(i)) + else: + return result + + +class UFuncMechanism(object): + """ + Prepare ufunc arguments for vectorize. + """ + DEFAULT_STREAM = None + SUPPORT_DEVICE_SLICING = False + + def __init__(self, typemap, args): + """Never used directly by user. Invoke by UFuncMechanism.call(). + """ + self.typemap = typemap + self.args = args + nargs = len(self.args) + self.argtypes = [None] * nargs + self.scalarpos = [] + self.signature = None + self.arrays = [None] * nargs + + def _fill_arrays(self): + """ + Get all arguments in array form + """ + for i, arg in enumerate(self.args): + if self.is_device_array(arg): + self.arrays[i] = self.as_device_array(arg) + elif isinstance(arg, (int, float, complex, np.number)): + # Is scalar + self.scalarpos.append(i) + else: + self.arrays[i] = np.asarray(arg) + + def _fill_argtypes(self): + """ + Get dtypes + """ + for i, ary in enumerate(self.arrays): + if ary is not None: + dtype = getattr(ary, 'dtype') + if dtype is None: + dtype = np.asarray(ary).dtype + self.argtypes[i] = dtype + + def _resolve_signature(self): + """Resolve signature. + May have ambiguous case. 
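        An exact match is tried first, with untyped scalar arguments taking
        their NumPy dtype; failing that, scalar slots are allowed to match any
        formal type. No surviving signature raises a TypeError, and more than
        one surviving signature is reported as ambiguous.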
+ """ + matches = [] + # Resolve scalar args exact match first + if self.scalarpos: + # Try resolve scalar arguments + for formaltys in self.typemap: + match_map = [] + for i, (formal, actual) in enumerate(zip(formaltys, + self.argtypes)): + if actual is None: + actual = np.asarray(self.args[i]).dtype + + match_map.append(actual == formal) + + if all(match_map): + matches.append(formaltys) + + # No matching with exact match; try coercing the scalar arguments + if not matches: + matches = [] + for formaltys in self.typemap: + all_matches = all(actual is None or formal == actual + for formal, actual in + zip(formaltys, self.argtypes)) + if all_matches: + matches.append(formaltys) + + if not matches: + raise TypeError("No matching version. GPU ufunc requires array " + "arguments to have the exact types. This behaves " + "like regular ufunc with casting='no'.") + + if len(matches) > 1: + raise TypeError("Failed to resolve ufunc due to ambiguous " + "signature. Too many untyped scalars. " + "Use numpy dtype object to type tag.") + + # Try scalar arguments + self.argtypes = matches[0] + + def _get_actual_args(self): + """Return the actual arguments + Casts scalar arguments to np.array. + """ + for i in self.scalarpos: + self.arrays[i] = np.array([self.args[i]], dtype=self.argtypes[i]) + + return self.arrays + + def _broadcast(self, arys): + """Perform numpy ufunc broadcasting + """ + shapelist = [a.shape for a in arys] + shape = _multi_broadcast(*shapelist) + + for i, ary in enumerate(arys): + if ary.shape == shape: + pass + + else: + if self.is_device_array(ary): + arys[i] = self.broadcast_device(ary, shape) + + else: + ax_differs = [ax for ax in range(len(shape)) + if ax >= ary.ndim + or ary.shape[ax] != shape[ax]] + + missingdim = len(shape) - len(ary.shape) + strides = [0] * missingdim + list(ary.strides) + + for ax in ax_differs: + strides[ax] = 0 + + strided = np.lib.stride_tricks.as_strided(ary, + shape=shape, + strides=strides) + + arys[i] = self.force_array_layout(strided) + + return arys + + def get_arguments(self): + """Prepare and return the arguments for the ufunc. + Does not call to_device(). + """ + self._fill_arrays() + self._fill_argtypes() + self._resolve_signature() + arys = self._get_actual_args() + return self._broadcast(arys) + + def get_function(self): + """Returns (result_dtype, function) + """ + return self.typemap[self.argtypes] + + def is_device_array(self, obj): + """Is the `obj` a device array? + Override in subclass + """ + return False + + def as_device_array(self, obj): + """Convert the `obj` to a device array + Override in subclass + + Default implementation is an identity function + """ + return obj + + def broadcast_device(self, ary, shape): + """Handles ondevice broadcasting + + Override in subclass to add support. + """ + raise NotImplementedError("broadcasting on device is not supported") + + def force_array_layout(self, ary): + """Ensures array layout met device requirement. + + Override in sublcass + """ + return ary + + @classmethod + def call(cls, typemap, args, kws): + """Perform the entire ufunc call mechanism. 
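        Only the 'stream' and 'out' keyword arguments are recognized; any
        other keyword triggers a warning. Inputs are matched against the
        typemap, broadcast to a common shape, transferred to the device as
        needed, and the kernel is launched over the flattened arguments;
        whether the result comes back as a host or device array depends on
        where the inputs and any supplied 'out' reside.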
+ """ + # Handle keywords + stream = kws.pop('stream', cls.DEFAULT_STREAM) + out = kws.pop('out', None) + + if kws: + warnings.warn("unrecognized keywords: %s" % ', '.join(kws)) + + # Begin call resolution + cr = cls(typemap, args) + args = cr.get_arguments() + resty, func = cr.get_function() + + outshape = args[0].shape + + # Adjust output value + if out is not None and cr.is_device_array(out): + out = cr.as_device_array(out) + + def attempt_ravel(a): + if cr.SUPPORT_DEVICE_SLICING: + raise NotImplementedError + + try: + # Call the `.ravel()` method + return a.ravel() + except NotImplementedError: + # If it is not a device array + if not cr.is_device_array(a): + raise + # For device array, retry ravel on the host by first + # copying it back. + else: + hostary = cr.to_host(a, stream).ravel() + return cr.to_device(hostary, stream) + + if args[0].ndim > 1: + args = [attempt_ravel(a) for a in args] + + # Prepare argument on the device + devarys = [] + any_device = False + for a in args: + if cr.is_device_array(a): + devarys.append(a) + any_device = True + else: + dev_a = cr.to_device(a, stream=stream) + devarys.append(dev_a) + + # Launch + shape = args[0].shape + if out is None: + # No output is provided + devout = cr.allocate_device_array(shape, resty, stream=stream) + + devarys.extend([devout]) + cr.launch(func, shape[0], stream, devarys) + + if any_device: + # If any of the arguments are on device, + # Keep output on the device + return devout.reshape(outshape) + else: + # Otherwise, transfer output back to host + return devout.copy_to_host().reshape(outshape) + + elif cr.is_device_array(out): + # If output is provided and it is a device array, + # Return device array + if out.ndim > 1: + out = attempt_ravel(out) + devout = out + devarys.extend([devout]) + cr.launch(func, shape[0], stream, devarys) + return devout.reshape(outshape) + + else: + # If output is provided and it is a host array, + # Return host array + assert out.shape == shape + assert out.dtype == resty + devout = cr.allocate_device_array(shape, resty, stream=stream) + devarys.extend([devout]) + cr.launch(func, shape[0], stream, devarys) + return devout.copy_to_host(out, stream=stream).reshape(outshape) + + def to_device(self, hostary, stream): + """Implement to device transfer + Override in subclass + """ + raise NotImplementedError + + def to_host(self, devary, stream): + """Implement to host transfer + Override in subclass + """ + raise NotImplementedError + + def allocate_device_array(self, shape, dtype, stream): + """Implements device allocation + Override in subclass + """ + raise NotImplementedError + + def launch(self, func, count, stream, args): + """Implements device function invocation + Override in subclass + """ + raise NotImplementedError + + +def to_dtype(ty): + if isinstance(ty, types.EnumMember): + ty = ty.dtype + return np.dtype(str(ty)) + + +class DeviceVectorize(_BaseUFuncBuilder): + def __init__(self, func, identity=None, cache=False, targetoptions={}): + if cache: + raise TypeError("caching is not supported") + for opt in targetoptions: + if opt == 'nopython': + warnings.warn("nopython kwarg for cuda target is redundant", + RuntimeWarning) + else: + fmt = "Unrecognized options. 
" + fmt += "cuda vectorize target does not support option: '%s'" + raise KeyError(fmt % opt) + self.py_func = func + self.identity = parse_identity(identity) + # { arg_dtype: (return_dtype), cudakernel } + self.kernelmap = OrderedDict() + + @property + def pyfunc(self): + return self.py_func + + def add(self, sig=None): + # compile core as device function + args, return_type = sigutils.normalize_signature(sig) + devfnsig = signature(return_type, *args) + + funcname = self.pyfunc.__name__ + kernelsource = self._get_kernel_source(self._kernel_template, + devfnsig, funcname) + corefn, return_type = self._compile_core(devfnsig) + glbl = self._get_globals(corefn) + sig = signature(types.void, *([a[:] for a in args] + [return_type[:]])) + exec(kernelsource, glbl) + + stager = glbl['__vectorized_%s' % funcname] + kernel = self._compile_kernel(stager, sig) + + argdtypes = tuple(to_dtype(t) for t in devfnsig.args) + resdtype = to_dtype(return_type) + self.kernelmap[tuple(argdtypes)] = resdtype, kernel + + def build_ufunc(self): + raise NotImplementedError + + def _get_kernel_source(self, template, sig, funcname): + args = ['a%d' % i for i in range(len(sig.args))] + fmts = dict(name=funcname, + args=', '.join(args), + argitems=', '.join('%s[__tid__]' % i for i in args)) + return template.format(**fmts) + + def _compile_core(self, sig): + raise NotImplementedError + + def _get_globals(self, corefn): + raise NotImplementedError + + def _compile_kernel(self, fnobj, sig): + raise NotImplementedError + + +class DeviceGUFuncVectorize(_BaseUFuncBuilder): + def __init__(self, func, sig, identity=None, cache=False, targetoptions={}, + writable_args=()): + if cache: + raise TypeError("caching is not supported") + if writable_args: + raise TypeError("writable_args are not supported") + + # Allow nopython flag to be set. + if not targetoptions.pop('nopython', True): + raise TypeError("nopython flag must be True") + # Are there any more target options? 
+ if targetoptions: + opts = ', '.join([repr(k) for k in targetoptions.keys()]) + fmt = "The following target options are not supported: {0}" + raise TypeError(fmt.format(opts)) + + self.py_func = func + self.identity = parse_identity(identity) + self.signature = sig + self.inputsig, self.outputsig = parse_signature(self.signature) + + # Maps from a tuple of input_dtypes to (output_dtypes, kernel) + self.kernelmap = OrderedDict() + + @property + def pyfunc(self): + return self.py_func + + def add(self, sig=None): + indims = [len(x) for x in self.inputsig] + outdims = [len(x) for x in self.outputsig] + args, return_type = sigutils.normalize_signature(sig) + + # It is only valid to specify types.none as a return type, or to not + # specify the return type (where the "Python None" is the return type) + valid_return_type = return_type in (types.none, None) + if not valid_return_type: + raise TypeError('guvectorized functions cannot return values: ' + f'signature {sig} specifies {return_type} return ' + 'type') + + funcname = self.py_func.__name__ + src = expand_gufunc_template(self._kernel_template, indims, + outdims, funcname, args) + + glbls = self._get_globals(sig) + + exec(src, glbls) + fnobj = glbls['__gufunc_{name}'.format(name=funcname)] + + outertys = list(_determine_gufunc_outer_types(args, indims + outdims)) + kernel = self._compile_kernel(fnobj, sig=tuple(outertys)) + + nout = len(outdims) + dtypes = [np.dtype(str(t.dtype)) for t in outertys] + indtypes = tuple(dtypes[:-nout]) + outdtypes = tuple(dtypes[-nout:]) + + self.kernelmap[indtypes] = outdtypes, kernel + + def _compile_kernel(self, fnobj, sig): + raise NotImplementedError + + def _get_globals(self, sig): + raise NotImplementedError + + +def _determine_gufunc_outer_types(argtys, dims): + for at, nd in zip(argtys, dims): + if isinstance(at, types.Array): + yield at.copy(ndim=nd + 1) + else: + if nd > 0: + raise ValueError("gufunc signature mismatch: ndim>0 for scalar") + yield types.Array(dtype=at, ndim=1, layout='A') + + +def expand_gufunc_template(template, indims, outdims, funcname, argtypes): + """Expand gufunc source template + """ + argdims = indims + outdims + argnames = ["arg{0}".format(i) for i in range(len(argdims))] + checkedarg = "min({0})".format(', '.join(["{0}.shape[0]".format(a) + for a in argnames])) + inputs = [_gen_src_for_indexing(aref, adims, atype) + for aref, adims, atype in zip(argnames, indims, argtypes)] + outputs = [_gen_src_for_indexing(aref, adims, atype) + for aref, adims, atype in zip(argnames[len(indims):], outdims, + argtypes[len(indims):])] + argitems = inputs + outputs + src = template.format(name=funcname, args=', '.join(argnames), + checkedarg=checkedarg, + argitems=', '.join(argitems)) + return src + + +def _gen_src_for_indexing(aref, adims, atype): + return "{aref}[{sliced}]".format(aref=aref, + sliced=_gen_src_index(adims, atype)) + + +def _gen_src_index(adims, atype): + if adims > 0: + return ','.join(['__tid__'] + [':'] * adims) + elif isinstance(atype, types.Array) and atype.ndim - 1 == adims: + # Special case for 0-nd in shape-signature but + # 1d array in type signature. + # Slice it so that the result has the same dimension. 
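            # Illustrative examples of the generated indexing (hypothetical
            # signature): for a gufunc declared "(m,n),(n)->(n)" the stager
            # indexes its arguments as "arg0[__tid__,:,:]", "arg1[__tid__,:]"
            # and "arg2[__tid__,:]"; a slot declared "()" in the shape
            # signature but typed as a 1-d array is indexed
            # "arg[__tid__:(__tid__ + 1)]" below, so the core function still
            # sees a one-element array rather than a scalar.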
+ return '__tid__:(__tid__ + 1)' + else: + return '__tid__' + + +class GUFuncEngine(object): + '''Determine how to broadcast and execute a gufunc + base on input shape and signature + ''' + + @classmethod + def from_signature(cls, signature): + return cls(*parse_signature(signature)) + + def __init__(self, inputsig, outputsig): + # signatures + self.sin = inputsig + self.sout = outputsig + # argument count + self.nin = len(self.sin) + self.nout = len(self.sout) + + def schedule(self, ishapes): + if len(ishapes) != self.nin: + raise TypeError('invalid number of input argument') + + # associate symbol values for input signature + symbolmap = {} + outer_shapes = [] + inner_shapes = [] + + for argn, (shape, symbols) in enumerate(zip(ishapes, self.sin)): + argn += 1 # start from 1 for human + inner_ndim = len(symbols) + if len(shape) < inner_ndim: + fmt = "arg #%d: insufficient inner dimension" + raise ValueError(fmt % (argn,)) + if inner_ndim: + inner_shape = shape[-inner_ndim:] + outer_shape = shape[:-inner_ndim] + else: + inner_shape = () + outer_shape = shape + + for axis, (dim, sym) in enumerate(zip(inner_shape, symbols)): + axis += len(outer_shape) + if sym in symbolmap: + if symbolmap[sym] != dim: + fmt = "arg #%d: shape[%d] mismatch argument" + raise ValueError(fmt % (argn, axis)) + symbolmap[sym] = dim + + outer_shapes.append(outer_shape) + inner_shapes.append(inner_shape) + + # solve output shape + oshapes = [] + for outsig in self.sout: + oshape = [] + for sym in outsig: + oshape.append(symbolmap[sym]) + oshapes.append(tuple(oshape)) + + # find the biggest outershape as looping dimension + sizes = [reduce(operator.mul, s, 1) for s in outer_shapes] + largest_i = np.argmax(sizes) + loopdims = outer_shapes[largest_i] + + pinned = [False] * self.nin # same argument for each iteration + for i, d in enumerate(outer_shapes): + if d != loopdims: + if d == (1,) or d == (): + pinned[i] = True + else: + fmt = "arg #%d: outer dimension mismatch" + raise ValueError(fmt % (i + 1,)) + + return GUFuncSchedule(self, inner_shapes, oshapes, loopdims, pinned) + + +class GUFuncSchedule(object): + def __init__(self, parent, ishapes, oshapes, loopdims, pinned): + self.parent = parent + # core shapes + self.ishapes = ishapes + self.oshapes = oshapes + # looping dimension + self.loopdims = loopdims + self.loopn = reduce(operator.mul, loopdims, 1) + # flags + self.pinned = pinned + + self.output_shapes = [loopdims + s for s in oshapes] + + def __str__(self): + import pprint + + attrs = 'ishapes', 'oshapes', 'loopdims', 'loopn', 'pinned' + values = [(k, getattr(self, k)) for k in attrs] + return pprint.pformat(dict(values)) + + +class GeneralizedUFunc(object): + def __init__(self, kernelmap, engine): + self.kernelmap = kernelmap + self.engine = engine + self.max_blocksize = 2 ** 30 + + def __call__(self, *args, **kws): + callsteps = self._call_steps(self.engine.nin, self.engine.nout, + args, kws) + indtypes, schedule, outdtypes, kernel = self._schedule( + callsteps.inputs, callsteps.outputs) + callsteps.adjust_input_types(indtypes) + + outputs = callsteps.prepare_outputs(schedule, outdtypes) + inputs = callsteps.prepare_inputs() + parameters = self._broadcast(schedule, inputs, outputs) + + callsteps.launch_kernel(kernel, schedule.loopn, parameters) + + return callsteps.post_process_outputs(outputs) + + def _schedule(self, inputs, outs): + input_shapes = [a.shape for a in inputs] + schedule = self.engine.schedule(input_shapes) + + # find kernel + indtypes = tuple(i.dtype for i in inputs) + try: + outdtypes, 
kernel = self.kernelmap[indtypes] + except KeyError: + # No exact match, then use the first compatible. + # This does not match the numpy dispatching exactly. + # Later, we may just jit a new version for the missing signature. + indtypes = self._search_matching_signature(indtypes) + # Select kernel + outdtypes, kernel = self.kernelmap[indtypes] + + # check output + for sched_shape, out in zip(schedule.output_shapes, outs): + if out is not None and sched_shape != out.shape: + raise ValueError('output shape mismatch') + + return indtypes, schedule, outdtypes, kernel + + def _search_matching_signature(self, idtypes): + """ + Given the input types in `idtypes`, return a compatible sequence of + types that is defined in `kernelmap`. + + Note: Ordering is guaranteed by `kernelmap` being a OrderedDict + """ + for sig in self.kernelmap.keys(): + if all(np.can_cast(actual, desired) + for actual, desired in zip(sig, idtypes)): + return sig + else: + raise TypeError("no matching signature") + + def _broadcast(self, schedule, params, retvals): + assert schedule.loopn > 0, "zero looping dimension" + + odim = 1 if not schedule.loopdims else schedule.loopn + newparams = [] + for p, cs in zip(params, schedule.ishapes): + if not cs and p.size == 1: + # Broadcast scalar input + devary = self._broadcast_scalar_input(p, odim) + newparams.append(devary) + else: + # Broadcast vector input + newparams.append(self._broadcast_array(p, odim, cs)) + + newretvals = [] + for retval, oshape in zip(retvals, schedule.oshapes): + newretvals.append(retval.reshape(odim, *oshape)) + return tuple(newparams) + tuple(newretvals) + + def _broadcast_array(self, ary, newdim, innerdim): + newshape = (newdim,) + innerdim + # No change in shape + if ary.shape == newshape: + return ary + + # Creating new dimension + elif len(ary.shape) < len(newshape): + assert newshape[-len(ary.shape):] == ary.shape, \ + "cannot add dim and reshape at the same time" + return self._broadcast_add_axis(ary, newshape) + + # Collapsing dimension + else: + return ary.reshape(*newshape) + + def _broadcast_add_axis(self, ary, newshape): + raise NotImplementedError("cannot add new axis") + + def _broadcast_scalar_input(self, ary, shape): + raise NotImplementedError + + +class GUFuncCallSteps(metaclass=ABCMeta): + """ + Implements memory management and kernel launch operations for GUFunc calls. + + One instance of this class is instantiated for each call, and the instance + is specific to the arguments given to the GUFunc call. + + The base class implements the overall logic; subclasses provide + target-specific implementations of individual functions. + """ + + # The base class uses these slots; subclasses may provide additional slots. + __slots__ = [ + 'outputs', + 'inputs', + '_copy_result_to_host', + ] + + @abstractmethod + def launch_kernel(self, kernel, nelem, args): + """Implement the kernel launch""" + + @abstractmethod + def is_device_array(self, obj): + """ + Return True if `obj` is a device array for this target, False + otherwise. + """ + + @abstractmethod + def as_device_array(self, obj): + """ + Return `obj` as a device array on this target. + + May return `obj` directly if it is already on the target. + """ + + @abstractmethod + def to_device(self, hostary): + """ + Copy `hostary` to the device and return the device array. + """ + + @abstractmethod + def allocate_device_array(self, shape, dtype): + """ + Allocate a new uninitialized device array with the given shape and + dtype. 
+ """ + + def __init__(self, nin, nout, args, kwargs): + outputs = kwargs.get('out') + + # Ensure the user has passed a correct number of arguments + if outputs is None and len(args) not in (nin, (nin + nout)): + def pos_argn(n): + return f'{n} positional argument{"s" * (n != 1)}' + + msg = (f'This gufunc accepts {pos_argn(nin)} (when providing ' + f'input only) or {pos_argn(nin + nout)} (when providing ' + f'input and output). Got {pos_argn(len(args))}.') + raise TypeError(msg) + + if outputs is not None and len(args) > nin: + raise ValueError("cannot specify argument 'out' as both positional " + "and keyword") + else: + # If the user did not pass outputs either in the out kwarg or as + # positional arguments, then we need to generate an initial list of + # "placeholder" outputs using None as a sentry value + outputs = [outputs] * nout + + # Ensure all output device arrays are Numba device arrays - for + # example, any output passed in that supports the CUDA Array Interface + # is converted to a Numba CUDA device array; others are left untouched. + all_user_outputs_are_host = True + self.outputs = [] + for output in outputs: + if self.is_device_array(output): + self.outputs.append(self.as_device_array(output)) + all_user_outputs_are_host = False + else: + self.outputs.append(output) + + all_host_arrays = not any([self.is_device_array(a) for a in args]) + + # - If any of the arguments are device arrays, we leave the output on + # the device. + self._copy_result_to_host = (all_host_arrays and + all_user_outputs_are_host) + + # Normalize arguments - ensure they are either device- or host-side + # arrays (as opposed to lists, tuples, etc). + def normalize_arg(a): + if self.is_device_array(a): + convert = self.as_device_array + else: + convert = np.asarray + + return convert(a) + + normalized_args = [normalize_arg(a) for a in args] + self.inputs = normalized_args[:nin] + + # Check if there are extra arguments for outputs. + unused_inputs = normalized_args[nin:] + if unused_inputs: + self.outputs = unused_inputs + + def adjust_input_types(self, indtypes): + """ + Attempt to cast the inputs to the required types if necessary + and if they are not device arrays. + + Side effect: Only affects the elements of `inputs` that require + a type cast. + """ + for i, (ity, val) in enumerate(zip(indtypes, self.inputs)): + if ity != val.dtype: + if not hasattr(val, 'astype'): + msg = ("compatible signature is possible by casting but " + "{0} does not support .astype()").format(type(val)) + raise TypeError(msg) + # Cast types + self.inputs[i] = val.astype(ity) + + def prepare_outputs(self, schedule, outdtypes): + """ + Returns a list of output parameters that all reside on the target + device. + + Outputs that were passed-in to the GUFunc are used if they reside on the + device; other outputs are allocated as necessary. + """ + outputs = [] + for shape, dtype, output in zip(schedule.output_shapes, outdtypes, + self.outputs): + if output is None or self._copy_result_to_host: + output = self.allocate_device_array(shape, dtype) + outputs.append(output) + + return outputs + + def prepare_inputs(self): + """ + Returns a list of input parameters that all reside on the target device. + """ + def ensure_device(parameter): + if self.is_device_array(parameter): + convert = self.as_device_array + else: + convert = self.to_device + + return convert(parameter) + + return [ensure_device(p) for p in self.inputs] + + def post_process_outputs(self, outputs): + """ + Moves the given output(s) to the host if necessary. 
+ + Returns a single value (e.g. an array) if there was one output, or a + tuple of arrays if there were multiple. Although this feels a little + jarring, it is consistent with the behavior of GUFuncs in general. + """ + if self._copy_result_to_host: + outputs = [self.to_host(output, self_output) + for output, self_output in zip(outputs, self.outputs)] + elif self.outputs[0] is not None: + outputs = self.outputs + + if len(outputs) == 1: + return outputs[0] + else: + return tuple(outputs) diff --git a/numba_cuda/numba/cuda/runtime/nrt.cu b/numba_cuda/numba/cuda/runtime/nrt.cu new file mode 100644 index 0000000..6ced244 --- /dev/null +++ b/numba_cuda/numba/cuda/runtime/nrt.cu @@ -0,0 +1,107 @@ +#ifndef _NRT_H +#define _NRT_H + +#include + +typedef __device__ void (*NRT_dtor_function)(void* ptr, size_t size, void* info); +typedef __device__ void (*NRT_dealloc_func)(void* ptr, void* dealloc_info); + +typedef struct MemInfo NRT_MemInfo; + +extern "C" { +struct MemInfo { + cuda::atomic refct; + NRT_dtor_function dtor; + void* dtor_info; + void* data; + size_t size; +}; +} + +// Globally needed variables +struct NRT_MemSys { + struct { + bool enabled; + cuda::atomic alloc; + cuda::atomic free; + cuda::atomic mi_alloc; + cuda::atomic mi_free; + } stats; +}; + +/* The Memory System object */ +__device__ NRT_MemSys* TheMSys; + +extern "C" __device__ void* NRT_Allocate(size_t size) +{ + void* ptr = NULL; + ptr = malloc(size); +// if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; } + return ptr; +} + +extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi, + void* data, + size_t size, + NRT_dtor_function dtor, + void* dtor_info) +// NRT_MemSys* TheMSys) +{ + mi->refct = 1; /* starts with 1 refct */ + mi->dtor = dtor; + mi->dtor_info = dtor_info; + mi->data = data; + mi->size = size; +// if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; } +} + +__device__ NRT_MemInfo* NRT_MemInfo_new( + void* data, size_t size, NRT_dtor_function dtor, void* dtor_info) +{ + NRT_MemInfo* mi = (NRT_MemInfo*)NRT_Allocate(sizeof(NRT_MemInfo)); + if (mi != NULL) { NRT_MemInfo_init(mi, data, size, dtor, dtor_info); } + return mi; +} + +extern "C" __device__ void NRT_Free(void* ptr) +{ + free(ptr); + //if (TheMSys->stats.enabled) { TheMSys->stats.free++; } +} + +extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi) +{ + NRT_Free(mi); +} + +extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi) +{ + NRT_dealloc(mi); + //if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; } +} +extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi) +{ + if (mi->dtor) /* We have a destructor */ + mi->dtor(mi->data, mi->size, NULL); + /* Clear and release MemInfo */ + NRT_MemInfo_destroy(mi); +} + +/* + c++ version of the NRT_decref function that usually is added to + the final kernel link in PTX form by numba. This version may be + used by c++ APIs that accept ownership of live objects and must + manage them going forward. +*/ +extern "C" __device__ void NRT_decref(NRT_MemInfo* mi) +{ + mi->refct--; + if (mi->refct == 0) { NRT_MemInfo_call_dtor(mi); } +} + +#endif + +extern "C" __device__ void NRT_incref(NRT_MemInfo* mi) +{ + mi->refct++; +} diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py b/numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py new file mode 100644 index 0000000..adabaa7 --- /dev/null +++ b/numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py @@ -0,0 +1,4 @@ +# Dummy arrays are not implemented in the simulator. 
This file allows the dummy +# array tests to be imported, but they are skipped on the simulator. + +Array = None diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index 6402ff3..f43b099 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -67,6 +67,7 @@ def resolve_value_type(self, val): class CUDATargetContext(BaseContext): implement_powi_as_math_call = True strict_alignment = True + enable_nrt = True def __init__(self, typingctx, target='cuda'): super().__init__(typingctx, target) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py index 2a240ef..fb8de32 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py @@ -1,4 +1,4 @@ -from numba.np.ufunc.deviceufunc import GUFuncEngine +from numba.cuda.deviceufunc import GUFuncEngine import unittest diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py b/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py index e88bbc9..657e9a1 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py @@ -145,7 +145,8 @@ def check_ipc_handle_serialization(self, index_arg=None, foreign=False): if driver.USE_NV_BINDING: self.assertEqual(ipch_recon.handle.reserved, ipch.handle.reserved) else: - self.assertEqual(tuple(ipch_recon.handle), tuple(ipch.handle)) + self.assertEqual(ipch_recon.handle.reserved[:], + ipch.handle.reserved[:]) # spawn new process for testing ctx = mp.get_context('spawn') @@ -262,7 +263,10 @@ def test_staged(self): if driver.USE_NV_BINDING: self.assertEqual(ipch_recon.handle.reserved, ipch.handle.reserved) else: - self.assertEqual(tuple(ipch_recon.handle), tuple(ipch.handle)) + self.assertEqual( + ipch_recon.handle.reserved[:], + ipch.handle.reserved[:] + ) self.assertEqual(ipch_recon.size, ipch.size) # Test on every CUDA devices diff --git a/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py b/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py new file mode 100644 index 0000000..e4ad7d0 --- /dev/null +++ b/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py @@ -0,0 +1,359 @@ +import unittest +import itertools +import numpy as np +from numba.cuda.cudadrv.dummyarray import Array +from numba.cuda.testing import skip_on_cudasim + + +@skip_on_cudasim("Tests internals of the CUDA driver device array") +class TestSlicing(unittest.TestCase): + + def assertSameContig(self, arr, nparr): + attrs = 'C_CONTIGUOUS', 'F_CONTIGUOUS' + for attr in attrs: + if arr.flags[attr] != nparr.flags[attr]: + if arr.size == 0 and nparr.size == 0: + # numpy <=1.7 bug that some empty array are contiguous and + # some are not + pass + else: + self.fail("contiguous flag mismatch:\ngot=%s\nexpect=%s" % + (arr.flags, nparr.flags)) + + #### 1D + + def test_slice0_1d(self): + nparr = np.empty(4) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + self.assertSameContig(arr, nparr) + xx = -2, -1, 0, 1, 2 + for x in xx: + expect = nparr[x:] + got = arr[x:] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_slice1_1d(self): + nparr = np.empty(4) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, -1, 0, 1, 2 + for x in xx: + expect = nparr[:x] + got = arr[:x] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) 
+ self.assertEqual(got.strides, expect.strides) + + def test_slice2_1d(self): + nparr = np.empty(4) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, -1, 0, 1, 2 + for x, y in itertools.product(xx, xx): + expect = nparr[x:y] + got = arr[x:y] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + #### 2D + + def test_slice0_2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, 0, 1, 2 + for x in xx: + expect = nparr[x:] + got = arr[x:] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + for x, y in itertools.product(xx, xx): + expect = nparr[x:, y:] + got = arr[x:, y:] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_slice1_2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, 0, 2 + for x in xx: + expect = nparr[:x] + got = arr[:x] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + self.assertSameContig(got, expect) + + for x, y in itertools.product(xx, xx): + expect = nparr[:x, :y] + got = arr[:x, :y] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + self.assertSameContig(got, expect) + + def test_slice2_2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, 0, 2 + for s, t, u, v in itertools.product(xx, xx, xx, xx): + expect = nparr[s:t, u:v] + got = arr[s:t, u:v] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + for x, y in itertools.product(xx, xx): + expect = nparr[s:t, u:v] + got = arr[s:t, u:v] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + #### Strided + + def test_strided_1d(self): + nparr = np.empty(4) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, -1, 1, 2 + for x in xx: + expect = nparr[::x] + got = arr[::x] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_strided_2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, -1, 1, 2 + for a, b in itertools.product(xx, xx): + expect = nparr[::a, ::b] + got = arr[::a, ::b] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_strided_3d(self): + nparr = np.empty((4, 5, 6)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + xx = -2, -1, 1, 2 + for a, b, c in itertools.product(xx, xx, xx): + expect = nparr[::a, ::b, ::c] + got = arr[::a, ::b, ::c] + self.assertSameContig(got, expect) + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_issue_2766(self): + z = np.empty((1, 2, 3)) + z = np.transpose(z, axes=(2, 0, 1)) + arr = Array.from_desc(0, z.shape, z.strides, z.itemsize) + self.assertEqual(z.flags['C_CONTIGUOUS'], arr.flags['C_CONTIGUOUS']) + self.assertEqual(z.flags['F_CONTIGUOUS'], arr.flags['F_CONTIGUOUS']) + 
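The slicing tests above all follow one pattern: build a dummy Array from a NumPy array's shape, strides and itemsize, apply the same slice to both objects, and check that the resulting shape and strides agree. A minimal sketch of that pattern, for orientation only and not part of the patch, assuming the Array.from_desc API exercised above:

import numpy as np
from numba.cuda.cudadrv.dummyarray import Array

nparr = np.empty((4, 5))
# from_desc builds an Array from a plain layout description:
# (offset, shape, strides, itemsize).
arr = Array.from_desc(0, nparr.shape, nparr.strides, nparr.dtype.itemsize)

# Slicing the dummy array should track NumPy's metadata exactly.
got, expect = arr[1:, 2:], nparr[1:, 2:]
assert got.shape == expect.shape
assert got.strides == expect.strides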
+ +@skip_on_cudasim("Tests internals of the CUDA driver device array") +class TestReshape(unittest.TestCase): + def test_reshape_2d2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(5, 4) + got = arr.reshape(5, 4)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_2d1d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(5 * 4) + got = arr.reshape(5 * 4)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_3d3d(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(5, 3, 4) + got = arr.reshape(5, 3, 4)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_3d2d(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(3 * 4, 5) + got = arr.reshape(3 * 4, 5)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_3d1d(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(3 * 4 * 5) + got = arr.reshape(3 * 4 * 5)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_infer2d2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(-1, 4) + got = arr.reshape(-1, 4)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_infer2d1d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(-1) + got = arr.reshape(-1)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_infer3d3d(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(5, -1, 4) + got = arr.reshape(5, -1, 4)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_infer3d2d(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(3, -1) + got = arr.reshape(3, -1)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_infer3d1d(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + expect = nparr.reshape(-1) + got = arr.reshape(-1)[0] + self.assertEqual(got.shape, expect.shape) + self.assertEqual(got.strides, expect.strides) + + def test_reshape_infer_two_unknowns(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + + with self.assertRaises(ValueError) as raises: + arr.reshape(-1, -1, 3) + self.assertIn('can only specify one unknown dimension', + str(raises.exception)) + + def test_reshape_infer_invalid_shape(self): + nparr = np.empty((3, 4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + 
nparr.dtype.itemsize) + + with self.assertRaises(ValueError) as raises: + arr.reshape(-1, 7) + + expected_message = 'cannot infer valid shape for unknown dimension' + self.assertIn(expected_message, str(raises.exception)) + + +@skip_on_cudasim("Tests internals of the CUDA driver device array") +class TestSqueeze(unittest.TestCase): + def test_squeeze(self): + nparr = np.empty((1, 2, 1, 4, 1, 3)) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) + + def _assert_equal_shape_strides(arr1, arr2): + self.assertEqual(arr1.shape, arr2.shape) + self.assertEqual(arr1.strides, arr2.strides) + _assert_equal_shape_strides(arr, nparr) + _assert_equal_shape_strides(arr.squeeze()[0], nparr.squeeze()) + for axis in (0, 2, 4, (0, 2), (0, 4), (2, 4), (0, 2, 4)): + _assert_equal_shape_strides( + arr.squeeze(axis=axis)[0], nparr.squeeze(axis=axis) + ) + + def test_squeeze_invalid_axis(self): + nparr = np.empty((1, 2, 1, 4, 1, 3)) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) + with self.assertRaises(ValueError): + arr.squeeze(axis=1) + with self.assertRaises(ValueError): + arr.squeeze(axis=(2, 3)) + + +@skip_on_cudasim("Tests internals of the CUDA driver device array") +class TestExtent(unittest.TestCase): + def test_extent_1d(self): + nparr = np.empty(4) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + s, e = arr.extent + self.assertEqual(e - s, nparr.size * nparr.dtype.itemsize) + + def test_extent_2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + s, e = arr.extent + self.assertEqual(e - s, nparr.size * nparr.dtype.itemsize) + + def test_extent_iter_1d(self): + nparr = np.empty(4) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + [ext] = list(arr.iter_contiguous_extent()) + self.assertEqual(ext, arr.extent) + + def test_extent_iter_2d(self): + nparr = np.empty((4, 5)) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + [ext] = list(arr.iter_contiguous_extent()) + self.assertEqual(ext, arr.extent) + + self.assertEqual(len(list(arr[::2].iter_contiguous_extent())), 2) + + +@skip_on_cudasim("Tests internals of the CUDA driver device array") +class TestIterate(unittest.TestCase): + def test_for_loop(self): + # for #4201 + N = 5 + nparr = np.empty(N) + arr = Array.from_desc(0, nparr.shape, nparr.strides, + nparr.dtype.itemsize) + + x = 0 # just a placeholder + # this loop should not raise AssertionError + for val in arr: + x = val # noqa: F841 + + +if __name__ == '__main__': + unittest.main() diff --git a/numba_cuda/numba/cuda/vectorizers.py b/numba_cuda/numba/cuda/vectorizers.py index 2565988..b4c6bcf 100644 --- a/numba_cuda/numba/cuda/vectorizers.py +++ b/numba_cuda/numba/cuda/vectorizers.py @@ -1,8 +1,8 @@ from numba import cuda from numpy import array as np_array -from numba.np.ufunc import deviceufunc -from numba.np.ufunc.deviceufunc import (UFuncMechanism, GeneralizedUFunc, - GUFuncCallSteps) +from numba.cuda import deviceufunc +from numba.cuda.deviceufunc import (UFuncMechanism, GeneralizedUFunc, + GUFuncCallSteps) class CUDAUFuncDispatcher(object): diff --git a/pyproject.toml b/pyproject.toml index 8bb38e4..4f2291b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,16 +1,14 @@ -[tool.scikit-build] -cmake.minimum-version = "3.26.4" -cmake.verbose = true -ninja.make-fallback = true -build-dir = "build/{wheel_tag}" -wheel.packages = ["numba_cuda"] +# Copyright (c) 2023-2024, 
NVIDIA CORPORATION. [build-system] -requires = ["scikit-build-core"] -build-backend = "scikit_build_core.build" +build-backend = "setuptools.build_meta" +requires = [ + "setuptools", + "wheel", +] [project] -name = "numba_cuda" +name = "numba-cuda" dynamic = ["version"] description = "CUDA target for Numba" readme = { file = "README.md", content-type = "text/markdown" } @@ -20,6 +18,7 @@ authors = [ ] license = { text = "BSD 2-clause" } requires-python = ">=3.9" +dependencies = ["numba>=0.59.1"] [project.urls] Homepage = "https://github.com/rapidsai/numba-cuda" @@ -27,7 +26,14 @@ Documentation = "https://github.com/rapidsai/numba-cuda/blob/main/README.md" Repository = "https://github.com/rapidsai/numba-cuda" License = "https://github.com/rapidsai/numba-cuda/blob/main/LICENSE" -[tool.scikit-build.metadata.version] -provider = "scikit_build_core.metadata.regex" -input = "numba_cuda/VERSION" -regex = "(?P.*)" +[tool.setuptools.dynamic] +version = {attr = "numba_cuda.__version__"} + +[tool.setuptools] +license-files = ["LICENSE"] + +[tool.setuptools.packages.find] +include = ["numba_cuda*"] + +[tool.setuptools.package-data] +"*" = ["*.cu", "*.h", "*.hpp", "*.ptx", "VERSION"] diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..98a1061 --- /dev/null +++ b/setup.py @@ -0,0 +1,70 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import pathlib + +from setuptools import setup +from setuptools.command.build_py import build_py +from setuptools.command.editable_wheel import editable_wheel, _TopLevelFinder + +REDIRECTOR_PTH = "_numba_cuda_redirector.pth" +REDIRECTOR_PY = "_numba_cuda_redirector.py" +SITE_PACKAGES = pathlib.Path("site-packages") + + +# Adapted from https://stackoverflow.com/a/71137790 +class build_py_with_redirector(build_py): # noqa: N801 + """Include the redirector files in the generated wheel.""" + + def copy_redirector_file(self, source, destination="."): + destination = pathlib.Path(self.build_lib) / destination + self.copy_file(str(source), str(destination), preserve_mode=0) + + def run(self): + super().run() + self.copy_redirector_file(SITE_PACKAGES / REDIRECTOR_PTH) + self.copy_redirector_file(SITE_PACKAGES / REDIRECTOR_PY) + + def get_source_files(self): + src = super().get_source_files() + src.extend([ + str(SITE_PACKAGES / REDIRECTOR_PTH), + str(SITE_PACKAGES / REDIRECTOR_PY), + ]) + return src + + def get_output_mapping(self): + mapping = super().get_output_mapping() + build_lib = pathlib.Path(self.build_lib) + mapping[str(build_lib / REDIRECTOR_PTH)] = REDIRECTOR_PTH + mapping[str(build_lib / REDIRECTOR_PY)] = REDIRECTOR_PY + return mapping + + +class TopLevelFinderWithRedirector(_TopLevelFinder): + """Include the redirector files in the editable wheel.""" + + def get_implementation(self): + for item in super().get_implementation(): + yield item + + with open(SITE_PACKAGES / REDIRECTOR_PTH) as f: + yield (REDIRECTOR_PTH, f.read()) + + with open(SITE_PACKAGES / REDIRECTOR_PY) as f: + yield (REDIRECTOR_PY, f.read()) + + +class editable_wheel_with_redirector(editable_wheel): + def _select_strategy(self, name, tag, build_lib): + # The default mode is "lenient" - others are "strict" and "compat". + # "compat" is deprecated. "strict" creates a tree of links to files in + # the repo. It could be implemented, but we only handle the default + # case for now. + if self.mode is not None and self.mode != "lenient": + raise RuntimeError("Only lenient mode is supported for editable " + f"install. 
Current mode is {self.mode}") + + return TopLevelFinderWithRedirector(self.distribution, name) + + +setup(cmdclass={"build_py": build_py_with_redirector, + "editable_wheel": editable_wheel_with_redirector}) diff --git a/site-packages/_numba_cuda_redirector.py b/site-packages/_numba_cuda_redirector.py index 93e020a..3765fd1 100644 --- a/site-packages/_numba_cuda_redirector.py +++ b/site-packages/_numba_cuda_redirector.py @@ -47,8 +47,10 @@ def ensure_initialized(self): return False self.numba_path = numba_search_locations[0] - self.numba_cuda_path = str((pathlib.Path(numba_cuda_search_locations[0]) / - 'numba')) + + location = numba_cuda_search_locations[0] + self.numba_cuda_path = str((pathlib.Path(location) / 'numba')) + self.initialized = True return True From 60d4ca7a2e29708be90028308c2d7a66d5d176c6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Sun, 4 Aug 2024 10:59:30 -0700 Subject: [PATCH 08/52] style and other fixes --- numba_cuda/numba/cuda/cudadrv/driver.py | 36 ++++++++++++++----- .../numba/cuda/cudadrv/linkable_code.py | 1 + 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index d9d1ded..70a8c7a 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -36,6 +36,7 @@ from .drvapi import API_PROTOTYPES from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid from .mappings import FILE_EXTENSION_MAP +from .linkable_code import LinkableCode from numba.cuda.cudadrv import enums, drvapi, nvrtc USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING @@ -56,6 +57,12 @@ _py_decref.argtypes = [ctypes.py_object] _py_incref.argtypes = [ctypes.py_object] +pynvjitlink_import_err = None +try: + from pynvjitlink.api import NvJitLinker, NvJitLinkError +except ImportError as err: + pynvjitlink_import_err = err + def make_logger(): logger = logging.getLogger(__name__) @@ -80,6 +87,7 @@ def make_logger(): logger.addHandler(logging.NullHandler()) return logger + class DeadMemoryError(RuntimeError): pass @@ -2550,14 +2558,24 @@ class Linker(metaclass=ABCMeta): """Abstract base class for linkers""" @classmethod - def new(cls, max_registers=0, lineinfo=False, cc=None, lto=None, additional_flags=None): + def new(cls, + max_registers=0, + lineinfo=False, + cc=None, + lto=None, + additional_flags=None + ): if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: # TODO: circular from . 
import runtime - driver_ver, runtime_ver = driver.get_version(), runtime.get_version() + driver_ver, runtime_ver = ( + driver.get_version(), runtime.get_version() + ) if driver_ver >= (12, 0) and runtime_ver > driver_ver: # runs once - return PyNvJitLinker(max_registers, lineinfo, cc, lto, additional_flags) + return PyNvJitLinker( + max_registers, lineinfo, cc, lto, additional_flags + ) else: return MVCLinker(max_registers, lineinfo, cc) @@ -2916,6 +2934,7 @@ def complete(self): cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char)) return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,))) + class PyNvJitLinker(Linker): def __init__( self, @@ -2925,11 +2944,8 @@ def __init__( lto=False, additional_flags=None, ): - try: - from pynvjitlink.api import NvJitLinker, NvJitLinkError - except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE) from err - + if pynvjitlink_import_err is not None: + raise ImportError(_MVC_ERROR_MESSAGE) if cc is None: raise RuntimeError("PyNvJitLinker requires CC to be specified") if not any(isinstance(cc, t) for t in [list, tuple]): @@ -2997,7 +3013,9 @@ def add_file_guess_ext(self, path_or_code): if path_or_code.kind == "cu": self.add_cu(path_or_code.data, path_or_code.name) else: - self.add_data(path_or_code.data, path_or_code.kind, path_or_code.name) + self.add_data( + path_or_code.data, path_or_code.kind, path_or_code.name + ) def add_file(self, path, kind): try: diff --git a/numba_cuda/numba/cuda/cudadrv/linkable_code.py b/numba_cuda/numba/cuda/cudadrv/linkable_code.py index 5280829..1c1ab4d 100644 --- a/numba_cuda/numba/cuda/cudadrv/linkable_code.py +++ b/numba_cuda/numba/cuda/cudadrv/linkable_code.py @@ -1,5 +1,6 @@ from .mappings import FILE_EXTENSION_MAP + class LinkableCode: """An object that can be passed in the `link` list argument to `@cuda.jit` kernels to supply code to be linked from memory.""" From 56db9c83d46e84a0aa7b36f6e56a94e018d8adb7 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 8 Aug 2024 07:50:10 -0700 Subject: [PATCH 09/52] cleanup --- numba_cuda/numba/cuda/runtime/nrt.cu | 107 --------------------------- numba_cuda/numba/cuda/target.py | 1 - 2 files changed, 108 deletions(-) delete mode 100644 numba_cuda/numba/cuda/runtime/nrt.cu diff --git a/numba_cuda/numba/cuda/runtime/nrt.cu b/numba_cuda/numba/cuda/runtime/nrt.cu deleted file mode 100644 index 6ced244..0000000 --- a/numba_cuda/numba/cuda/runtime/nrt.cu +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef _NRT_H -#define _NRT_H - -#include - -typedef __device__ void (*NRT_dtor_function)(void* ptr, size_t size, void* info); -typedef __device__ void (*NRT_dealloc_func)(void* ptr, void* dealloc_info); - -typedef struct MemInfo NRT_MemInfo; - -extern "C" { -struct MemInfo { - cuda::atomic refct; - NRT_dtor_function dtor; - void* dtor_info; - void* data; - size_t size; -}; -} - -// Globally needed variables -struct NRT_MemSys { - struct { - bool enabled; - cuda::atomic alloc; - cuda::atomic free; - cuda::atomic mi_alloc; - cuda::atomic mi_free; - } stats; -}; - -/* The Memory System object */ -__device__ NRT_MemSys* TheMSys; - -extern "C" __device__ void* NRT_Allocate(size_t size) -{ - void* ptr = NULL; - ptr = malloc(size); -// if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; } - return ptr; -} - -extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi, - void* data, - size_t size, - NRT_dtor_function dtor, - void* dtor_info) -// NRT_MemSys* TheMSys) -{ - mi->refct = 1; /* starts with 1 refct */ - mi->dtor = dtor; - mi->dtor_info = dtor_info; - mi->data = 
data; - mi->size = size; -// if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; } -} - -__device__ NRT_MemInfo* NRT_MemInfo_new( - void* data, size_t size, NRT_dtor_function dtor, void* dtor_info) -{ - NRT_MemInfo* mi = (NRT_MemInfo*)NRT_Allocate(sizeof(NRT_MemInfo)); - if (mi != NULL) { NRT_MemInfo_init(mi, data, size, dtor, dtor_info); } - return mi; -} - -extern "C" __device__ void NRT_Free(void* ptr) -{ - free(ptr); - //if (TheMSys->stats.enabled) { TheMSys->stats.free++; } -} - -extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi) -{ - NRT_Free(mi); -} - -extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi) -{ - NRT_dealloc(mi); - //if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; } -} -extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi) -{ - if (mi->dtor) /* We have a destructor */ - mi->dtor(mi->data, mi->size, NULL); - /* Clear and release MemInfo */ - NRT_MemInfo_destroy(mi); -} - -/* - c++ version of the NRT_decref function that usually is added to - the final kernel link in PTX form by numba. This version may be - used by c++ APIs that accept ownership of live objects and must - manage them going forward. -*/ -extern "C" __device__ void NRT_decref(NRT_MemInfo* mi) -{ - mi->refct--; - if (mi->refct == 0) { NRT_MemInfo_call_dtor(mi); } -} - -#endif - -extern "C" __device__ void NRT_incref(NRT_MemInfo* mi) -{ - mi->refct++; -} diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index f43b099..6402ff3 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -67,7 +67,6 @@ def resolve_value_type(self, val): class CUDATargetContext(BaseContext): implement_powi_as_math_call = True strict_alignment = True - enable_nrt = True def __init__(self, typingctx, target='cuda'): super().__init__(typingctx, target) From c57053c7b94b2b7b9f551eee226c7408c318d534 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 8 Aug 2024 07:51:59 -0700 Subject: [PATCH 10/52] bifurcate error messages --- numba_cuda/numba/cuda/cudadrv/driver.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 70a8c7a..b6048a3 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2654,11 +2654,15 @@ def complete(self): """ -_MVC_ERROR_MESSAGE = ( +_MVC_ERROR_MESSAGE_CU11 = ( "Minor version compatibility requires ptxcompiler and cubinlinker packages " "to be available" ) +_MVC_ERROR_MESSAGE_CU12 = ( + "Minor version compatibility requires pynvjitlink package to be available" +) + class MVCLinker(Linker): """ @@ -2669,7 +2673,7 @@ def __init__(self, max_registers=None, lineinfo=False, cc=None): try: from cubinlinker import CubinLinker except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE) from err + raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err if cc is None: raise RuntimeError("MVCLinker requires Compute Capability to be " @@ -2701,7 +2705,7 @@ def add_ptx(self, ptx, name=''): from ptxcompiler import compile_ptx from cubinlinker import CubinLinkerError except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE) from err + raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err compile_result = compile_ptx(ptx.decode(), self.ptx_compile_options) try: self._linker.add_cubin(compile_result.compiled_program, name) @@ -2712,7 +2716,7 @@ def add_file(self, path, kind): try: from cubinlinker import CubinLinkerError except ImportError as err: - 
raise ImportError(_MVC_ERROR_MESSAGE) from err + raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err try: with open(path, 'rb') as f: @@ -2741,7 +2745,7 @@ def complete(self): try: from cubinlinker import CubinLinkerError except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE) from err + raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err try: return self._linker.complete() @@ -2945,7 +2949,7 @@ def __init__( additional_flags=None, ): if pynvjitlink_import_err is not None: - raise ImportError(_MVC_ERROR_MESSAGE) + raise ImportError(_MVC_ERROR_MESSAGE_CU12) if cc is None: raise RuntimeError("PyNvJitLinker requires CC to be specified") if not any(isinstance(cc, t) for t in [list, tuple]): From 363b86dd4e5be5982cfcd85aa9b9858fa3bc5bf2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 12 Aug 2024 06:05:13 -0700 Subject: [PATCH 11/52] partially address reviews --- numba_cuda/numba/cuda/cudadrv/driver.py | 64 +++++++++++++++++-------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index b6048a3..0748931 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -19,6 +19,7 @@ import warnings import logging import threading +import traceback import asyncio import pathlib from itertools import product @@ -64,6 +65,38 @@ pynvjitlink_import_err = err +def _readenv(name, ctor, default): + value = os.environ.get(name) + if value is None: + return default() if callable(default) else default + try: + return ctor(value) + except Exception: + warnings.warn( + f"Environment variable '{name}' is defined but its associated " + f"value '{value}' could not be parsed.\n" + "The parse failed with exception:\n" + f"{traceback.format_exc()}", + RuntimeWarning + ) + return default + + +config.ENABLE_PYNVJITLINK = False +if _readenv("ENABLE_PYNVJITLINK", bool, False): + config.ENABLE_PYNVJITLINK = True + + +_MVC_ERROR_MESSAGE_CU11 = ( + "Minor version compatibility requires ptxcompiler and cubinlinker packages " + "to be available" +) + +_MVC_ERROR_MESSAGE_CU12 = ( + "Minor version compatibility requires pynvjitlink package to be available" +) + + def make_logger(): logger = logging.getLogger(__name__) # is logging configured? @@ -440,7 +473,7 @@ def get_active_context(self): def get_version(self): """ - Returns the CUDA Runtime version as a tuple (major, minor). + Returns the CUDA Driver version as a tuple (major, minor). """ if USE_NV_BINDING: version = driver.cuDriverGetVersion() @@ -2566,23 +2599,26 @@ def new(cls, additional_flags=None ): if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: - # TODO: circular - from . 
import runtime + from numba.cuda.cudadrv import runtime driver_ver, runtime_ver = ( driver.get_version(), runtime.get_version() ) if driver_ver >= (12, 0) and runtime_ver > driver_ver: # runs once - return PyNvJitLinker( - max_registers, lineinfo, cc, lto, additional_flags - ) + linker = PyNvJitLinker else: - return MVCLinker(max_registers, lineinfo, cc) + linker = MVCLinker elif USE_NV_BINDING: - return CudaPythonLinker(max_registers, lineinfo, cc) + linker = CudaPythonLinker + else: + linker = CtypesLinker + if linker is PyNvJitLinker: + return linker(max_registers, lineinfo, cc, lto, additional_flags) + elif additional_flags or lto: + raise ValueError("LTO and additional flags require PyNvJitLinker") else: - return CtypesLinker(max_registers, lineinfo, cc) + return linker(max_registers, lineinfo, cc) @abstractmethod def __init__(self, max_registers, lineinfo, cc): @@ -2654,16 +2690,6 @@ def complete(self): """ -_MVC_ERROR_MESSAGE_CU11 = ( - "Minor version compatibility requires ptxcompiler and cubinlinker packages " - "to be available" -) - -_MVC_ERROR_MESSAGE_CU12 = ( - "Minor version compatibility requires pynvjitlink package to be available" -) - - class MVCLinker(Linker): """ Linker supporting Minor Version Compatibility, backed by the cubinlinker From 32164e92e775173abcb077fff97716cc63ec5548 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 12 Aug 2024 07:05:43 -0700 Subject: [PATCH 12/52] move add_file_guess_ext logic to Linker base class --- numba_cuda/numba/cuda/cudadrv/driver.py | 73 ++++++++++------------- numba_cuda/numba/cuda/cudadrv/enums.py | 5 +- numba_cuda/numba/cuda/cudadrv/mappings.py | 1 + 3 files changed, 36 insertions(+), 43 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 0748931..fe46630 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2667,19 +2667,38 @@ def add_cu_file(self, path): cu = f.read() self.add_cu(cu, os.path.basename(path)) - def add_file_guess_ext(self, path): + def add_file_guess_ext(self, path_or_code): """Add a file to the link, guessing its type from its extension.""" - ext = os.path.splitext(path)[1][1:] - if ext == '': - raise RuntimeError("Don't know how to link file with no extension") - elif ext == 'cu': - self.add_cu_file(path) - else: - kind = FILE_EXTENSION_MAP.get(ext, None) - if kind is None: - raise RuntimeError("Don't know how to link file with extension " - f".{ext}") - self.add_file(path, kind) + if isinstance(path_or_code, str): + ext = pathlib.Path(path_or_code).suffix + if ext == '': + raise RuntimeError( + "Don't know how to link file with no extension" + ) + elif ext == '.cu': + self.add_cu_file(path_or_code) + else: + kind = FILE_EXTENSION_MAP.get(ext, None) + if kind is None: + raise RuntimeError( + "Don't know how to link file with extension " + f".{ext}" + ) + self.add_file(path_or_code, kind) + return + else: + # Otherwise, we should have been given a LinkableCode object + if not isinstance(path_or_code, LinkableCode): + raise TypeError( + "Expected path to file or a LinkableCode object" + ) + + if path_or_code.kind == "cu": + self.add_cu(path_or_code.data, path_or_code.name) + else: + self.add_data( + path_or_code.data, path_or_code.kind, path_or_code.name + ) @abstractmethod def complete(self): @@ -3017,36 +3036,6 @@ def add_ltoir(self, ltoir, name=""): def add_object(self, obj, name=""): self._linker.add_object(obj, name) - def add_file_guess_ext(self, path_or_code): - # Numba's 
add_file_guess_ext expects to always be passed a path to a - # file that it will load from the filesystem to link. We augment it - # here with the ability to provide a file from memory. - - # To maintain compatibility with the original interface, all strings - # are treated as paths in the filesystem. - if isinstance(path_or_code, str): - # Upstream numba does not yet recognize LTOIR, so handle that - # separately here. - extension = pathlib.Path(path_or_code).suffix - if extension == ".ltoir": - self.add_file(path_or_code, "ltoir") - else: - # Use Numba's logic for non-LTOIR - super().add_file_guess_ext(path_or_code) - - return - - # Otherwise, we should have been given a LinkableCode object - if not isinstance(path_or_code, LinkableCode): - raise TypeError("Expected path to file or a LinkableCode object") - - if path_or_code.kind == "cu": - self.add_cu(path_or_code.data, path_or_code.name) - else: - self.add_data( - path_or_code.data, path_or_code.kind, path_or_code.name - ) - def add_file(self, path, kind): try: with open(path, "rb") as f: diff --git a/numba_cuda/numba/cuda/cudadrv/enums.py b/numba_cuda/numba/cuda/cudadrv/enums.py index 3431cf7..917dbb1 100644 --- a/numba_cuda/numba/cuda/cudadrv/enums.py +++ b/numba_cuda/numba/cuda/cudadrv/enums.py @@ -309,7 +309,10 @@ # Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY CU_JIT_INPUT_LIBRARY = 4 -CU_JIT_NUM_INPUT_TYPES = 6 +# LTO IR +CU_JIT_INPUT_LTO_IR = 5 + +CU_JIT_NUM_INPUT_TYPES = 7 # Online compiler and linker options diff --git a/numba_cuda/numba/cuda/cudadrv/mappings.py b/numba_cuda/numba/cuda/cudadrv/mappings.py index 3750324..15fe79b 100644 --- a/numba_cuda/numba/cuda/cudadrv/mappings.py +++ b/numba_cuda/numba/cuda/cudadrv/mappings.py @@ -19,4 +19,5 @@ 'lib': enums.CU_JIT_INPUT_LIBRARY, 'cubin': enums.CU_JIT_INPUT_CUBIN, 'fatbin': enums.CU_JIT_INPUT_FATBINARY, + 'ltoir': enums.CU_JIT_INPUT_LTO_IR, } From c3b908403dd9c99ef41b1b5b17cddb9819a7bc41 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 19 Aug 2024 06:50:26 -0700 Subject: [PATCH 13/52] refactor __new__ logic --- numba_cuda/numba/cuda/cudadrv/driver.py | 44 ++++++++++++++++++------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index fe46630..67603c4 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2598,21 +2598,41 @@ def new(cls, lto=None, additional_flags=None ): - if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: - from numba.cuda.cudadrv import runtime - driver_ver, runtime_ver = ( - driver.get_version(), runtime.get_version() + + driver_ver = driver.get_version() + if ( + config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY + and driver_ver >= (12, 0) + ): + raise ValueError( + "Use ENABLE_PYNVJITLINK for CUDA >= 12.0 MVC" + ) + if config.ENABLE_PYNVJITLINK and driver_ver < (12, 0): + raise ValueError( + "Use CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY " + "for CUDA < 12.0 MVC" + ) + if ( + config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY + and config.ENABLE_PYNVJITLINK + ): + raise ValueError( + "can't set both config.ENABLE_PYNVJITLINK " + "and config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY " + "at the same time" ) - if driver_ver >= (12, 0) and runtime_ver > driver_ver: - # runs once - linker = PyNvJitLinker - else: - linker = MVCLinker - elif USE_NV_BINDING: - linker = CudaPythonLinker + if config.ENABLE_PYNVJITLINK: + linker = PyNvJitLinker + + elif config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: 
+ linker = MVCLinker else: - linker = CtypesLinker + if USE_NV_BINDING: + linker = CudaPythonLinker + else: + linker = CtypesLinker + if linker is PyNvJitLinker: return linker(max_registers, lineinfo, cc, lto, additional_flags) elif additional_flags or lto: From 2c940ee07c41f2baf6803256bcc14969a8663f4b Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 19 Aug 2024 06:59:30 -0700 Subject: [PATCH 14/52] address reviews --- numba_cuda/numba/cuda/cudadrv/driver.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 67603c4..9315343 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -82,7 +82,6 @@ def _readenv(name, ctor, default): return default -config.ENABLE_PYNVJITLINK = False if _readenv("ENABLE_PYNVJITLINK", bool, False): config.ENABLE_PYNVJITLINK = True @@ -93,7 +92,7 @@ def _readenv(name, ctor, default): ) _MVC_ERROR_MESSAGE_CU12 = ( - "Minor version compatibility requires pynvjitlink package to be available" + "Using pynvjitlink requires the pynvjitlink package to be available" ) From a8c38b6613985e9ca90d4a11c299094b18e8e82b Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 21 Aug 2024 12:33:27 -0700 Subject: [PATCH 15/52] refactor config logic --- numba_cuda/numba/cuda/cudadrv/driver.py | 33 +++++++++++++++---------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 9315343..9efac41 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -58,11 +58,7 @@ _py_decref.argtypes = [ctypes.py_object] _py_incref.argtypes = [ctypes.py_object] -pynvjitlink_import_err = None -try: - from pynvjitlink.api import NvJitLinker, NvJitLinkError -except ImportError as err: - pynvjitlink_import_err = err +_MVC_ERROR_MESSAGE_CU12 = None def _readenv(name, ctor, default): @@ -82,10 +78,6 @@ def _readenv(name, ctor, default): return default -if _readenv("ENABLE_PYNVJITLINK", bool, False): - config.ENABLE_PYNVJITLINK = True - - _MVC_ERROR_MESSAGE_CU11 = ( "Minor version compatibility requires ptxcompiler and cubinlinker packages " "to be available" @@ -95,6 +87,23 @@ def _readenv(name, ctor, default): "Using pynvjitlink requires the pynvjitlink package to be available" ) +ENABLE_PYNVJITLINK = ( + _readenv("ENABLE_PYNVJITLINK", bool, False) + or config.ENABLE_PYNVJITLINK +) + +if ENABLE_PYNVJITLINK: + try: + from pynvjitlink.api import NvJitLinker, NvJitLinkError + except ImportError: + raise ImportError(_MVC_ERROR_MESSAGE_CU12) + + if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: + raise ValueError( + "Can't set CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY and " + "ENABLE_PYNVJITLINK at the same time" + ) + def make_logger(): logger = logging.getLogger(__name__) @@ -2608,8 +2617,7 @@ def new(cls, ) if config.ENABLE_PYNVJITLINK and driver_ver < (12, 0): raise ValueError( - "Use CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY " - "for CUDA < 12.0 MVC" + "Enabling pynvjitlink requires CUDA 12." 
) if ( config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY @@ -3012,8 +3020,7 @@ def __init__( lto=False, additional_flags=None, ): - if pynvjitlink_import_err is not None: - raise ImportError(_MVC_ERROR_MESSAGE_CU12) + if cc is None: raise RuntimeError("PyNvJitLinker requires CC to be specified") if not any(isinstance(cc, t) for t in [list, tuple]): From 421fdfbefd01acc86022aa15344cab7f90ed38b0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 22 Aug 2024 02:16:52 -0700 Subject: [PATCH 16/52] continue addressing reviews --- numba_cuda/numba/cuda/cudadrv/driver.py | 24 +++++++----------------- numba_cuda/numba/cuda/cudadrv/enums.py | 2 +- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 9efac41..6a16403 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2695,7 +2695,11 @@ def add_cu_file(self, path): self.add_cu(cu, os.path.basename(path)) def add_file_guess_ext(self, path_or_code): - """Add a file to the link, guessing its type from its extension.""" + """ + Add a file or LinkableCode object to the link. If a file is + passed, the type will be inferred from the extension. A LinkableCode + object represents a file already in memory. + """ if isinstance(path_or_code, str): ext = pathlib.Path(path_or_code).suffix if ext == '': @@ -2704,6 +2708,8 @@ def add_file_guess_ext(self, path_or_code): ) elif ext == '.cu': self.add_cu_file(path_or_code) + elif ext == ".ltoir": + self.add_file(path_or_code, "ltoir") else: kind = FILE_EXTENSION_MAP.get(ext, None) if kind is None: @@ -3093,22 +3099,6 @@ def add_data(self, data, kind, name): except NvJitLinkError as e: raise LinkerError from e - def add_cu(self, cu, name): - with driver.get_active_context() as ac: - dev = driver.get_device(ac.devnum) - cc = dev.compute_capability - - ptx, log = nvrtc.compile(cu, name, cc) - - if config.DUMP_ASSEMBLY: - print(("ASSEMBLY %s" % name).center(80, "-")) - print(ptx) - print("=" * 80) - - # Link the program's PTX using the normal linker mechanism - ptx_name = os.path.splitext(name)[0] + ".ptx" - self.add_ptx(ptx.encode(), ptx_name) - def complete(self): try: cubin = self._linker.get_linked_cubin() diff --git a/numba_cuda/numba/cuda/cudadrv/enums.py b/numba_cuda/numba/cuda/cudadrv/enums.py index 917dbb1..25bbbe1 100644 --- a/numba_cuda/numba/cuda/cudadrv/enums.py +++ b/numba_cuda/numba/cuda/cudadrv/enums.py @@ -312,7 +312,7 @@ # LTO IR CU_JIT_INPUT_LTO_IR = 5 -CU_JIT_NUM_INPUT_TYPES = 7 +CU_JIT_NUM_INPUT_TYPES = 6 # Online compiler and linker options From 16314a76cb3366f8f215521c56efaa29fe27b8f4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 22 Aug 2024 02:19:28 -0700 Subject: [PATCH 17/52] rename errors --- numba_cuda/numba/cuda/cudadrv/driver.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 6a16403..a1a5c0c 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -58,7 +58,7 @@ _py_decref.argtypes = [ctypes.py_object] _py_incref.argtypes = [ctypes.py_object] -_MVC_ERROR_MESSAGE_CU12 = None +_PYNVJITLINK_ERR_MSG = None def _readenv(name, ctor, default): @@ -78,12 +78,12 @@ def _readenv(name, ctor, default): return default -_MVC_ERROR_MESSAGE_CU11 = ( +_MVC_ERROR_MESSAGE = ( "Minor version compatibility requires ptxcompiler and cubinlinker packages " "to be available" ) 
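These two messages guard the two opt-in linker paths in this series: cubinlinker/ptxcompiler for CUDA 11 minor version compatibility, and pynvjitlink for CUDA 12. For reference, a hedged sketch of how the pynvjitlink path is switched on, based on the _readenv("ENABLE_PYNVJITLINK", bool, False) and config.ENABLE_PYNVJITLINK handling added earlier in the series; illustrative only, not part of the patch:

# Option 1: environment variable, read once when the driver module is imported:
#   ENABLE_PYNVJITLINK=1 python app.py
# Note that, as written, the value is parsed with bool(), so any non-empty
# string (including "0") counts as enabled.
# Option 2: set the config attribute before any kernel is compiled, as the
# new nvjitlink tests do.
from numba import config
config.ENABLE_PYNVJITLINK = True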
-_MVC_ERROR_MESSAGE_CU12 = ( +_PYNVJITLINK_ERR_MSG = ( "Using pynvjitlink requires the pynvjitlink package to be available" ) @@ -96,7 +96,7 @@ def _readenv(name, ctor, default): try: from pynvjitlink.api import NvJitLinker, NvJitLinkError except ImportError: - raise ImportError(_MVC_ERROR_MESSAGE_CU12) + raise ImportError(_PYNVJITLINK_ERR_MSG) if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: raise ValueError( @@ -2751,7 +2751,7 @@ def __init__(self, max_registers=None, lineinfo=False, cc=None): try: from cubinlinker import CubinLinker except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err + raise ImportError(_MVC_ERROR_MESSAGE) from err if cc is None: raise RuntimeError("MVCLinker requires Compute Capability to be " @@ -2783,7 +2783,7 @@ def add_ptx(self, ptx, name=''): from ptxcompiler import compile_ptx from cubinlinker import CubinLinkerError except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err + raise ImportError(_MVC_ERROR_MESSAGE) from err compile_result = compile_ptx(ptx.decode(), self.ptx_compile_options) try: self._linker.add_cubin(compile_result.compiled_program, name) @@ -2794,7 +2794,7 @@ def add_file(self, path, kind): try: from cubinlinker import CubinLinkerError except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err + raise ImportError(_MVC_ERROR_MESSAGE) from err try: with open(path, 'rb') as f: @@ -2823,7 +2823,7 @@ def complete(self): try: from cubinlinker import CubinLinkerError except ImportError as err: - raise ImportError(_MVC_ERROR_MESSAGE_CU11) from err + raise ImportError(_MVC_ERROR_MESSAGE) from err try: return self._linker.complete() From 41d85a9254076887ec14ea6e0ea7edc943956601 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 22 Aug 2024 03:31:25 -0700 Subject: [PATCH 18/52] minor cleanup --- numba_cuda/numba/cuda/cudadrv/driver.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index a1a5c0c..483df13 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -58,8 +58,6 @@ _py_decref.argtypes = [ctypes.py_object] _py_incref.argtypes = [ctypes.py_object] -_PYNVJITLINK_ERR_MSG = None - def _readenv(name, ctor, default): value = os.environ.get(name) @@ -83,7 +81,7 @@ def _readenv(name, ctor, default): "to be available" ) -_PYNVJITLINK_ERR_MSG = ( +_PYNVJITLINK_ERR_MESSAGE = ( "Using pynvjitlink requires the pynvjitlink package to be available" ) @@ -96,7 +94,7 @@ def _readenv(name, ctor, default): try: from pynvjitlink.api import NvJitLinker, NvJitLinkError except ImportError: - raise ImportError(_PYNVJITLINK_ERR_MSG) + raise ImportError(_PYNVJITLINK_ERR_MESSAGE) if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: raise ValueError( From f7939b66a49a1c39b3ad3ac707ee745d68c13b99 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 22 Aug 2024 07:10:30 -0500 Subject: [PATCH 19/52] Apply suggestions from code review Co-authored-by: Graham Markall <535640+gmarkall@users.noreply.github.com> --- numba_cuda/numba/cuda/cudadrv/driver.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 483df13..45f3c72 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2640,7 +2640,7 @@ def new(cls, if linker is PyNvJitLinker: return 
linker(max_registers, lineinfo, cc, lto, additional_flags) - elif additional_flags or lto: + elif additional_flags is not None or lto is not None: raise ValueError("LTO and additional flags require PyNvJitLinker") else: return linker(max_registers, lineinfo, cc) @@ -2706,8 +2706,6 @@ def add_file_guess_ext(self, path_or_code): ) elif ext == '.cu': self.add_cu_file(path_or_code) - elif ext == ".ltoir": - self.add_file(path_or_code, "ltoir") else: kind = FILE_EXTENSION_MAP.get(ext, None) if kind is None: @@ -3087,7 +3085,7 @@ def add_data(self, data, kind, name): return self.add_ptx(data, name) elif kind == FILE_EXTENSION_MAP["o"]: fn = self._linker.add_object - elif kind == "ltoir": + elif kind == FILE_EXTENSION_MAP["ltoir"]: fn = self._linker.add_ltoir else: raise LinkerError(f"Don't know how to link {kind}") From 91f06a89f15161eaa2c0f4fcf563a53fcc4dada6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 22 Aug 2024 05:40:49 -0700 Subject: [PATCH 20/52] address reviews --- numba_cuda/numba/cuda/cudadrv/driver.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 45f3c72..1a633db 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -81,20 +81,22 @@ def _readenv(name, ctor, default): "to be available" ) -_PYNVJITLINK_ERR_MESSAGE = ( - "Using pynvjitlink requires the pynvjitlink package to be available" -) ENABLE_PYNVJITLINK = ( _readenv("ENABLE_PYNVJITLINK", bool, False) - or config.ENABLE_PYNVJITLINK + or getattr(config, "ENABLE_PYNVJITLINK", None) ) +if not hasattr(config, "ENABLE_PYNVJITLINK"): + config.ENABLE_PYNVJITLINK = ENABLE_PYNVJITLINK + if ENABLE_PYNVJITLINK: try: from pynvjitlink.api import NvJitLinker, NvJitLinkError except ImportError: - raise ImportError(_PYNVJITLINK_ERR_MESSAGE) + raise ImportError( + "Using pynvjitlink requires the pynvjitlink package to be available" + ) if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: raise ValueError( @@ -2617,16 +2619,6 @@ def new(cls, raise ValueError( "Enabling pynvjitlink requires CUDA 12." ) - if ( - config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY - and config.ENABLE_PYNVJITLINK - ): - raise ValueError( - "can't set both config.ENABLE_PYNVJITLINK " - "and config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY " - "at the same time" - ) - if config.ENABLE_PYNVJITLINK: linker = PyNvJitLinker From 0541dcfb919f141ed6f8697f7173a0e3f0d87270 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 22 Aug 2024 05:55:11 -0700 Subject: [PATCH 21/52] bug fixes and map ltoir to CU_JIT_INPUT_NVVM --- numba_cuda/numba/cuda/cudadrv/driver.py | 2 +- numba_cuda/numba/cuda/cudadrv/mappings.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 1a633db..108b92b 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2691,7 +2691,7 @@ def add_file_guess_ext(self, path_or_code): object represents a file already in memory. 
""" if isinstance(path_or_code, str): - ext = pathlib.Path(path_or_code).suffix + ext = pathlib.Path(path_or_code).suffix.lstrip('.') if ext == '': raise RuntimeError( "Don't know how to link file with no extension" diff --git a/numba_cuda/numba/cuda/cudadrv/mappings.py b/numba_cuda/numba/cuda/cudadrv/mappings.py index 15fe79b..a552af1 100644 --- a/numba_cuda/numba/cuda/cudadrv/mappings.py +++ b/numba_cuda/numba/cuda/cudadrv/mappings.py @@ -10,6 +10,7 @@ 'lib': jitty.CU_JIT_INPUT_LIBRARY, 'cubin': jitty.CU_JIT_INPUT_CUBIN, 'fatbin': jitty.CU_JIT_INPUT_FATBINARY, + 'ltoir': jitty.CU_JIT_INPUT_NVVM, } else: FILE_EXTENSION_MAP = { From 710f8cb16395d6fe8c566091658dfd65c5a1cb80 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 4 Sep 2024 05:02:44 -0700 Subject: [PATCH 22/52] CU_JIT_INPUT_LTO_IR -> CU_JIT_INPUT_NVVM --- numba_cuda/numba/cuda/cudadrv/enums.py | 2 +- numba_cuda/numba/cuda/cudadrv/mappings.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/enums.py b/numba_cuda/numba/cuda/cudadrv/enums.py index 25bbbe1..0b1b155 100644 --- a/numba_cuda/numba/cuda/cudadrv/enums.py +++ b/numba_cuda/numba/cuda/cudadrv/enums.py @@ -310,7 +310,7 @@ CU_JIT_INPUT_LIBRARY = 4 # LTO IR -CU_JIT_INPUT_LTO_IR = 5 +CU_JIT_INPUT_NVVM = 5 CU_JIT_NUM_INPUT_TYPES = 6 diff --git a/numba_cuda/numba/cuda/cudadrv/mappings.py b/numba_cuda/numba/cuda/cudadrv/mappings.py index a552af1..541b04c 100644 --- a/numba_cuda/numba/cuda/cudadrv/mappings.py +++ b/numba_cuda/numba/cuda/cudadrv/mappings.py @@ -20,5 +20,5 @@ 'lib': enums.CU_JIT_INPUT_LIBRARY, 'cubin': enums.CU_JIT_INPUT_CUBIN, 'fatbin': enums.CU_JIT_INPUT_FATBINARY, - 'ltoir': enums.CU_JIT_INPUT_LTO_IR, + 'ltoir': enums.CU_JIT_INPUT_NVVM, } From a8cb6c223e07cea44333f7a99d022c488e0fca71 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 4 Sep 2024 05:03:37 -0700 Subject: [PATCH 23/52] only use cuda if CUDA_USE_NVIDIA_BINDING --- numba_cuda/numba/cuda/cudadrv/mappings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cudadrv/mappings.py b/numba_cuda/numba/cuda/cudadrv/mappings.py index 541b04c..95d369e 100644 --- a/numba_cuda/numba/cuda/cudadrv/mappings.py +++ b/numba_cuda/numba/cuda/cudadrv/mappings.py @@ -1,7 +1,7 @@ from numba import config -from cuda import cuda from . import enums if config.CUDA_USE_NVIDIA_BINDING: + from cuda import cuda jitty = cuda.CUjitInputType FILE_EXTENSION_MAP = { 'o': jitty.CU_JIT_INPUT_OBJECT, From b8b671f3a6016939f67e4f5a991666d2a8ec67ed Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 4 Sep 2024 06:21:04 -0700 Subject: [PATCH 24/52] fixes --- numba_cuda/numba/cuda/cudadrv/driver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 108b92b..225e0df 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2691,7 +2691,7 @@ def add_file_guess_ext(self, path_or_code): object represents a file already in memory. 
""" if isinstance(path_or_code, str): - ext = pathlib.Path(path_or_code).suffix.lstrip('.') + ext = pathlib.Path(path_or_code).suffix if ext == '': raise RuntimeError( "Don't know how to link file with no extension" @@ -2699,11 +2699,11 @@ def add_file_guess_ext(self, path_or_code): elif ext == '.cu': self.add_cu_file(path_or_code) else: - kind = FILE_EXTENSION_MAP.get(ext, None) + kind = FILE_EXTENSION_MAP.get(ext.lstrip('.'), None) if kind is None: raise RuntimeError( "Don't know how to link file with extension " - f".{ext}" + f"{ext}" ) self.add_file(path_or_code, kind) return From 7c384a3b202f8e2e93514edb48d5c372b70a5f74 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 26 Sep 2024 09:27:36 -0700 Subject: [PATCH 25/52] tests --- .gitignore | 2 + numba_cuda/numba/cuda/codegen.py | 18 +- numba_cuda/numba/cuda/dispatcher.py | 4 +- numba_cuda/numba/cuda/target.py | 6 +- .../numba/cuda/tests/cudadrv/test_linker.py | 5 - .../cuda/tests/cudadrv/test_nvjitlink.py | 181 ++++++++++++++++++ .../tests/test_binary_generation/Makefile | 54 ++++++ .../generate_raw_ltoir.py | 163 ++++++++++++++++ .../test_device_functions.cu | 19 ++ .../undefined_extern.cu | 3 + 10 files changed, 442 insertions(+), 13 deletions(-) create mode 100644 numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py create mode 100644 numba_cuda/numba/cuda/tests/test_binary_generation/Makefile create mode 100644 numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py create mode 100644 numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu create mode 100644 numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu diff --git a/.gitignore b/.gitignore index 55b4f8b..99271f3 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ __pycache__ build .*.swp *.so +numba_cuda/numba/cuda/tests/cudadrv/test_device_functions.* +numba_cuda/numba/cuda/tests/cudadrv/undefined_extern.* diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py index 6009dcb..0b25444 100644 --- a/numba_cuda/numba/cuda/codegen.py +++ b/numba_cuda/numba/cuda/codegen.py @@ -59,11 +59,18 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary): get_cufunc), which may be of different compute capabilities. """ - def __init__(self, codegen, name, entry_name=None, max_registers=None, - nvvm_options=None): + def __init__( + self, + codegen, + name, + entry_name=None, + max_registers=None, + lto=False, + nvvm_options=None + ): """ codegen: - Codegen object. + Codegen object. name: Name of the function in the source. 
entry_name: @@ -103,6 +110,7 @@ def __init__(self, codegen, name, entry_name=None, max_registers=None, self._cufunc_cache = {} self._max_registers = max_registers + self._lto = lto if nvvm_options is None: nvvm_options = {} self._nvvm_options = nvvm_options @@ -178,7 +186,9 @@ def get_cubin(self, cc=None): if cubin: return cubin - linker = driver.Linker.new(max_registers=self._max_registers, cc=cc) + linker = driver.Linker.new( + max_registers=self._max_registers, cc=cc, lto=self._lto + ) if linker.lto: ltoir = self.get_ltoir(cc=cc) diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 16680a2..332fe15 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -46,7 +46,7 @@ class _Kernel(serialize.ReduceMixin): @global_compiler_lock def __init__(self, py_func, argtypes, link=None, debug=False, lineinfo=False, inline=False, fastmath=False, extensions=None, - max_registers=None, opt=True, device=False): + max_registers=None, lto=False, opt=True, device=False): if device: raise RuntimeError('Cannot compile a device function as a kernel') @@ -94,7 +94,7 @@ def __init__(self, py_func, argtypes, link=None, debug=False, lib, kernel = tgt_ctx.prepare_cuda_kernel(cres.library, cres.fndesc, debug, lineinfo, nvvm_options, filename, linenum, - max_registers) + max_registers, lto) if not link: link = [] diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index 6402ff3..98d4311 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -148,7 +148,7 @@ def mangler(self, name, argtypes, *, abi_tags=(), uid=None): def prepare_cuda_kernel(self, codelib, fndesc, debug, lineinfo, nvvm_options, filename, linenum, - max_registers=None): + max_registers=None, lto=False): """ Adapt a code library ``codelib`` with the numba compiled CUDA kernel with name ``fname`` and arguments ``argtypes`` for NVVM. 
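The lto flag threaded through _Kernel, prepare_cuda_kernel and CUDACodeLibrary in this patch ends up at driver.Linker.new(..., lto=self._lto) in get_cubin, where the pynvjitlink-backed linker turns it into an extra "-lto" link option. A hedged sketch of that observable effect, assuming pynvjitlink is installed and enabled; it simply mirrors the assertions in the test_nvjitlink_ptx_compile_options test added later in this patch:

from numba.cuda.cudadrv.driver import PyNvJitLinker

# With lto=True the nvJitLink options gain "-lto" alongside the
# architecture flag derived from the compute capability.
linker = PyNvJitLinker(cc=(7, 5), lto=True)
assert "-arch=sm_75" in linker.options
assert "-lto" in linker.options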
@@ -175,7 +175,9 @@ def prepare_cuda_kernel(self, codelib, fndesc, debug, lineinfo, library = self.codegen().create_library(f'{codelib.name}_kernel_', entry_name=kernel_name, nvvm_options=nvvm_options, - max_registers=max_registers) + max_registers=max_registers, + lto=lto + ) library.add_linking_library(codelib) wrapper = self.generate_kernel_wrapper(library, fndesc, kernel_name, debug, lineinfo, filename, diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py index 22e2ee8..31d142d 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py @@ -1,6 +1,5 @@ import numpy as np import warnings -from numba.cuda.testing import unittest from numba.cuda.testing import (skip_on_cudasim, skip_if_cuda_includes_missing) from numba.cuda.testing import CUDATestCase, test_data_dir from numba.cuda.cudadrv.driver import (CudaAPIError, Linker, @@ -311,7 +310,3 @@ def test_get_local_mem_per_specialized(self): local_mem_size = compiled_specialized.get_local_mem_per_thread() calc_size = np.dtype(np.float64).itemsize * LMEM_SIZE self.assertGreaterEqual(local_mem_size, calc_size) - - -if __name__ == '__main__': - unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py new file mode 100644 index 0000000..e528e7f --- /dev/null +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -0,0 +1,181 @@ +from numba.cuda.testing import unittest +from numba.cuda.testing import skip_on_cudasim +from numba.cuda.testing import CUDATestCase +from numba.cuda.cudadrv.driver import PyNvJitLinker + +import itertools +from pynvjitlink.api import NvJitLinkError +from numba.cuda import get_current_device +from numba import cuda +from numba import config + +HAVE_PYNVJITLINK = False +try: + import pynvjitlink # noqa: F401 + + HAVE_PYNVJITLINK = True +except ImportError: + pass + + +@unittest.skipIf(not HAVE_PYNVJITLINK, "pynvjitlink not available") +@skip_on_cudasim("Linking unsupported in the simulator") +class TestLinker(CUDATestCase): + _NUMBA_NVIDIA_BINDING_0_ENV = {"NUMBA_CUDA_USE_NVIDIA_BINDING": "0"} + + def test_nvjitlink_create(self): + patched_linker = PyNvJitLinker(cc=(7, 5)) + assert "-arch=sm_75" in patched_linker.options + + def test_nvjitlink_create_no_cc_error(self): + # nvJitLink expects at least the architecture to be specified. 
+ with self.assertRaisesRegex( + RuntimeError, "PyNvJitLinker requires CC to be specified" + ): + PyNvJitLinker() + + def test_nvjitlink_invalid_arch_error(self): + # CC 0.0 is not a valid compute capability + with self.assertRaisesRegex( + NvJitLinkError, "NVJITLINK_ERROR_UNRECOGNIZED_OPTION error" + ): + PyNvJitLinker(cc=(0, 0)) + + def test_nvjitlink_invalid_cc_type_error(self): + with self.assertRaisesRegex( + TypeError, "`cc` must be a list or tuple of length 2" + ): + PyNvJitLinker(cc=0) + + def test_nvjitlink_ptx_compile_options(self): + + max_registers = (None, 32) + lineinfo = (False, True) + lto = (False, True) + additional_flags = (None, ("-g",), ("-g", "-time")) + for ( + max_registers_i, + line_info_i, + lto_i, + additional_flags_i, + ) in itertools.product(max_registers, lineinfo, lto, additional_flags): + with self.subTest( + max_registers=max_registers_i, + lineinfo=line_info_i, + lto=lto_i, + additional_flags=additional_flags_i, + ): + patched_linker = PyNvJitLinker( + cc=(7, 5), + max_registers=max_registers_i, + lineinfo=line_info_i, + lto=lto_i, + additional_flags=additional_flags_i, + ) + assert "-arch=sm_75" in patched_linker.options + + if max_registers_i: + assert ( + f"-maxrregcount={max_registers_i}" + in patched_linker.options + ) + else: + assert "-maxrregcount" not in patched_linker.options + + if line_info_i: + assert "-lineinfo" in patched_linker.options + else: + assert "-lineinfo" not in patched_linker.options + + if lto_i: + assert "-lto" in patched_linker.options + else: + assert "-lto" not in patched_linker.options + + if additional_flags_i: + for flag in additional_flags_i: + assert flag in patched_linker.options + + def test_nvjitlink_add_file_guess_ext_linkable_code(self): + files = ( + "test_device_functions.a", + "test_device_functions.cubin", + "test_device_functions.cu", + "test_device_functions.fatbin", + "test_device_functions.o", + "test_device_functions.ptx", + ) + for file in files: + with self.subTest(file=file): + patched_linker = PyNvJitLinker( + cc=get_current_device().compute_capability + ) + patched_linker.add_file_guess_ext(file) + + def test_nvjitlink_test_add_file_guess_ext_invalid_input(self): + with open("test_device_functions.cubin", "rb") as f: + content = f.read() + + patched_linker = PyNvJitLinker( + cc=get_current_device().compute_capability + ) + with self.assertRaisesRegex( + TypeError, "Expected path to file or a LinkableCode" + ): + # Feeding raw data as bytes to add_file_guess_ext should raise, + # because there's no way to know what kind of file to treat it as + patched_linker.add_file_guess_ext(content) + + @unittest.skipIf(not HAVE_PYNVJITLINK, "pynvjitlink not available") + def test_nvjitlink_jit_with_linkable_code(self): + files = ( + "test_device_functions.a", + "test_device_functions.cubin", + "test_device_functions.cu", + "test_device_functions.fatbin", + "test_device_functions.o", + "test_device_functions.ptx", + ) + for file in files: + with self.subTest(file=file): + # TODO: unsafe teardown if test errors + config.ENABLE_PYNVJITLINK = True + sig = "uint32(uint32, uint32)" + add_from_numba = cuda.declare_device("add_from_numba", sig) + + @cuda.jit(link=[file]) + def kernel(result): + result[0] = add_from_numba(1, 2) + + result = cuda.device_array(1) + kernel[1, 1](result) + assert result[0] == 3 + + def test_nvjitlink_jit_with_linkable_code_lto(self): + file = "test_device_functions.ltoir" + + sig = "uint32(uint32, uint32)" + add_from_numba = cuda.declare_device("add_from_numba", sig) + + @cuda.jit(link=[file], 
lto=True) + def kernel(result): + result[0] = add_from_numba(1, 2) + + result = cuda.device_array(1) + kernel[1, 1](result) + assert result[0] == 3 + + def test_nvjitlink_jit_with_invalid_linkable_code(self): + with open("test_device_functions.cubin", "rb") as f: + content = f.read() + with self.assertRaisesRegex( + TypeError, "Expected path to file or a LinkableCode" + ): + + @cuda.jit("void()", link=[content]) + def kernel(): + pass + + +if __name__ == "__main__": + unittest.main() diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile new file mode 100644 index 0000000..adde5e3 --- /dev/null +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile @@ -0,0 +1,54 @@ +# Generates the input files used by the pynvjitlink binding test suite + +# Test binaries are built taking into account the CC of the GPU in the test machine +GPU_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv | grep -v compute_cap | head -n 1 | sed 's/\.//') +GPU_CC ?= 75 + +# Use CC 7.0 as an alternative in fatbin testing, unless CC is 7.x +ifeq ($(shell echo "$(GPU_CC)" | cut -c1),7) + ALT_CC := 80 +else + ALT_CC := 70 +endif + +# Gencode flags suitable for most tests +GENCODE := -gencode arch=compute_$(GPU_CC),code=sm_$(GPU_CC) + +# Fatbin tests need to generate code for an additional compute capability +FATBIN_GENCODE := $(GENCODE) -gencode arch=compute_$(ALT_CC),code=sm_$(ALT_CC) + +# LTO-IR tests need to generate for the LTO "architecture" instead +LTOIR_GENCODE := -gencode arch=lto_$(GPU_CC),code=lto_$(GPU_CC) + +# Compile with optimization; use relocatable device code to preserve device +# functions in the final output +NVCC_FLAGS := -O3 -rdc true + +# Flags specific to output type +CUBIN_FLAGS := $(GENCODE) --cubin +PTX_FLAGS := $(GENCODE) -ptx +OBJECT_FLAGS := $(GENCODE) -dc +LIBRARY_FLAGS := $(GENCODE) -lib +FATBIN_FLAGS := $(FATBIN_GENCODE) --fatbin +LTOIR_FLAGS := $(LTOIR_GENCODE) -dc + +OUTPUT_DIR := ../cudadrv + +all: + @echo "GPU CC: $(GPU_CC)" + @echo "Alternative CC: $(ALT_CC)" + # Compile all test objects + nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/undefined_extern.cubin undefined_extern.cu + nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.cubin test_device_functions.cu + nvcc $(NVCC_FLAGS) $(FATBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.fatbin test_device_functions.cu + nvcc $(NVCC_FLAGS) $(PTX_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.ptx test_device_functions.cu + nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.o test_device_functions.cu + nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.a test_device_functions.cu + + # Generate LTO-IR wrapped in a fatbin + nvcc $(NVCC_FLAGS) $(LTOIR_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.ltoir.o test_device_functions.cu + # Generate LTO-IR in a "raw" LTO-IR container + python generate_raw_ltoir.py --arch sm_$(GPU_CC) -o $(OUTPUT_DIR)/test_device_functions.ltoir test_device_functions.cu + # We also want to test linking a .cu file; this needs no compilation, + # so copy it instead + cp test_device_functions.cu $(OUTPUT_DIR) diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py b/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py new file mode 100644 index 0000000..8e1d6ec --- /dev/null +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py @@ -0,0 +1,163 @@ +# Copyright (c) 2024, NVIDIA 
CORPORATION. + +import argparse +import pathlib +import subprocess +import sys + +from cuda import nvrtc + +# Magic number found at the start of an LTO-IR file +LTOIR_MAGIC = 0x7F4E43ED + + +def check(args): + """ + Abort and print an error message in the presence of an error result. + + Otherwise: + - Return None if there were no more arguments, + - Return the singular argument if there was only one further argument, + - Return the tuple of arguments if multiple followed. + """ + + result, *args = args + value = result.value + + if value: + error_string = check(nvrtc.nvrtcGetErrorString(result)).decode() + msg = f"NVRTC error, code {value}: {error_string}" + print(msg, file=sys.stderr) + sys.exit(1) + + if len(args) == 0: + return None + elif len(args) == 1: + return args[0] + else: + return args + + +def determine_include_flags(): + # Inspired by the logic in FindCUDAToolkit.cmake. We need the CUDA include + # paths because NVRTC doesn't add them by default, and we can compile a + # much broader set of test files if the CUDA includes are available. + + # We invoke NVCC in verbose mode ("-v") and give a dummy filename, without + # which it won't produce output. + + cmd = ["nvcc", "-v", "__dummy"] + cp = subprocess.run(cmd, capture_output=True) + + # Since the dummy file doesn't actually exist, NVCC is expected to exit + # with an error code of 1. + rc = cp.returncode + if rc != 1: + print(f"Unexpected return code ({rc}) from `nvcc -v`. Expected 1.") + return None + + output = cp.stderr.decode() + lines = output.splitlines() + + includes_lines = [line for line in lines if line.startswith("#$ INCLUDES=")] + if len(includes_lines) != 1: + print(f"Expected exactly one INCLUDES line. Got {len(includes_lines)}.") + return None + + # Parse out the arguments following "INCLUDES=" - these are a space + # separated list of strings that are potentially quoted. + + quoted_flags = includes_lines[0].split("INCLUDES=")[1].strip().split() + include_flags = [flag.strip('"') for flag in quoted_flags] + print(f"Using CUDA include flags: {include_flags}") + + return include_flags + + +def get_ltoir(source, name, arch): + """Given a CUDA C/C++ source, compile it and return the LTO-IR.""" + + program = check( + nvrtc.nvrtcCreateProgram(source.encode(), name.encode(), 0, [], []) + ) + + cuda_include_flags = determine_include_flags() + if cuda_include_flags is None: + print("Error determining CUDA include flags. Exiting.", file=sys.stderr) + sys.exit(1) + + options = [ + f"--gpu-architecture={arch}", + "-dlto", + "-rdc", + "true", + *cuda_include_flags, + ] + options = [o.encode() for o in options] + + result = nvrtc.nvrtcCompileProgram(program, len(options), options) + + # Report compilation errors back to the user + if result[0] == nvrtc.nvrtcResult.NVRTC_ERROR_COMPILATION: + log_size = check(nvrtc.nvrtcGetProgramLogSize(program)) + log = b" " * log_size + check(nvrtc.nvrtcGetProgramLog(program, log)) + print("NVRTC compilation error:\n", file=sys.stderr) + print(log.decode(), file=sys.stderr) + sys.exit(1) + + # Handle other errors in the standard way + check(result) + + ltoir_size = check(nvrtc.nvrtcGetLTOIRSize(program)) + ltoir = b" " * ltoir_size + check(nvrtc.nvrtcGetLTOIR(program, ltoir)) + + # Check that the output looks like an LTO-IR container + header = int.from_bytes(ltoir[:4], byteorder="little") + if header != LTOIR_MAGIC: + print( + f"Unexpected header value 0x{header:X}.\n" + f"Expected LTO-IR magic number 0x{LTOIR_MAGIC:X}." 
+ "\nExiting.", + file=sys.stderr, + ) + sys.exit(1) + + return ltoir + + +def main(sourcepath, outputpath, arch): + with open(sourcepath) as f: + source = f.read() + + name = pathlib.Path(sourcepath).name + ltoir = get_ltoir(source, name, arch) + + print(f"Writing {outputpath}...") + + with open(outputpath, "wb") as f: + f.write(ltoir) + + +if __name__ == "__main__": + description = "Compiles CUDA C/C++ to LTO-IR using NVRTC." + parser = argparse.ArgumentParser(description=description) + parser.add_argument("sourcepath", help="path to source file") + parser.add_argument( + "-o", "--output", help="path to output file", default=None + ) + parser.add_argument( + "-a", + "--arch", + help="compute arch to target (e.g. sm_87). " "Defaults to sm_50.", + default="sm_50", + ) + + args = parser.parse_args() + outputpath = args.output + + if outputpath is None: + outputpath = pathlib.Path(args.sourcepath).with_suffix(".ltoir") + + main(args.sourcepath, outputpath, args.arch) diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu b/numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu new file mode 100644 index 0000000..f1499dc --- /dev/null +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu @@ -0,0 +1,19 @@ +#include + +extern __device__ bool __heq(__half arg1, __half arg2); + +__device__ __half test_add_fp16(__half arg1, __half arg2) { + return __hadd(arg1, arg2); +} + +__device__ bool test_cmp_fp16(__half arg1, __half arg2) { + return __heq(arg1, arg2); +} + +typedef unsigned int uint32_t; + +extern "C" __device__ int add_from_numba(uint32_t *result, uint32_t a, + uint32_t b) { + *result = a + b; + return 0; +} diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu b/numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu new file mode 100644 index 0000000..9e0a0ca --- /dev/null +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu @@ -0,0 +1,3 @@ +extern __device__ float undef(float a, float b); + +__global__ void f(float *r, float *a, float *b) { r[0] = undef(a[0], b[0]); } From 6327ec2752412667019df0c6da2dd99d7a2f9293 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 30 Sep 2024 12:07:27 -0700 Subject: [PATCH 26/52] fix bug --- numba_cuda/numba/cuda/cudadrv/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 225e0df..d33c716 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2632,7 +2632,7 @@ def new(cls, if linker is PyNvJitLinker: return linker(max_registers, lineinfo, cc, lto, additional_flags) - elif additional_flags is not None or lto is not None: + elif additional_flags is not None or lto is True: raise ValueError("LTO and additional flags require PyNvJitLinker") else: return linker(max_registers, lineinfo, cc) From c97767c7a0ac21341e981ef50165aeaa87dd23a7 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 30 Sep 2024 12:31:24 -0700 Subject: [PATCH 27/52] add a new ci job for testing with pynvjitlink --- .github/workflows/pr.yaml | 25 +++++++++++++++++++++++-- ci/test_conda.sh | 9 ++++++++- ci/test_wheel.sh | 9 ++++++++- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7498eb1..d4c54d5 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -54,9 +54,20 @@ jobs: uses: 
./.github/workflows/conda-python-tests.yaml with: build_type: pull-request - script: "ci/test_conda.sh" + script: "ci/test_conda.sh false" run_codecov: false matrix_filter: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} + test-conda-pynvjitlink: + needs: + - build-conda + - compute-matrix + uses: ./.github/workflows/conda-python-tests.yaml + with: + build_type: pull-request + script: "ci/test_conda.sh true" + run_codecov: false + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build-wheels: needs: - compute-matrix @@ -72,8 +83,18 @@ jobs: uses: ./.github/workflows/wheels-test.yaml with: build_type: pull-request - script: "ci/test_wheel.sh" + script: "ci/test_wheel.sh false" matrix_filter: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} + test-wheels-pynvjitlink: + needs: + - build-wheels + - compute-matrix + uses: ./.github/workflows/wheels-test.yaml + with: + build_type: pull-request + script: "ci/test_wheel.sh true" + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build-docs: needs: - build-conda diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 5861128..a932630 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -3,6 +3,8 @@ set -euo pipefail +USE_PYNVJITLINK = $1 + . /opt/conda/etc/profile.d/conda.sh rapids-logger "Install testing dependencies" @@ -42,8 +44,13 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e +if [ "$USE_PYNVJITLINK" == true ]; then + rapids-logger "Install pynvjitlink" + conda install -c rapidsai pynvjitlink +fi + rapids-logger "Run Tests" -python -m numba.runtests numba.cuda.tests -v +ENABLE_PYNVJITLINK=$USE_PYNVJITLINK python -m numba.runtests numba.cuda.tests -v popd diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 5ffd014..7121517 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -3,6 +3,8 @@ set -euo pipefail +USE_PYNVJITLINK = $1 + rapids-logger "Install testing dependencies" # TODO: Replace with rapids-dependency-file-generator python -m pip install \ @@ -10,6 +12,11 @@ python -m pip install \ cuda-python \ pytest +if [ "$USE_PYNVJITLINK" == true ]; then + rapids-logger "Install pynvjitlink" + python -m pip install pynvjitlink +fi + rapids-logger "Install wheel" package=$(realpath wheel/numba_cuda*.whl) echo "Package path: $package" @@ -26,6 +33,6 @@ rapids-logger "Show Numba system info" python -m numba --sysinfo rapids-logger "Run Tests" -python -m numba.runtests numba.cuda.tests -v +ENABLE_PYNVJITLINK=$USE_PYNVJITLINK python -m numba.runtests numba.cuda.tests -v popd From aa3aaf7af3668e71615ed58119c9d94480398430 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 30 Sep 2024 12:34:54 -0700 Subject: [PATCH 28/52] fixes --- .github/workflows/pr.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index d4c54d5..9b806ce 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -19,8 +19,10 @@ jobs: - compute-matrix - build-conda - test-conda + - test-conda-pynvjitlink - build-wheels - test-wheels + - test-wheels-pynvjitlink - build-docs secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 From 
f01c0d6bd630f08065f7f4d7c642d0c76ee5ab60 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 2 Oct 2024 10:36:28 -0700 Subject: [PATCH 29/52] more small fixes --- ci/test_conda.sh | 2 +- ci/test_wheel.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index a932630..eb423ab 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -3,7 +3,7 @@ set -euo pipefail -USE_PYNVJITLINK = $1 +USE_PYNVJITLINK=$1 . /opt/conda/etc/profile.d/conda.sh diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 7121517..734cedb 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -3,7 +3,7 @@ set -euo pipefail -USE_PYNVJITLINK = $1 +USE_PYNVJITLINK=$1 rapids-logger "Install testing dependencies" # TODO: Replace with rapids-dependency-file-generator From 519f0c15255286985033c7db78c32d7521a82595 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 06:27:56 -0700 Subject: [PATCH 30/52] update matrix filter --- .github/workflows/pr.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 5b7f790..820d51c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -69,7 +69,7 @@ jobs: script: "ci/test_conda.sh true" run_codecov: false # This selects "ARCH=amd64 + the latest supported Python + CUDA". - matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER|split(".")|map(tonumber)|.[0]) == 12)) | max_by(.PY_VER|split(".")|map(tonumber)) build-wheels: needs: - compute-matrix @@ -95,7 +95,7 @@ jobs: build_type: pull-request script: "ci/test_wheel.sh true" # This selects "ARCH=amd64 + the latest supported Python + CUDA". - matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER|split(".")|map(tonumber)|.[0]) == 12)) | max_by(.PY_VER|split(".")|map(tonumber)) build-docs: needs: - build-conda From 1201f1f670d98fc366c9943cab303e36c6487d88 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 06:57:39 -0700 Subject: [PATCH 31/52] simple filter --- .github/workflows/pr.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 820d51c..bbdba68 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -69,7 +69,8 @@ jobs: script: "ci/test_conda.sh true" run_codecov: false # This selects "ARCH=amd64 + the latest supported Python + CUDA". - matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER|split(".")|map(tonumber)|.[0]) == 12)) | max_by(.PY_VER|split(".")|map(tonumber)) + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5" and .PY_VER == "3.11")) + build-wheels: needs: - compute-matrix @@ -95,7 +96,8 @@ jobs: build_type: pull-request script: "ci/test_wheel.sh true" # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
- matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER|split(".")|map(tonumber)|.[0]) == 12)) | max_by(.PY_VER|split(".")|map(tonumber)) + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5" and .PY_VER == "3.11")) + build-docs: needs: - build-conda From 883d81741eb992de559dafa781906e7ecf583bf2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 06:57:57 -0700 Subject: [PATCH 32/52] clean --- .github/workflows/pr.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index bbdba68..1672742 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -70,7 +70,6 @@ jobs: run_codecov: false # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5" and .PY_VER == "3.11")) - build-wheels: needs: - compute-matrix @@ -97,7 +96,6 @@ jobs: script: "ci/test_wheel.sh true" # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5" and .PY_VER == "3.11")) - build-docs: needs: - build-conda From f41b931302aa43a99b59fd035b889e032fa22aa9 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 07:08:41 -0700 Subject: [PATCH 33/52] .1 --- .github/workflows/pr.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1672742..5afa741 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -69,7 +69,7 @@ jobs: script: "ci/test_conda.sh true" run_codecov: false # This selects "ARCH=amd64 + the latest supported Python + CUDA". - matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5" and .PY_VER == "3.11")) + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.11")) build-wheels: needs: - compute-matrix @@ -95,7 +95,7 @@ jobs: build_type: pull-request script: "ci/test_wheel.sh true" # This selects "ARCH=amd64 + the latest supported Python + CUDA". - matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5" and .PY_VER == "3.11")) + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.11")) build-docs: needs: - build-conda From e5aa41e9feb230a0d472edf1508eb11743cc3354 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 07:23:35 -0700 Subject: [PATCH 34/52] revert --- .github/workflows/pr.yaml | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 5afa741..a8a6d0a 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -19,10 +19,8 @@ jobs: - compute-matrix - build-conda - test-conda - - test-conda-pynvjitlink - build-wheels - test-wheels - - test-wheels-pynvjitlink - build-docs secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 @@ -59,17 +57,6 @@ jobs: script: "ci/test_conda.sh false" run_codecov: false matrix_filter: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} - test-conda-pynvjitlink: - needs: - - build-conda - - compute-matrix - uses: ./.github/workflows/conda-python-tests.yaml - with: - build_type: pull-request - script: "ci/test_conda.sh true" - run_codecov: false - # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
- matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.11")) build-wheels: needs: - compute-matrix @@ -85,17 +72,6 @@ jobs: with: build_type: pull-request script: "ci/test_wheel.sh false" - matrix_filter: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} - test-wheels-pynvjitlink: - needs: - - build-wheels - - compute-matrix - uses: ./.github/workflows/wheels-test.yaml - with: - build_type: pull-request - script: "ci/test_wheel.sh true" - # This selects "ARCH=amd64 + the latest supported Python + CUDA". - matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.11")) build-docs: needs: - build-conda From a0ea97d9618cda957cda0afad7f65ffbfcb5eb0f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 07:51:42 -0700 Subject: [PATCH 35/52] refactor --- .github/workflows/pr.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a8a6d0a..39066be 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -19,8 +19,10 @@ jobs: - compute-matrix - build-conda - test-conda + - test-conda-pynvjitlink - build-wheels - test-wheels + - test-wheels-pynvjitlink - build-docs secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 @@ -57,6 +59,16 @@ jobs: script: "ci/test_conda.sh false" run_codecov: false matrix_filter: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} + test-conda-pynvjitlink: + needs: + - build-conda + - compute-matrix + uses: ./.github/workflows/conda-python-tests.yaml + with: + build_type: pull-request + script: "ci/test_conda.sh true" + run_codecov: false + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12")) build-wheels: needs: - compute-matrix @@ -72,6 +84,14 @@ jobs: with: build_type: pull-request script: "ci/test_wheel.sh false" + test-wheels-pynvjitlink: + needs: + - build-wheels + uses: ./.github/workflows/wheels-test.yaml + with: + build_type: pull-request + script: "ci/test_wheel.sh true" + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12")) build-docs: needs: - build-conda From b979054b62a1e3d8165212330d09f1ca8b3f2b24 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 08:02:53 -0700 Subject: [PATCH 36/52] try and fix conda workflow --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 39066be..03650aa 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -68,7 +68,7 @@ jobs: build_type: pull-request script: "ci/test_conda.sh true" run_codecov: false - matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12")) + matrix_filter: echo ${{ needs.compute-matrix.outputs.TEST_MATRIX }} | jq 'map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12"))' build-wheels: needs: - compute-matrix From beb330125d18126ee1c9176052208e55df48fef1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 08:51:47 -0700 Subject: [PATCH 37/52] readenv boolify string values --- numba_cuda/numba/cuda/cudadrv/driver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index d33c716..44cdbee 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -64,6 +64,8 @@ def 
_readenv(name, ctor, default): if value is None: return default() if callable(default) else default try: + if ctor is bool: + return bool(value.lower() in {'1', "True"}) return ctor(value) except Exception: warnings.warn( @@ -81,7 +83,6 @@ def _readenv(name, ctor, default): "to be available" ) - ENABLE_PYNVJITLINK = ( _readenv("ENABLE_PYNVJITLINK", bool, False) or getattr(config, "ENABLE_PYNVJITLINK", None) From 3f5a86535083d64bf7cdfba745966fd132e8488e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 13:18:03 -0700 Subject: [PATCH 38/52] fix imports --- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py | 5 +++++ numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py index 31d142d..22e2ee8 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py @@ -1,5 +1,6 @@ import numpy as np import warnings +from numba.cuda.testing import unittest from numba.cuda.testing import (skip_on_cudasim, skip_if_cuda_includes_missing) from numba.cuda.testing import CUDATestCase, test_data_dir from numba.cuda.cudadrv.driver import (CudaAPIError, Linker, @@ -310,3 +311,7 @@ def test_get_local_mem_per_specialized(self): local_mem_size = compiled_specialized.get_local_mem_per_thread() calc_size = np.dtype(np.float64).itemsize * LMEM_SIZE self.assertGreaterEqual(local_mem_size, calc_size) + + +if __name__ == '__main__': + unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index e528e7f..4145e31 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -4,7 +4,6 @@ from numba.cuda.cudadrv.driver import PyNvJitLinker import itertools -from pynvjitlink.api import NvJitLinkError from numba.cuda import get_current_device from numba import cuda from numba import config @@ -12,6 +11,7 @@ HAVE_PYNVJITLINK = False try: import pynvjitlink # noqa: F401 + from pynvjitlink.api import NvJitLinkError HAVE_PYNVJITLINK = True except ImportError: From 4770c409ef81c768148c14ac55c48bc7d5e6444c Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 17:08:55 -0700 Subject: [PATCH 39/52] update --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 03650aa..39066be 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -68,7 +68,7 @@ jobs: build_type: pull-request script: "ci/test_conda.sh true" run_codecov: false - matrix_filter: echo ${{ needs.compute-matrix.outputs.TEST_MATRIX }} | jq 'map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12"))' + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12")) build-wheels: needs: - compute-matrix From 2820ee6e313f76ead0222feb6e57bd1f0a7971ae Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Oct 2024 17:42:21 -0700 Subject: [PATCH 40/52] use local workflow matrix --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 39066be..0d6f948 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -68,7 +68,7 @@ jobs: build_type: pull-request script: "ci/test_conda.sh true" 
run_codecov: false - matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12")) + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.11")) build-wheels: needs: - compute-matrix From dc20cced941311967a803f62d450969837143917 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 4 Oct 2024 10:58:58 -0700 Subject: [PATCH 41/52] cu12 suffix --- ci/test_wheel.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 734cedb..afdc8ec 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -14,7 +14,7 @@ python -m pip install \ if [ "$USE_PYNVJITLINK" == true ]; then rapids-logger "Install pynvjitlink" - python -m pip install pynvjitlink + python -m pip install pynvjitlink-cu12 fi rapids-logger "Install wheel" From ff18c5c77662a09f7713b3eeb263863098e3e226 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Mon, 7 Oct 2024 09:56:44 -0500 Subject: [PATCH 42/52] Update ci/test_conda.sh Co-authored-by: Graham Markall <535640+gmarkall@users.noreply.github.com> --- ci/test_conda.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 0a23461..ccdff4e 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -51,7 +51,9 @@ set +e if [ "$USE_PYNVJITLINK" == true ]; then rapids-logger "Install pynvjitlink" + set +u conda install -c rapidsai pynvjitlink + set -u fi rapids-logger "Run Tests" From b2f4245e99d3fc84114f06539ca984a6dde93f24 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Oct 2024 08:37:55 -0700 Subject: [PATCH 43/52] small updates --- numba_cuda/numba/cuda/cudadrv/driver.py | 4 +--- .../numba/cuda/tests/cudadrv/test_nvjitlink.py | 16 +++------------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 44cdbee..5a3a55a 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -10,7 +10,6 @@ system to freeze in some cases. 
""" - import sys import os import ctypes @@ -85,12 +84,11 @@ def _readenv(name, ctor, default): ENABLE_PYNVJITLINK = ( _readenv("ENABLE_PYNVJITLINK", bool, False) - or getattr(config, "ENABLE_PYNVJITLINK", None) + or getattr(config, "ENABLE_PYNVJITLINK", False) ) if not hasattr(config, "ENABLE_PYNVJITLINK"): config.ENABLE_PYNVJITLINK = ENABLE_PYNVJITLINK - if ENABLE_PYNVJITLINK: try: from pynvjitlink.api import NvJitLinker, NvJitLinkError diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index 4145e31..7aa1b03 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -8,17 +8,8 @@ from numba import cuda from numba import config -HAVE_PYNVJITLINK = False -try: - import pynvjitlink # noqa: F401 - from pynvjitlink.api import NvJitLinkError - HAVE_PYNVJITLINK = True -except ImportError: - pass - - -@unittest.skipIf(not HAVE_PYNVJITLINK, "pynvjitlink not available") +@unittest.skipIf(config.ENABLE_PYNVJITLINK, "pynvjitlink not enabled") @skip_on_cudasim("Linking unsupported in the simulator") class TestLinker(CUDATestCase): _NUMBA_NVIDIA_BINDING_0_ENV = {"NUMBA_CUDA_USE_NVIDIA_BINDING": "0"} @@ -35,6 +26,8 @@ def test_nvjitlink_create_no_cc_error(self): PyNvJitLinker() def test_nvjitlink_invalid_arch_error(self): + from pynvjitlink.api import NvJitLinkError + # CC 0.0 is not a valid compute capability with self.assertRaisesRegex( NvJitLinkError, "NVJITLINK_ERROR_UNRECOGNIZED_OPTION error" @@ -126,7 +119,6 @@ def test_nvjitlink_test_add_file_guess_ext_invalid_input(self): # because there's no way to know what kind of file to treat it as patched_linker.add_file_guess_ext(content) - @unittest.skipIf(not HAVE_PYNVJITLINK, "pynvjitlink not available") def test_nvjitlink_jit_with_linkable_code(self): files = ( "test_device_functions.a", @@ -138,8 +130,6 @@ def test_nvjitlink_jit_with_linkable_code(self): ) for file in files: with self.subTest(file=file): - # TODO: unsafe teardown if test errors - config.ENABLE_PYNVJITLINK = True sig = "uint32(uint32, uint32)" add_from_numba = cuda.declare_device("add_from_numba", sig) From f40d3ed5a82b2a0aa51d79d5f17414c34a68379a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Oct 2024 13:22:28 -0700 Subject: [PATCH 44/52] fix logic :) --- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index 7aa1b03..79b3e71 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -9,7 +9,7 @@ from numba import config -@unittest.skipIf(config.ENABLE_PYNVJITLINK, "pynvjitlink not enabled") +@unittest.skipIf(not config.ENABLE_PYNVJITLINK, "pynvjitlink not enabled") @skip_on_cudasim("Linking unsupported in the simulator") class TestLinker(CUDATestCase): _NUMBA_NVIDIA_BINDING_0_ENV = {"NUMBA_CUDA_USE_NVIDIA_BINDING": "0"} From c24ec67a62bd8ef4f68b6563db3c5fe81d09c69b Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Oct 2024 14:28:33 -0700 Subject: [PATCH 45/52] try hardcoding ENABLE_PYNVJITLINK --- ci/test_conda.sh | 2 +- ci/test_wheel.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index ccdff4e..83ca848 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -57,7 +57,7 @@ if [ "$USE_PYNVJITLINK" == true ]; 
then fi rapids-logger "Run Tests" -ENABLE_PYNVJITLINK=$USE_PYNVJITLINK python -m numba.runtests numba.cuda.tests -v +ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v popd diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index afdc8ec..a9f346c 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -33,6 +33,6 @@ rapids-logger "Show Numba system info" python -m numba --sysinfo rapids-logger "Run Tests" -ENABLE_PYNVJITLINK=$USE_PYNVJITLINK python -m numba.runtests numba.cuda.tests -v +ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v popd From dccd6dbe58ae8e78abccab03e0d17a065a8de810 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Oct 2024 18:24:14 -0700 Subject: [PATCH 46/52] fix passing of ENABLE_PYNVJITLINK --- ci/test_conda.sh | 6 +++++- ci/test_wheel.sh | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 83ca848..4002e67 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -3,7 +3,11 @@ set -euo pipefail -USE_PYNVJITLINK=$1 +if [ "$1" == "true" ]; then + USE_PYNVJITLINK=true +else + USE_PYNVJITLINK=false +fi . /opt/conda/etc/profile.d/conda.sh diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index a9f346c..b6ee2d1 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -3,7 +3,11 @@ set -euo pipefail -USE_PYNVJITLINK=$1 +if [ "$1" == "true" ]; then + USE_PYNVJITLINK=true +else + USE_PYNVJITLINK=false +fi rapids-logger "Install testing dependencies" # TODO: Replace with rapids-dependency-file-generator From 9aaa21f80e0bf2f29709e2c7f9ea247c1dd89080 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Oct 2024 08:23:41 -0700 Subject: [PATCH 47/52] ship makefile, find and build tests --- ci/build_tests.sh | 14 ++++++++++++++ ci/test_conda.sh | 1 + ci/test_wheel.sh | 1 + pyproject.toml | 2 +- 4 files changed, 17 insertions(+), 1 deletion(-) create mode 100755 ci/build_tests.sh diff --git a/ci/build_tests.sh b/ci/build_tests.sh new file mode 100755 index 0000000..8bf6fe3 --- /dev/null +++ b/ci/build_tests.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION + +PY_SCRIPT=" +import numba_cuda +root = numba_cuda.__file__.rstrip('__init__.py') +test_dir = root + numba/cuda/tests/test_binary_generation/ +print(test_dir) +" + +TEST_DIR=$(python -c "$PY_SCRIPT") +pushd $TEST_DIR +make +popd diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 4002e67..200e91f 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -58,6 +58,7 @@ if [ "$USE_PYNVJITLINK" == true ]; then set +u conda install -c rapidsai pynvjitlink set -u + sh build_tests.sh fi rapids-logger "Run Tests" diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index b6ee2d1..a69b801 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -19,6 +19,7 @@ python -m pip install \ if [ "$USE_PYNVJITLINK" == true ]; then rapids-logger "Install pynvjitlink" python -m pip install pynvjitlink-cu12 + sh build_tests.sh fi rapids-logger "Install wheel" diff --git a/pyproject.toml b/pyproject.toml index 4f2291b..e7aefdb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,4 +36,4 @@ license-files = ["LICENSE"] include = ["numba_cuda*"] [tool.setuptools.package-data] -"*" = ["*.cu", "*.h", "*.hpp", "*.ptx", "VERSION"] +"*" = ["*.cu", "*.h", "*.hpp", "*.ptx", "VERSION", "Makefile"] From d3ca53ccd381b25d60ac072fa91865fc5d3564cc Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Oct 2024 08:33:01 -0700 Subject: [PATCH 48/52] minor fix --- ci/build_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/ci/build_tests.sh b/ci/build_tests.sh index 8bf6fe3..9d0f65d 100755 --- a/ci/build_tests.sh +++ b/ci/build_tests.sh @@ -4,7 +4,7 @@ PY_SCRIPT=" import numba_cuda root = numba_cuda.__file__.rstrip('__init__.py') -test_dir = root + numba/cuda/tests/test_binary_generation/ +test_dir = root + \"numba/cuda/tests/test_binary_generation/\" print(test_dir) " From 4ce95a73038868e75e8215bbd82d81a4390d23e2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Oct 2024 11:14:03 -0700 Subject: [PATCH 49/52] bifurcate pynvjitlink test scripts --- .github/workflows/pr.yaml | 6 +-- ci/build_tests.sh | 14 ------- ci/test_conda.sh | 16 +------- ci/test_conda_pynvjitlink.sh | 77 ++++++++++++++++++++++++++++++++++++ ci/test_wheel.sh | 14 +------ ci/test_wheel_pynvjitlink.sh | 47 ++++++++++++++++++++++ 6 files changed, 129 insertions(+), 45 deletions(-) delete mode 100755 ci/build_tests.sh create mode 100755 ci/test_conda_pynvjitlink.sh create mode 100755 ci/test_wheel_pynvjitlink.sh diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 0d6f948..8ef0c4e 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -56,7 +56,7 @@ jobs: uses: ./.github/workflows/conda-python-tests.yaml with: build_type: pull-request - script: "ci/test_conda.sh false" + script: "ci/test_conda.sh" run_codecov: false matrix_filter: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} test-conda-pynvjitlink: @@ -66,7 +66,7 @@ jobs: uses: ./.github/workflows/conda-python-tests.yaml with: build_type: pull-request - script: "ci/test_conda.sh true" + script: "ci/test_conda_pynvjitlink.sh" run_codecov: false matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.11")) build-wheels: @@ -90,7 +90,7 @@ jobs: uses: ./.github/workflows/wheels-test.yaml with: build_type: pull-request - script: "ci/test_wheel.sh true" + script: "ci/test_wheel_pynvjitlink.sh" matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12")) build-docs: needs: diff --git a/ci/build_tests.sh b/ci/build_tests.sh deleted file mode 100755 index 9d0f65d..0000000 --- a/ci/build_tests.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION - -PY_SCRIPT=" -import numba_cuda -root = numba_cuda.__file__.rstrip('__init__.py') -test_dir = root + \"numba/cuda/tests/test_binary_generation/\" -print(test_dir) -" - -TEST_DIR=$(python -c "$PY_SCRIPT") -pushd $TEST_DIR -make -popd diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 200e91f..d6e8bc0 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -3,12 +3,6 @@ set -euo pipefail -if [ "$1" == "true" ]; then - USE_PYNVJITLINK=true -else - USE_PYNVJITLINK=false -fi - . /opt/conda/etc/profile.d/conda.sh if [ "${CUDA_VER%.*.*}" = "11" ]; then @@ -53,16 +47,8 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e -if [ "$USE_PYNVJITLINK" == true ]; then - rapids-logger "Install pynvjitlink" - set +u - conda install -c rapidsai pynvjitlink - set -u - sh build_tests.sh -fi - rapids-logger "Run Tests" -ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v +python -m numba.runtests numba.cuda.tests -v popd diff --git a/ci/test_conda_pynvjitlink.sh b/ci/test_conda_pynvjitlink.sh new file mode 100755 index 0000000..a0ea90f --- /dev/null +++ b/ci/test_conda_pynvjitlink.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION + +set -euo pipefail + +. 
/opt/conda/etc/profile.d/conda.sh + +if [ "${CUDA_VER%.*.*}" = "11" ]; then + CTK_PACKAGES="cudatoolkit" +else + CTK_PACKAGES="cuda-nvcc-impl cuda-nvrtc" +fi + +rapids-logger "Install testing dependencies" +# TODO: Replace with rapids-dependency-file-generator +rapids-mamba-retry create -n test \ + c-compiler \ + cxx-compiler \ + ${CTK_PACKAGES} \ + cuda-python \ + cuda-version=${CUDA_VER%.*} \ + make \ + psutil \ + pytest \ + python=${RAPIDS_PY_VERSION} + +# Temporarily allow unbound variables for conda activation. +set +u +conda activate test +set -u + +rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda + +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ +mkdir -p "${RAPIDS_TESTS_DIR}" +pushd "${RAPIDS_TESTS_DIR}" + +rapids-print-env + +rapids-logger "Check GPU usage" +nvidia-smi + +rapids-logger "Show Numba system info" +python -m numba --sysinfo + +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + + +rapids-logger "Install pynvjitlink" +set +u +conda install -c rapidsai pynvjitlink +set -u + +rapids_logger "Build tests" + +PY_SCRIPT=" +import numba_cuda +root = numba_cuda.__file__.rstrip('__init__.py') +test_dir = root + \"numba/cuda/tests/test_binary_generation/\" +print(test_dir) +" + +TEST_DIR=$(python -c "$PY_SCRIPT") +pushd $TEST_DIR +make +popd + + +rapids-logger "Run Tests" +ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v + +popd + +rapids-logger "Test script exiting with value: $EXITCODE" +exit ${EXITCODE} diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index a69b801..5ffd014 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -3,12 +3,6 @@ set -euo pipefail -if [ "$1" == "true" ]; then - USE_PYNVJITLINK=true -else - USE_PYNVJITLINK=false -fi - rapids-logger "Install testing dependencies" # TODO: Replace with rapids-dependency-file-generator python -m pip install \ @@ -16,12 +10,6 @@ python -m pip install \ cuda-python \ pytest -if [ "$USE_PYNVJITLINK" == true ]; then - rapids-logger "Install pynvjitlink" - python -m pip install pynvjitlink-cu12 - sh build_tests.sh -fi - rapids-logger "Install wheel" package=$(realpath wheel/numba_cuda*.whl) echo "Package path: $package" @@ -38,6 +26,6 @@ rapids-logger "Show Numba system info" python -m numba --sysinfo rapids-logger "Run Tests" -ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v +python -m numba.runtests numba.cuda.tests -v popd diff --git a/ci/test_wheel_pynvjitlink.sh b/ci/test_wheel_pynvjitlink.sh new file mode 100755 index 0000000..4d3bb57 --- /dev/null +++ b/ci/test_wheel_pynvjitlink.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright (c) 2023-2024, NVIDIA CORPORATION + +set -euo pipefail + +rapids-logger "Install testing dependencies" +# TODO: Replace with rapids-dependency-file-generator +python -m pip install \ + psutil \ + cuda-python \ + pytest + +rapids-logger "Install pynvjitlink" +python -m pip install pynvjitlink-cu12 + +rapids-logger "Build tests" +PY_SCRIPT=" +import numba_cuda +root = numba_cuda.__file__.rstrip('__init__.py') +test_dir = root + \"numba/cuda/tests/test_binary_generation/\" +print(test_dir) +" + +TEST_DIR=$(python -c "$PY_SCRIPT") +pushd $TEST_DIR +make +popd + +rapids-logger "Install wheel" +package=$(realpath wheel/numba_cuda*.whl) +echo "Package path: $package" +python -m pip install $package + +rapids-logger "Check GPU usage" +nvidia-smi + +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ +mkdir -p "${RAPIDS_TESTS_DIR}" +pushd "${RAPIDS_TESTS_DIR}" + +rapids-logger "Show Numba system info" +python -m numba --sysinfo + +rapids-logger 
"Run Tests" +ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v + +popd From d65f80dcbb9942eb1c4d2faee7ec4661c72d3fd2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Oct 2024 11:49:05 -0700 Subject: [PATCH 50/52] pass test bin dir as an env var --- ci/test_conda_pynvjitlink.sh | 8 +-- ci/test_wheel_pynvjitlink.sh | 6 +- .../cuda/tests/cudadrv/test_nvjitlink.py | 61 +++++++++++++------ .../tests/test_binary_generation/Makefile | 2 +- 4 files changed, 52 insertions(+), 25 deletions(-) diff --git a/ci/test_conda_pynvjitlink.sh b/ci/test_conda_pynvjitlink.sh index a0ea90f..ca3bef2 100755 --- a/ci/test_conda_pynvjitlink.sh +++ b/ci/test_conda_pynvjitlink.sh @@ -53,7 +53,7 @@ set +u conda install -c rapidsai pynvjitlink set -u -rapids_logger "Build tests" +rapids-logger "Build tests" PY_SCRIPT=" import numba_cuda @@ -62,14 +62,14 @@ test_dir = root + \"numba/cuda/tests/test_binary_generation/\" print(test_dir) " -TEST_DIR=$(python -c "$PY_SCRIPT") -pushd $TEST_DIR +NUMBA_CUDA_TEST_BIN_DIR=$(python -c "$PY_SCRIPT") +pushd $NUMBA_CUDA_TEST_BIN_DIR make popd rapids-logger "Run Tests" -ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v +ENABLE_PYNVJITLINK=1 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m numba.runtests numba.cuda.tests -v popd diff --git a/ci/test_wheel_pynvjitlink.sh b/ci/test_wheel_pynvjitlink.sh index 4d3bb57..0838bda 100755 --- a/ci/test_wheel_pynvjitlink.sh +++ b/ci/test_wheel_pynvjitlink.sh @@ -21,8 +21,8 @@ test_dir = root + \"numba/cuda/tests/test_binary_generation/\" print(test_dir) " -TEST_DIR=$(python -c "$PY_SCRIPT") -pushd $TEST_DIR +NUMBA_CUDA_TEST_BIN_DIR=$(python -c "$PY_SCRIPT") +pushd $NUMBA_CUDA_TEST_BIN_DIR make popd @@ -42,6 +42,6 @@ rapids-logger "Show Numba system info" python -m numba --sysinfo rapids-logger "Run Tests" -ENABLE_PYNVJITLINK=1 python -m numba.runtests numba.cuda.tests -v +ENABLE_PYNVJITLINK=1 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m numba.runtests numba.cuda.tests -v popd diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index 79b3e71..36c216a 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -4,12 +4,39 @@ from numba.cuda.cudadrv.driver import PyNvJitLinker import itertools +import os from numba.cuda import get_current_device from numba import cuda from numba import config - -@unittest.skipIf(not config.ENABLE_PYNVJITLINK, "pynvjitlink not enabled") +TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR") +test_device_functions_a = os.path.join( + TEST_BIN_DIR, "test_device_functions.a" +) +test_device_functions_cubin = os.path.join( + TEST_BIN_DIR, "test_device_functions.cubin" +) +test_device_functions_cu = os.path.join( + TEST_BIN_DIR, "test_device_functions.cu" +) +test_device_functions_fatbin = os.path.join( + TEST_BIN_DIR, "test_device_functions.fatbin" +) +test_device_functions_o = os.path.join( + TEST_BIN_DIR, "test_device_functions.o" +) +test_device_functions_ptx = os.path.join( + TEST_BIN_DIR, "test_device_functions.ptx" +) +test_device_functions_ltoir = os.path.join( + TEST_BIN_DIR, "test_device_functions.ltoir" +) + + +@unittest.skipIf( + not config.ENABLE_PYNVJITLINK or not TEST_BIN_DIR, + "pynvjitlink not enabled" +) @skip_on_cudasim("Linking unsupported in the simulator") class TestLinker(CUDATestCase): _NUMBA_NVIDIA_BINDING_0_ENV = {"NUMBA_CUDA_USE_NVIDIA_BINDING": "0"} @@ -91,12 +118,12 @@ def 
test_nvjitlink_ptx_compile_options(self): def test_nvjitlink_add_file_guess_ext_linkable_code(self): files = ( - "test_device_functions.a", - "test_device_functions.cubin", - "test_device_functions.cu", - "test_device_functions.fatbin", - "test_device_functions.o", - "test_device_functions.ptx", + test_device_functions_a, + test_device_functions_cubin, + test_device_functions_cu, + test_device_functions_fatbin, + test_device_functions_o, + test_device_functions_ptx, ) for file in files: with self.subTest(file=file): @@ -106,7 +133,7 @@ def test_nvjitlink_add_file_guess_ext_linkable_code(self): patched_linker.add_file_guess_ext(file) def test_nvjitlink_test_add_file_guess_ext_invalid_input(self): - with open("test_device_functions.cubin", "rb") as f: + with open(test_device_functions_cubin, "rb") as f: content = f.read() patched_linker = PyNvJitLinker( @@ -121,12 +148,12 @@ def test_nvjitlink_test_add_file_guess_ext_invalid_input(self): def test_nvjitlink_jit_with_linkable_code(self): files = ( - "test_device_functions.a", - "test_device_functions.cubin", - "test_device_functions.cu", - "test_device_functions.fatbin", - "test_device_functions.o", - "test_device_functions.ptx", + test_device_functions_a, + test_device_functions_cubin, + test_device_functions_cu, + test_device_functions_fatbin, + test_device_functions_o, + test_device_functions_ptx, ) for file in files: with self.subTest(file=file): @@ -142,7 +169,7 @@ def kernel(result): assert result[0] == 3 def test_nvjitlink_jit_with_linkable_code_lto(self): - file = "test_device_functions.ltoir" + file = test_device_functions_ltoir sig = "uint32(uint32, uint32)" add_from_numba = cuda.declare_device("add_from_numba", sig) @@ -156,7 +183,7 @@ def kernel(result): assert result[0] == 3 def test_nvjitlink_jit_with_invalid_linkable_code(self): - with open("test_device_functions.cubin", "rb") as f: + with open(test_device_functions_cubin, "rb") as f: content = f.read() with self.assertRaisesRegex( TypeError, "Expected path to file or a LinkableCode" diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile index adde5e3..31a91b2 100644 --- a/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile @@ -32,7 +32,7 @@ LIBRARY_FLAGS := $(GENCODE) -lib FATBIN_FLAGS := $(FATBIN_GENCODE) --fatbin LTOIR_FLAGS := $(LTOIR_GENCODE) -dc -OUTPUT_DIR := ../cudadrv +OUTPUT_DIR := ./ all: @echo "GPU CC: $(GPU_CC)" From 5097bcf58715d30bbf3de77c858f34ef96f5bd3f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Oct 2024 12:13:29 -0700 Subject: [PATCH 51/52] more minor fixes --- .../cuda/tests/cudadrv/test_nvjitlink.py | 43 ++++++++++--------- .../tests/test_binary_generation/Makefile | 4 +- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index 36c216a..e9ff671 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -10,27 +10,28 @@ from numba import config TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR") -test_device_functions_a = os.path.join( - TEST_BIN_DIR, "test_device_functions.a" -) -test_device_functions_cubin = os.path.join( - TEST_BIN_DIR, "test_device_functions.cubin" -) -test_device_functions_cu = os.path.join( - TEST_BIN_DIR, "test_device_functions.cu" -) -test_device_functions_fatbin = 
os.path.join( - TEST_BIN_DIR, "test_device_functions.fatbin" -) -test_device_functions_o = os.path.join( - TEST_BIN_DIR, "test_device_functions.o" -) -test_device_functions_ptx = os.path.join( - TEST_BIN_DIR, "test_device_functions.ptx" -) -test_device_functions_ltoir = os.path.join( - TEST_BIN_DIR, "test_device_functions.ltoir" -) +if TEST_BIN_DIR: + test_device_functions_a = os.path.join( + TEST_BIN_DIR, "test_device_functions.a" + ) + test_device_functions_cubin = os.path.join( + TEST_BIN_DIR, "test_device_functions.cubin" + ) + test_device_functions_cu = os.path.join( + TEST_BIN_DIR, "test_device_functions.cu" + ) + test_device_functions_fatbin = os.path.join( + TEST_BIN_DIR, "test_device_functions.fatbin" + ) + test_device_functions_o = os.path.join( + TEST_BIN_DIR, "test_device_functions.o" + ) + test_device_functions_ptx = os.path.join( + TEST_BIN_DIR, "test_device_functions.ptx" + ) + test_device_functions_ltoir = os.path.join( + TEST_BIN_DIR, "test_device_functions.ltoir" + ) @unittest.skipIf( diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile index 31a91b2..3985b4e 100644 --- a/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile @@ -49,6 +49,4 @@ all: nvcc $(NVCC_FLAGS) $(LTOIR_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.ltoir.o test_device_functions.cu # Generate LTO-IR in a "raw" LTO-IR container python generate_raw_ltoir.py --arch sm_$(GPU_CC) -o $(OUTPUT_DIR)/test_device_functions.ltoir test_device_functions.cu - # We also want to test linking a .cu file; this needs no compilation, - # so copy it instead - cp test_device_functions.cu $(OUTPUT_DIR) + From e29744cbb834b6a85bbd582a594d396aca5e359d Mon Sep 17 00:00:00 2001 From: Graham Markall <535640+gmarkall@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:05:08 +0100 Subject: [PATCH 52/52] Retry installation of pynvjitlink --- ci/test_conda_pynvjitlink.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/test_conda_pynvjitlink.sh b/ci/test_conda_pynvjitlink.sh index ca3bef2..71e41e4 100755 --- a/ci/test_conda_pynvjitlink.sh +++ b/ci/test_conda_pynvjitlink.sh @@ -50,7 +50,7 @@ set +e rapids-logger "Install pynvjitlink" set +u -conda install -c rapidsai pynvjitlink +rapids-mamba-retry install -c rapidsai pynvjitlink set -u rapids-logger "Build tests"
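Taken together, the binary-generation Makefile and the CI scripts above define the environment the pynvjitlink tests expect: the test inputs must be built first, and both `ENABLE_PYNVJITLINK` and `NUMBA_CUDA_TEST_BIN_DIR` must be set when the suite runs. A hedged sketch of reproducing that flow locally, assuming a source checkout with pynvjitlink installed and nvcc on PATH (the checkout-relative path is illustrative; the CI scripts compute it from the installed package instead):

    # Build the test binaries (cubin/fatbin/ptx/object/library/LTO-IR inputs)
    cd numba_cuda/numba/cuda/tests/test_binary_generation
    make
    cd -

    # Point the tests at the generated binaries and enable the pynvjitlink path
    ENABLE_PYNVJITLINK=1 \
    NUMBA_CUDA_TEST_BIN_DIR="$PWD/numba_cuda/numba/cuda/tests/test_binary_generation" \
    python -m numba.runtests numba.cuda.tests -v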