cuDF numba cuda 12 updates #13337

Merged
42 commits
76109ce
move functions, use config option to enable mvc, do so before importi…
brandon-b-miller May 11, 2023
de2b678
move more of numbas setup to _numba_setup
brandon-b-miller May 11, 2023
442fefc
update comment in __init__
brandon-b-miller May 11, 2023
f5f915d
add a few docs
brandon-b-miller May 11, 2023
19dd82c
add a debug statement for now
brandon-b-miller May 11, 2023
d360008
only raise in cec mode
brandon-b-miller May 11, 2023
9c76c61
try bumping to numba 0.57
brandon-b-miller May 11, 2023
950f98f
conditionally import ptxcompiler
brandon-b-miller May 12, 2023
c8142ea
update comments a bit
brandon-b-miller May 15, 2023
8c7bae8
Apply suggestions from code review
brandon-b-miller May 15, 2023
c4edd0e
Merge branch 'cudf-numba-cuda12-updates' of github.com:brandon-b-mill…
brandon-b-miller May 15, 2023
b8d290d
_numba_setup -> _setup_numba
brandon-b-miller May 15, 2023
a50c642
address more reviews
brandon-b-miller May 15, 2023
6ba957c
Merge branch 'branch-23.06' into cudf-numba-cuda12-updates
brandon-b-miller May 16, 2023
96b6f01
use a context manager to squash occupancy warnings for numba kernels
brandon-b-miller May 16, 2023
47d8a2e
revert numba upgrade
brandon-b-miller May 17, 2023
66226c6
Merge branch 'branch-23.06' into cudf-numba-cuda12-updates
brandon-b-miller May 17, 2023
b9634f9
adjust logic, introduce runtime check in apply/groupby udfs
brandon-b-miller May 17, 2023
7a594b3
Address reviews
brandon-b-miller May 17, 2023
cf642d0
partially address reviews
brandon-b-miller May 18, 2023
e7b49e9
merge latest and resolve conflicts
brandon-b-miller May 19, 2023
cb5a756
Revert "revert numba upgrade"
brandon-b-miller May 19, 2023
dcc73e1
_setup_numba.py -> _numba.py, CUDFNumbaConfig -> _CUDFNumbaConfig
brandon-b-miller May 19, 2023
053193a
try vendoring some ptxcompiler code
brandon-b-miller May 19, 2023
c2285fa
add the comment about the MVC config option and numba.cuda imports ba…
brandon-b-miller May 19, 2023
b72eef0
fix imports
brandon-b-miller May 19, 2023
bd27a2f
switch error
brandon-b-miller May 19, 2023
8c9c070
slightly adjust logic
brandon-b-miller May 19, 2023
662b30b
add missing return
brandon-b-miller May 22, 2023
93af613
shuffle imports
brandon-b-miller May 22, 2023
2ff5c5d
delete explicit runtime check for MVC in cuda 12+ as it's needed more…
brandon-b-miller May 22, 2023
5cb0ce6
attempt a simplifying change
brandon-b-miller May 23, 2023
fc69663
update ptx/ctk version mapping table
brandon-b-miller May 23, 2023
0f1079c
merge latest and resolve conflicts
brandon-b-miller May 23, 2023
0797cde
fix local imports
brandon-b-miller May 23, 2023
e799992
remove extraneous testing code
brandon-b-miller May 23, 2023
41e92a9
Apply suggestions from code review
brandon-b-miller May 23, 2023
8839f8c
cleanup
brandon-b-miller May 23, 2023
c27a4b1
clarify cuda 12 comments
brandon-b-miller May 23, 2023
6925612
version map changes
brandon-b-miller May 23, 2023
439a667
remove function from ptxcompiler that is not used
brandon-b-miller May 23, 2023
1bfb382
address remaining reviews
brandon-b-miller May 23, 2023
26 changes: 4 additions & 22 deletions python/cudf/cudf/__init__.py
@@ -5,7 +5,7 @@
validate_setup()

import cupy
from numba import config as numba_config, cuda
from numba import config as numba_config

import rmm
from rmm.allocators.cupy import rmm_cupy_allocator
@@ -80,35 +80,17 @@
read_text,
)
from cudf.options import describe_option, get_option, set_option
from cudf.utils._setup_numba import _setup_numba
from cudf.utils.dtypes import _NA_REP
from cudf.utils.utils import clear_cache, set_allocator

try:
from cubinlinker.patch import patch_numba_linker_if_needed
except ImportError:
pass
else:
# Patch Numba to support CUDA enhanced compatibility.
# cuDF requires a stronger set of conditions than what is
# checked by patch_numba_linker_if_needed due to the PTX
# files needed for JIT Groupby Apply and string UDFs
from cudf.core.udf.utils import _PTX_FILE, _setup_numba_linker
_setup_numba()
Contributor:
Any magic in the import order is usually good to explicitly describe for readers.

Suggested change:
-    _setup_numba()
+    # Must be called before importing numba.cuda.
+    _setup_numba()

Is it dangerous / invalid if a user imports numba.cuda in their own code before importing cudf?

  • If so, will users clearly understand the problem (i.e. is a helpful error raised)?
  • If not, why is this special import ordering needed? (Could it be written above, with the other numba import?)
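
To make the concern concrete, here is a hypothetical user script showing the ordering in question (the script and its comments are illustrative, not part of the PR):

    # Hypothetical user workflow: numba.cuda is imported, and Numba's CUDA
    # state potentially initialized, before cudf can run _setup_numba().
    import numba.cuda  # noqa: F401

    import cudf  # _setup_numba() now runs after numba.cuda was imported

    print(cudf.Series([1, 2, 3]).sum())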

Contributor:
My interpretation of @brandon-b-miller's response here was that in fact there is no longer an explicit ordering issue because we are using the numba config API rather than an environment variable. If there is still an ordering issue then I remain confused about my question in the other thread because it seems like we would be breaking the ordering there.

Contributor Author (@brandon-b-miller, May 18, 2023):

There's only an ordering issue if we use the config option from numba 0.57. I believe the linker patching approach (which is taken in this PR at this point) is not perturbed by when numba.cuda is imported.

Contributor Author (@brandon-b-miller):

So this is still an item of concern, because we've pivoted back to putting numba 0.57 into this PR. Currently the call to _setup_numba is placed very early in cudf's __init__, even before validate_setup() is called, since validate_setup() calls rmm._gpu.runtimeGetVersion(), which imports numba.cuda as a side effect.

The order of imports does matter here. In addition, there's @bdice's observation that a user importing numba.cuda somewhere in their workflow before cudf is likely to break cuDF. So this is certainly an issue we need to reach consensus on before merging.

One option that could work is detecting this case dynamically and issuing a warning instructing the user to set CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY=1 in their environment, which would set the variable globally before any import.
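
A minimal sketch of the dynamic detection floated above, assuming a hypothetical helper name (_warn_if_numba_cuda_imported) and warning text that are not code from this PR:

    import os
    import sys
    import warnings

    def _warn_if_numba_cuda_imported():
        # Hypothetical check: if numba.cuda is already in sys.modules when
        # cudf is imported, the MVC setup done by _setup_numba() may come
        # too late, so point the user at the environment variable instead.
        if "numba.cuda" in sys.modules and not os.environ.get(
            "CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY"
        ):
            warnings.warn(
                "numba.cuda was imported before cudf; minor version "
                "compatibility may not be enabled. Set "
                "CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY=1 before "
                "starting Python."
            )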


_setup_numba_linker(_PTX_FILE)

del patch_numba_linker_if_needed
from numba import cuda

cuda.set_memory_manager(RMMNumbaManager)
cupy.cuda.set_allocator(rmm_cupy_allocator)

try:
# Numba 0.54: Disable low occupancy warnings
numba_config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
except AttributeError:
# Numba < 0.54: No occupancy warnings
pass
del numba_config


rmm.register_reinitialize_hook(clear_cache)

4 changes: 3 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
@@ -68,6 +68,7 @@
_return_arr_from_dtype,
)
from cudf.utils import docutils
from cudf.utils._setup_numba import CUDFNumbaConfig
from cudf.utils.utils import _cudf_nvtx_annotate

doc_reset_index_template = """
@@ -2193,7 +2194,8 @@ def _apply(self, func, kernel_getter, *args, **kwargs):
input_args = _get_input_args_from_frame(self)
launch_args = output_args + input_args + list(args)
try:
kernel.forall(len(self))(*launch_args)
with CUDFNumbaConfig():
kernel.forall(len(self))(*launch_args)
except Exception as e:
raise RuntimeError("UDF kernel execution failed.") from e
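
For context on the new launch pattern: per the commit "use a context manager to squash occupancy warnings for numba kernels", kernel launches are wrapped in CUDFNumbaConfig. Below is a minimal sketch of what such a context manager could look like, assuming a save/restore body that this diff does not show:

    from numba import config as numba_config

    class CUDFNumbaConfig:
        # Scope the Numba config change to cuDF's kernel launches so a
        # user's own Numba settings are restored once the block exits.
        def __enter__(self):
            self._saved = numba_config.CUDA_LOW_OCCUPANCY_WARNINGS
            numba_config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            numba_config.CUDA_LOW_OCCUPANCY_WARNINGS = self._saved

Each launch site in this diff then follows the same shape: with CUDFNumbaConfig(): kernel.forall(len(self))(*launch_args).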

4 changes: 3 additions & 1 deletion python/cudf/cudf/core/udf/groupby_utils.py
@@ -27,6 +27,7 @@
_supported_cols_from_frame,
_supported_dtypes_from_frame,
)
from cudf.utils._setup_numba import CUDFNumbaConfig
from cudf.utils.utils import _cudf_nvtx_annotate


@@ -196,7 +197,8 @@ def jit_groupby_apply(offsets, grouped_values, function, *args):
)

# Launch kernel
specialized[ngroups, tpb](*launch_args)
with CUDFNumbaConfig():
specialized[ngroups, tpb](*launch_args)

return output

156 changes: 11 additions & 145 deletions python/cudf/cudf/core/udf/utils.py
@@ -1,6 +1,5 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

import glob
import os
from typing import Any, Callable, Dict

@@ -13,7 +12,6 @@
from numba.core.datamodel import default_manager, models
from numba.core.errors import TypingError
from numba.core.extending import register_model
from numba.cuda.cudadrv.driver import Linker
from numba.np import numpy_support
from numba.types import CPointer, Poison, Record, Tuple, boolean, int64, void

@@ -33,6 +31,7 @@
udf_string,
)
from cudf.utils import cudautils
from cudf.utils._setup_numba import _get_ptx_file
from cudf.utils.dtypes import (
BOOL_TYPES,
DATETIME_TYPES,
@@ -63,58 +62,6 @@
precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32)
launch_arg_getters: Dict[Any, Any] = {}


def _get_best_ptx_file(archs, max_compute_capability):
"""
Determine of the available PTX files which one is
the most recent up to and including the device cc
"""
filtered_archs = [x for x in archs if x[0] <= max_compute_capability]
if filtered_archs:
return max(filtered_archs, key=lambda y: y[0])
else:
return None


def _get_ptx_file(path, prefix):
if "RAPIDS_NO_INITIALIZE" in os.environ:
# cc=60 ptx is always built
cc = int(os.environ.get("STRINGS_UDF_CC", "60"))
else:
dev = cuda.get_current_device()

# Load the highest compute capability file available that is less than
# the current device's.
cc = int("".join(str(x) for x in dev.compute_capability))
files = glob.glob(os.path.join(path, f"{prefix}*.ptx"))
if len(files) == 0:
raise RuntimeError(f"Missing PTX files for cc={cc}")
regular_sms = []

for f in files:
file_name = os.path.basename(f)
sm_number = file_name.rstrip(".ptx").lstrip(prefix)
if sm_number.endswith("a"):
processed_sm_number = int(sm_number.rstrip("a"))
if processed_sm_number == cc:
return f
else:
regular_sms.append((int(sm_number), f))

regular_result = None

if regular_sms:
regular_result = _get_best_ptx_file(regular_sms, cc)

if regular_result is None:
raise RuntimeError(
"This cuDF installation is missing the necessary PTX "
f"files that are <={cc}."
)
else:
return regular_result[1]


_PTX_FILE = _get_ptx_file(os.path.dirname(__file__), "shim_")


@@ -306,6 +253,16 @@ def _compile_or_get(
we then obtain the return type from that separate compilation and
use it to allocate an output column of the right dtype.
"""
# runtime check for CEC mode which is disabled for CUDA 12 for now
if cuda.cudadrv.driver.get_version() == (
12,
0,
) and cuda.cudadrv.runtime.get_version() > (12, 0):
raise ValueError(
"Minor version compatibility not yet supported for "
"CUDA driver versions newer than 12.0"
)

if not all(is_scalar(arg) for arg in args):
raise TypeError("only scalar valued args are supported by apply")

@@ -392,97 +349,6 @@ def _get_extensionty_size(ty):
return llty.get_abi_size(target_data)


def _get_cuda_version_from_ptx_file(path):
"""
https://docs.nvidia.com/cuda/parallel-thread-execution/
Each PTX module must begin with a .version
directive specifying the PTX language version

example header:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-31057947
// Cuda compilation tools, release 11.6, V11.6.124
// Based on NVVM 7.0.1
//

.version 7.6
.target sm_52
.address_size 64

"""
with open(path) as ptx_file:
for line in ptx_file:
if line.startswith(".version"):
ver_line = line
break
else:
raise ValueError("Could not read CUDA version from ptx file.")
version = ver_line.strip("\n").split(" ")[1]
# from ptx_docs/release_notes above:
ver_map = {
"7.5": (11, 5),
"7.6": (11, 6),
"7.7": (11, 7),
"7.8": (11, 8),
"8.0": (12, 0),
}

cuda_ver = ver_map.get(version)
if cuda_ver is None:
raise ValueError(
f"Could not map PTX version {version} to a CUDA version"
)

return cuda_ver


def _setup_numba_linker(path):
from ptxcompiler.patch import NO_DRIVER, safe_get_versions

from cudf.core.udf.utils import (
_get_cuda_version_from_ptx_file,
maybe_patch_numba_linker,
)

versions = safe_get_versions()
if versions != NO_DRIVER:
driver_version, runtime_version = versions
ptx_toolkit_version = _get_cuda_version_from_ptx_file(path)
maybe_patch_numba_linker(
driver_version, runtime_version, ptx_toolkit_version
)


def maybe_patch_numba_linker(
driver_version, runtime_version, ptx_toolkit_version
):
from cubinlinker.patch import (
_numba_version_ok,
get_logger,
new_patched_linker,
)

# Numba thinks cubinlinker is only needed if the driver is older than
# the ctk, but when PTX files are present, it might also need to patch
# because those PTX files may newer than the driver as well
logger = get_logger()

if (driver_version < ptx_toolkit_version) or (
driver_version < runtime_version
):
logger.debug(
"Driver version %s.%s needs patching due to PTX files"
% driver_version
)
if _numba_version_ok:
logger.debug("Patching Numba Linker")
Linker.new = new_patched_linker
else:
logger.debug("Cannot patch Numba Linker - unsupported version")


@initfunc
def set_malloc_heap_size(size=None):
"""
5 changes: 3 additions & 2 deletions python/cudf/cudf/tests/test_dataframe_copy.py
@@ -8,6 +8,7 @@

from cudf.core.dataframe import DataFrame
from cudf.testing._utils import ALL_TYPES, assert_eq
from cudf.utils._setup_numba import CUDFNumbaConfig

"""
DataFrame copy expectations
@@ -159,8 +160,8 @@ def test_kernel_deep_copy():
gdf = DataFrame.from_pandas(pdf)
cdf = gdf.copy(deep=True)
sr = gdf["b"]

add_one[1, len(sr)](sr._column.data_array_view(mode="write"))
with CUDFNumbaConfig():
add_one[1, len(sr)](sr._column.data_array_view(mode="write"))
assert not gdf.to_string().split() == cdf.to_string().split()


15 changes: 10 additions & 5 deletions python/cudf/cudf/tests/test_extension_compilation.py
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
import operator

import cupy as cp
@@ -12,6 +12,7 @@
from cudf.core.udf.api import Masked
from cudf.core.udf.masked_typing import MaskedType
from cudf.testing._utils import parametrize_numeric_dtypes_pairwise
from cudf.utils._setup_numba import CUDFNumbaConfig

arith_ops = (
operator.add,
@@ -106,7 +107,8 @@ def test_kernel(x, y, err):
err[0] = 3

err = cp.asarray([0], dtype="int8")
test_kernel[1, 1](1, 2, err)
with CUDFNumbaConfig():
test_kernel[1, 1](1, 2, err)
assert err[0] == 0


@@ -214,7 +216,8 @@ def test_kernel(err):
err[0] = 2

err = cp.asarray([0], dtype="int8")
test_kernel[1, 1](err)
with CUDFNumbaConfig():
test_kernel[1, 1](err)
assert err[0] == 0


@@ -304,7 +307,8 @@ def test_kernel(err):
err[0] = 2

err = cp.asarray([0], dtype="int8")
test_kernel[1, 1](err)
with CUDFNumbaConfig():
test_kernel[1, 1](err)
assert err[0] == 0


@@ -326,5 +330,6 @@ def test_kernel(err):
err[0] = 1

err = cp.asarray([0], dtype="int8")
test_kernel[1, 1](err)
with CUDFNumbaConfig():
test_kernel[1, 1](err)
assert err[0] == 0
8 changes: 5 additions & 3 deletions python/cudf/cudf/tests/test_string_udfs.py
@@ -22,6 +22,7 @@
)
from cudf.core.udf.utils import _PTX_FILE, _get_extensionty_size
from cudf.testing._utils import assert_eq, sv_to_udf_str
from cudf.utils._setup_numba import CUDFNumbaConfig


def get_kernels(func, dtype, size):
@@ -85,16 +86,17 @@ def run_udf_test(data, func, dtype):
sv_kernel, udf_str_kernel = get_kernels(func, dtype, len(data))

expect = pd.Series(data).apply(func)

sv_kernel.forall(len(data))(str_views, output)
with CUDFNumbaConfig():
sv_kernel.forall(len(data))(str_views, output)
if dtype == "str":
result = column_from_udf_string_array(output)
else:
result = output

got = cudf.Series(result, dtype=dtype)
assert_eq(expect, got, check_dtype=False)
udf_str_kernel.forall(len(data))(str_views, output)
with CUDFNumbaConfig():
udf_str_kernel.forall(len(data))(str_views, output)
if dtype == "str":
result = column_from_udf_string_array(output)
else: