Skip to content

Commit

Permalink
cuDF numba cuda 12 updates (#13337)
Browse files Browse the repository at this point in the history
Summary of changes:

- Removed some old code that is only used for `numba<0.54` which hasn't been supported for a while now.
- Removed some old code that is only used when `cubinlinker` is not present, which has been a hard requirement for a while now as well.
- Created a file `_numba.py` and moved into this file all of the machinery used to configure numba upon cuDF import. This includes functions for determining which toolkit version was used to build the PTX file our UDFs rely on as well as the functions for potentially putting numba into MVC mode if necessary.
- Created a file `_ptxcompiler.py` which vendors the driver/runtime version checking machinery from ptxcompiler in case we're in a cuda 12 environment that doesn't have it
- Changed the code to issue a warning in cuda 12+ MVC situations that the library will likely not work
- The version of the toolkit used to determine if MVC is required is now determined from the `cc=60` PTX file which is [always built](https://github.com/rapidsai/cudf/blob/branch-23.06/python/cudf/udf_cpp/CMakeLists.txt#L85-L87). This is to avoid needing to query the device compute capability through numba's `cuda` module. This needs to be avoided during numba's setup because if `numba.cuda` is imported before numba's config is modified, the config options will have no effect. 

Closes #13351
Closes #13339

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Bradley Dice (https://github.com/bdice)
  - Graham Markall (https://github.com/gmarkall)
  - Ashwin Srinath (https://github.com/shwina)

URL: #13337
  • Loading branch information
brandon-b-miller authored May 23, 2023
1 parent 4fdb60d commit 12acf92
Show file tree
Hide file tree
Showing 16 changed files with 343 additions and 206 deletions.
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ dependencies:
- nbsphinx
- ninja
- notebook
- numba>=0.56.4,<0.57
- numba>=0.57
- numpy>=1.21,<1.24
- numpydoc
- nvcc_linux-64=11.8
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ requirements:
- typing_extensions
- pandas >=1.3,<1.6.0dev0
- cupy >=12.0.0
- numba >=0.56.4,<0.57
- numba >=0.57
- numpy >=1.21,<1.24 # Temporarily upper bound numpy to avoid overflow deprecations
- {{ pin_compatible('pyarrow', max_pin='x.x.x') }}
- libcudf {{ version }}
Expand Down
2 changes: 1 addition & 1 deletion dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ dependencies:
packages:
- cachetools
- cuda-python>=11.7.1,<12.0
- &numba numba>=0.56.4,<0.57
- &numba numba>=0.57
- nvtx>=0.2.1
- packaging
- rmm==23.6.*
Expand Down
28 changes: 5 additions & 23 deletions python/cudf/cudf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.

# _setup_numba must be called before numba.cuda is imported, because
# it sets the numba config variable responsible for enabling
# Minor Version Compatibility. Setting it after importing numba.cuda has no effect.
from cudf.utils._numba import _setup_numba
from cudf.utils.gpu_utils import validate_setup

_setup_numba()
validate_setup()

import cupy
Expand Down Expand Up @@ -83,32 +88,9 @@
from cudf.utils.dtypes import _NA_REP
from cudf.utils.utils import clear_cache, set_allocator

try:
from cubinlinker.patch import patch_numba_linker_if_needed
except ImportError:
pass
else:
# Patch Numba to support CUDA enhanced compatibility.
# cuDF requires a stronger set of conditions than what is
# checked by patch_numba_linker_if_needed due to the PTX
# files needed for JIT Groupby Apply and string UDFs
from cudf.core.udf.utils import _PTX_FILE, _setup_numba_linker

_setup_numba_linker(_PTX_FILE)

del patch_numba_linker_if_needed

cuda.set_memory_manager(RMMNumbaManager)
cupy.cuda.set_allocator(rmm_cupy_allocator)

try:
# Numba 0.54: Disable low occupancy warnings
numba_config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
except AttributeError:
# Numba < 0.54: No occupancy warnings
pass
del numba_config


rmm.register_reinitialize_hook(clear_cache)

Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
_return_arr_from_dtype,
)
from cudf.utils import docutils
from cudf.utils._numba import _CUDFNumbaConfig
from cudf.utils.utils import _cudf_nvtx_annotate

doc_reset_index_template = """
Expand Down Expand Up @@ -2193,7 +2194,8 @@ def _apply(self, func, kernel_getter, *args, **kwargs):
input_args = _get_input_args_from_frame(self)
launch_args = output_args + input_args + list(args)
try:
kernel.forall(len(self))(*launch_args)
with _CUDFNumbaConfig():
kernel.forall(len(self))(*launch_args)
except Exception as e:
raise RuntimeError("UDF kernel execution failed.") from e

Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/udf/groupby_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
_supported_cols_from_frame,
_supported_dtypes_from_frame,
)
from cudf.utils._numba import _CUDFNumbaConfig
from cudf.utils.utils import _cudf_nvtx_annotate


Expand Down Expand Up @@ -196,7 +197,8 @@ def jit_groupby_apply(offsets, grouped_values, function, *args):
)

# Launch kernel
specialized[ngroups, tpb](*launch_args)
with _CUDFNumbaConfig():
specialized[ngroups, tpb](*launch_args)

return output

Expand Down
146 changes: 1 addition & 145 deletions python/cudf/cudf/core/udf/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

import glob
import os
from typing import Any, Callable, Dict

Expand All @@ -13,7 +12,6 @@
from numba.core.datamodel import default_manager, models
from numba.core.errors import TypingError
from numba.core.extending import register_model
from numba.cuda.cudadrv.driver import Linker
from numba.np import numpy_support
from numba.types import CPointer, Poison, Record, Tuple, boolean, int64, void

Expand All @@ -33,6 +31,7 @@
udf_string,
)
from cudf.utils import cudautils
from cudf.utils._numba import _get_ptx_file
from cudf.utils.dtypes import (
BOOL_TYPES,
DATETIME_TYPES,
Expand Down Expand Up @@ -63,58 +62,6 @@
precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32)
launch_arg_getters: Dict[Any, Any] = {}


def _get_best_ptx_file(archs, max_compute_capability):
"""
Determine of the available PTX files which one is
the most recent up to and including the device cc
"""
filtered_archs = [x for x in archs if x[0] <= max_compute_capability]
if filtered_archs:
return max(filtered_archs, key=lambda y: y[0])
else:
return None


def _get_ptx_file(path, prefix):
if "RAPIDS_NO_INITIALIZE" in os.environ:
# cc=60 ptx is always built
cc = int(os.environ.get("STRINGS_UDF_CC", "60"))
else:
dev = cuda.get_current_device()

# Load the highest compute capability file available that is less than
# the current device's.
cc = int("".join(str(x) for x in dev.compute_capability))
files = glob.glob(os.path.join(path, f"{prefix}*.ptx"))
if len(files) == 0:
raise RuntimeError(f"Missing PTX files for cc={cc}")
regular_sms = []

for f in files:
file_name = os.path.basename(f)
sm_number = file_name.rstrip(".ptx").lstrip(prefix)
if sm_number.endswith("a"):
processed_sm_number = int(sm_number.rstrip("a"))
if processed_sm_number == cc:
return f
else:
regular_sms.append((int(sm_number), f))

regular_result = None

if regular_sms:
regular_result = _get_best_ptx_file(regular_sms, cc)

if regular_result is None:
raise RuntimeError(
"This cuDF installation is missing the necessary PTX "
f"files that are <={cc}."
)
else:
return regular_result[1]


_PTX_FILE = _get_ptx_file(os.path.dirname(__file__), "shim_")


Expand Down Expand Up @@ -392,97 +339,6 @@ def _get_extensionty_size(ty):
return llty.get_abi_size(target_data)


def _get_cuda_version_from_ptx_file(path):
"""
https://docs.nvidia.com/cuda/parallel-thread-execution/
Each PTX module must begin with a .version
directive specifying the PTX language version
example header:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-31057947
// Cuda compilation tools, release 11.6, V11.6.124
// Based on NVVM 7.0.1
//
.version 7.6
.target sm_52
.address_size 64
"""
with open(path) as ptx_file:
for line in ptx_file:
if line.startswith(".version"):
ver_line = line
break
else:
raise ValueError("Could not read CUDA version from ptx file.")
version = ver_line.strip("\n").split(" ")[1]
# from ptx_docs/release_notes above:
ver_map = {
"7.5": (11, 5),
"7.6": (11, 6),
"7.7": (11, 7),
"7.8": (11, 8),
"8.0": (12, 0),
}

cuda_ver = ver_map.get(version)
if cuda_ver is None:
raise ValueError(
f"Could not map PTX version {version} to a CUDA version"
)

return cuda_ver


def _setup_numba_linker(path):
    """
    Decide whether the Numba linker needs patching for the PTX at ``path``.

    Queries ``safe_get_versions()`` from ptxcompiler. When it reports
    ``NO_DRIVER`` nothing is done. Otherwise the driver and runtime
    versions, together with the toolkit version read from the PTX file,
    are handed to ``maybe_patch_numba_linker``.
    """
    from ptxcompiler.patch import NO_DRIVER, safe_get_versions

    from cudf.core.udf.utils import (
        _get_cuda_version_from_ptx_file,
        maybe_patch_numba_linker,
    )

    detected = safe_get_versions()
    if detected == NO_DRIVER:
        return

    driver_version, runtime_version = detected
    maybe_patch_numba_linker(
        driver_version,
        runtime_version,
        _get_cuda_version_from_ptx_file(path),
    )


def maybe_patch_numba_linker(
    driver_version, runtime_version, ptx_toolkit_version
):
    """
    Replace ``Linker.new`` with cubinlinker's patched linker if required.

    Patching is attempted when the driver version is older than either
    the runtime version or the toolkit version of the PTX files. It is
    applied only if cubinlinker's ``_numba_version_ok`` flag is truthy;
    each step is reported through cubinlinker's logger at debug level.
    """
    from cubinlinker.patch import (
        _numba_version_ok,
        get_logger,
        new_patched_linker,
    )

    logger = get_logger()

    # Numba only considers cubinlinker necessary when the driver is older
    # than the CTK. With PTX files present, patching may also be needed
    # because those PTX files may be newer than the driver as well.
    driver_is_older = (
        driver_version < ptx_toolkit_version
        or driver_version < runtime_version
    )
    if not driver_is_older:
        return

    logger.debug(
        "Driver version %s.%s needs patching due to PTX files"
        % driver_version
    )
    if not _numba_version_ok:
        logger.debug("Cannot patch Numba Linker - unsupported version")
        return

    logger.debug("Patching Numba Linker")
    Linker.new = new_patched_linker


@initfunc
def set_malloc_heap_size(size=None):
"""
Expand Down
15 changes: 10 additions & 5 deletions python/cudf/cudf/tests/test_extension_compilation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
import operator

import cupy as cp
Expand All @@ -12,6 +12,7 @@
from cudf.core.udf.api import Masked
from cudf.core.udf.masked_typing import MaskedType
from cudf.testing._utils import parametrize_numeric_dtypes_pairwise
from cudf.utils._numba import _CUDFNumbaConfig

arith_ops = (
operator.add,
Expand Down Expand Up @@ -106,7 +107,8 @@ def test_kernel(x, y, err):
err[0] = 3

err = cp.asarray([0], dtype="int8")
test_kernel[1, 1](1, 2, err)
with _CUDFNumbaConfig():
test_kernel[1, 1](1, 2, err)
assert err[0] == 0


Expand Down Expand Up @@ -214,7 +216,8 @@ def test_kernel(err):
err[0] = 2

err = cp.asarray([0], dtype="int8")
test_kernel[1, 1](err)
with _CUDFNumbaConfig():
test_kernel[1, 1](err)
assert err[0] == 0


Expand Down Expand Up @@ -304,7 +307,8 @@ def test_kernel(err):
err[0] = 2

err = cp.asarray([0], dtype="int8")
test_kernel[1, 1](err)
with _CUDFNumbaConfig():
test_kernel[1, 1](err)
assert err[0] == 0


Expand All @@ -326,5 +330,6 @@ def test_kernel(err):
err[0] = 1

err = cp.asarray([0], dtype="int8")
test_kernel[1, 1](err)
with _CUDFNumbaConfig():
test_kernel[1, 1](err)
assert err[0] == 0
8 changes: 5 additions & 3 deletions python/cudf/cudf/tests/test_string_udfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
)
from cudf.core.udf.utils import _PTX_FILE, _get_extensionty_size
from cudf.testing._utils import assert_eq, sv_to_udf_str
from cudf.utils._numba import _CUDFNumbaConfig


def get_kernels(func, dtype, size):
Expand Down Expand Up @@ -85,16 +86,17 @@ def run_udf_test(data, func, dtype):
sv_kernel, udf_str_kernel = get_kernels(func, dtype, len(data))

expect = pd.Series(data).apply(func)

sv_kernel.forall(len(data))(str_views, output)
with _CUDFNumbaConfig():
sv_kernel.forall(len(data))(str_views, output)
if dtype == "str":
result = column_from_udf_string_array(output)
else:
result = output

got = cudf.Series(result, dtype=dtype)
assert_eq(expect, got, check_dtype=False)
udf_str_kernel.forall(len(data))(str_views, output)
with _CUDFNumbaConfig():
udf_str_kernel.forall(len(data))(str_views, output)
if dtype == "str":
result = column_from_udf_string_array(output)
else:
Expand Down
Loading

0 comments on commit 12acf92

Please sign in to comment.