Use CubinLinker for CUDA Minor Version Compatibility (#11701)

## Description

This switches from PTXCompiler to CubinLinker (which uses PTXCompiler internally) for CUDA Minor Version Compatibility (MVC). With MVC, this enables support for all Numba features except linking archives, in support of use cases such as String UDFs (#11319).

## Checklist

- [X] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md).
- [X] New or existing tests cover these changes.
- [X] The documentation is up to date with these changes.

Authors:
- Graham Markall (https://github.com/gmarkall)
- https://github.com/brandon-b-miller
- Ashwin Srinath (https://github.com/shwina)

Approvers:
- Ray Douglass (https://github.com/raydouglass)
rapidsai · Sep 29, 2022 · 2041caa · 2041caa
1 parent 5fad289
commit 2041caa
Show file tree

Hide file tree

Showing 9 changed files with 62 additions and 126 deletions.
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
@@ -272,27 +272,15 @@ py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml
 gpuci_logger "Installing strings_udf"
 gpuci_mamba_retry install strings_udf -c "${CONDA_BLD_DIR}" -c "${CONDA_ARTIFACT_PATH}"
 
+# only install strings_udf after cuDF is finished testing without its presence
 cd "$WORKSPACE/python/strings_udf/strings_udf"
 gpuci_logger "Python py.test for strings_udf"
 
-# We do not want to exit with a nonzero exit code in the case where no
-# strings_udf tests are run because that will always happen when the local CUDA
-# version is not 11.5. We need to suppress the exit code because this script is
-# run with set -e and we're already setting a trap that we don't want to
-# override here.
-
-STRINGS_UDF_PYTEST_RETCODE=0
-py.test -n 8 --cache-clear --basetemp="$WORKSPACE/strings-udf-cuda-tmp" --junitxml="$WORKSPACE/junit-strings-udf.xml" -v --cov-config=.coveragerc --cov=strings_udf --cov-report=xml:"$WORKSPACE/python/strings_udf/strings-udf-coverage.xml" --cov-report term tests || STRINGS_UDF_PYTEST_RETCODE=$?
-
-if [ ${STRINGS_UDF_PYTEST_RETCODE} -eq 5 ]; then
-    echo "No strings UDF tests were run, but this script will continue to execute."
-elif [ ${STRINGS_UDF_PYTEST_RETCODE} -ne 0 ]; then
-    exit ${STRINGS_UDF_PYTEST_RETCODE}
-else
-    cd "$WORKSPACE/python/cudf/cudf"
-    gpuci_logger "Python py.test retest cuDF UDFs"
-    py.test tests/test_udf_masked_ops.py -n 8 --cache-clear
-fi
+# retest cudf with strings_udf present
+cd $WORKSPACE/python/cudf/cudf
+py.test -n 8 --cache-clear --basetemp="$WORKSPACE/strings-udf-cuda-tmp" --junitxml="$WORKSPACE/junit-strings-udf.xml" -v --cov-config=.coveragerc --cov=strings_udf --cov-report=xml:"$WORKSPACE/python/strings_udf/strings-udf-coverage.xml" --cov-report term tests
+gpuci_logger "Python py.test retest cuDF UDFs"
+py.test tests/test_udf_masked_ops.py -n 8 --cache-clear
 
 # Run benchmarks with both cudf and pandas to ensure compatibility is maintained.
 # Benchmarks are run in DEBUG_ONLY mode, meaning that only small data sizes are used.

diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml
@@ -18,7 +18,7 @@ dependencies:
   - cmake_setuptools>=0.1.3
   - scikit-build>=0.13.1
   - python>=3.8,<3.10
-  - numba>=0.54
+  - numba>=0.56.2
   - numpy
   - pandas>=1.0,<1.6.0dev0
   - pyarrow=9
@@ -82,7 +82,7 @@ dependencies:
   - pip:
       - git+https://github.com/python-streamz/streamz.git@master
       - pyorc
-  - ptxcompiler  # [linux64]
+  - cubinlinker  # [linux64]
   - gcc_linux-64=9.* # [linux64]
   - sysroot_linux-64==2.17 # [linux64]
   - nvcc_linux-64=11.5

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
@@ -38,7 +38,7 @@ requirements:
     - cython >=0.29,<0.30
     - scikit-build>=0.13.1
     - setuptools
-    - numba >=0.54
+    - numba >=0.56.2
     - dlpack>=0.5,<0.6.0a0
     - pyarrow =9
     - libcudf ={{ version }}
@@ -50,7 +50,7 @@ requirements:
     - typing_extensions
     - pandas >=1.0,<1.6.0dev0
     - cupy >=9.5.0,<12.0.0a0
-    - numba >=0.54
+    - numba >=0.56.2
     - numpy
     - {{ pin_compatible('pyarrow', max_pin='x.x.x') }}
     - libcudf {{ version }}
@@ -61,7 +61,7 @@ requirements:
     - nvtx >=0.2.1
     - packaging
     - cachetools
-    - ptxcompiler  # [linux64]  # CUDA enhanced compatibility. See https://github.com/rapidsai/ptxcompiler
+    - cubinlinker  # [linux64]  # CUDA enhanced compatibility.
     - cuda-python >=11.5,<11.7.1
 test:                                   # [linux64]
   requires:                             # [linux64]

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
@@ -83,15 +83,13 @@
 from cudf.utils.utils import clear_cache, set_allocator
 
 try:
-    from ptxcompiler.patch import patch_numba_codegen_if_needed
+    from cubinlinker.patch import patch_numba_linker_if_needed
 except ImportError:
     pass
 else:
     # Patch Numba to support CUDA enhanced compatibility.
-    # See https://github.com/rapidsai/ptxcompiler for
-    # details.
-    patch_numba_codegen_if_needed()
-    del patch_numba_codegen_if_needed
+    patch_numba_linker_if_needed()
+    del patch_numba_linker_if_needed
 
 cuda.set_memory_manager(rmm.RMMNumbaManager)
 cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)

diff --git a/python/cudf/cudf/core/udf/__init__.py b/python/cudf/cudf/core/udf/__init__.py
@@ -1,10 +1,6 @@
 # Copyright (c) 2022, NVIDIA CORPORATION.
-import numpy as np
-from numba import cuda, types
-from numba.cuda.cudaimpl import (
-    lower as cuda_lower,
-    registry as cuda_lowering_registry,
-)
+from numba import types
+from numba.cuda.cudaimpl import lower as cuda_lower
 
 from cudf.core.dtypes import dtype
 from cudf.core.udf import api, row_function, utils
@@ -28,36 +24,31 @@
 _STRING_UDFS_ENABLED = False
 try:
     import strings_udf
-
-    if strings_udf.ENABLED:
-        from . import strings_typing  # isort: skip
-        from . import strings_lowering  # isort: skip
-        from strings_udf import ptxpath
-        from strings_udf._lib.cudf_jit_udf import to_string_view_array
-        from strings_udf._typing import str_view_arg_handler, string_view
-
-        # add an overload of MaskedType.__init__(string_view, bool)
-        cuda_lower(api.Masked, strings_typing.string_view, types.boolean)(
-            masked_lowering.masked_constructor
-        )
-
-        # add an overload of pack_return(string_view)
-        cuda_lower(api.pack_return, strings_typing.string_view)(
-            masked_lowering.pack_return_scalar_impl
-        )
-
-        _supported_masked_types |= {strings_typing.string_view}
-        utils.launch_arg_getters[dtype("O")] = to_string_view_array
-        utils.masked_array_types[dtype("O")] = string_view
-        utils.JIT_SUPPORTED_TYPES |= STRING_TYPES
-        utils.ptx_files.append(ptxpath)
-        utils.arg_handlers.append(str_view_arg_handler)
-        row_function.itemsizes[dtype("O")] = string_view.size_bytes
-
-        _STRING_UDFS_ENABLED = True
-    else:
-        del strings_udf
-
+    from strings_udf import ptxpath
+    from strings_udf._lib.cudf_jit_udf import to_string_view_array
+    from strings_udf._typing import str_view_arg_handler, string_view
+
+    from . import strings_typing  # isort: skip
+    from . import strings_lowering  # isort: skip
+
+    cuda_lower(api.Masked, strings_typing.string_view, types.boolean)(
+        masked_lowering.masked_constructor
+    )
+
+    # add an overload of pack_return(string_view)
+    cuda_lower(api.pack_return, strings_typing.string_view)(
+        masked_lowering.pack_return_scalar_impl
+    )
+
+    _supported_masked_types |= {strings_typing.string_view}
+    utils.launch_arg_getters[dtype("O")] = to_string_view_array
+    utils.masked_array_types[dtype("O")] = string_view
+    utils.JIT_SUPPORTED_TYPES |= STRING_TYPES
+    utils.ptx_files.append(ptxpath)
+    utils.arg_handlers.append(str_view_arg_handler)
+    row_function.itemsizes[dtype("O")] = string_view.size_bytes
+
+    _STRING_UDFS_ENABLED = True
 except ImportError as e:
     # allow cuDF to work without strings_udf
     pass

diff --git a/python/cudf/setup.py b/python/cudf/setup.py
@@ -16,7 +16,7 @@
     "cachetools",
     "cuda-python>=11.5,<11.7.1",
     "fsspec>=0.6.0",
-    "numba>=0.53.1",
+    "numba>=0.56.2",
     "numpy",
     "nvtx>=0.2.1",
     "packaging",

diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py
@@ -21,7 +21,7 @@
         "numpy",
         "pandas>=1.0,<1.6.0dev0",
         "pytest",
-        "numba>=0.53.1",
+        "numba>=0.56.2",
         "dask>=2021.09.1",
         "distributed>=2021.09.1",
     ]

diff --git a/python/strings_udf/strings_udf/__init__.py b/python/strings_udf/strings_udf/__init__.py
@@ -10,66 +10,28 @@
 
 from . import _version
 
-ENABLED = False
-
-
-def compiler_from_ptx_file(path):
-    """Parse a PTX file header and extract the CUDA version used to compile it.
-
-    Here is an example PTX header that this function should parse:
-
-    // Generated by NVIDIA NVVM Compiler
-    //
-    // Compiler Build ID: CL-30672275
-    // Cuda compilation tools, release 11.5, V11.5.119
-    // Based on NVVM 7
-    """
-    file = open(path).read()
-    major, minor = (
-        re.search(r"Cuda compilation tools, release ([0-9\.]+)", file)
-        .group(1)
-        .split(".")
-    )
-    return int(major), int(minor)
-
+__version__ = _version.get_versions()["version"]
 
 # adapted from PTXCompiler
 cp = subprocess.run([sys.executable, "-c", CMD], capture_output=True)
+
+# must have a driver to proceed
 if cp.returncode == 0:
-    # must have a driver to proceed
-    versions = [int(s) for s in cp.stdout.strip().split()]
-    driver_version = tuple(versions[:2])
-    runtime_version = tuple(versions[2:])
 
-    # CUDA enhanced compatibility not yet enabled
-    if driver_version >= runtime_version:
-        # Load the highest compute capability file available that is less than
-        # the current device's.
-        files = glob.glob(
-            os.path.join(os.path.dirname(__file__), "shim_*.ptx")
-        )
-        dev = cuda.get_current_device()
-        cc = "".join(str(x) for x in dev.compute_capability)
-        files = glob.glob(
-            os.path.join(os.path.dirname(__file__), "shim_*.ptx")
+    # Load the highest compute capability file available that is less than
+    # the current device's.
+    files = glob.glob(os.path.join(os.path.dirname(__file__), "shim_*.ptx"))
+    dev = cuda.get_current_device()
+    cc = "".join(str(x) for x in dev.compute_capability)
+    files = glob.glob(os.path.join(os.path.dirname(__file__), "shim_*.ptx"))
+    if len(files) == 0:
+        raise RuntimeError(
+            "This strings_udf installation is missing the necessary PTX "
+            "files. Please file an issue reporting this error and how you "
+            "installed cudf and strings_udf."
         )
-        if len(files) == 0:
-            raise RuntimeError(
-                "This strings_udf installation is missing the necessary PTX "
-                "files. Please file an issue reporting this error and how you "
-                "installed cudf and strings_udf."
-            )
-        sms = [
-            os.path.basename(f).rstrip(".ptx").lstrip("shim_") for f in files
-        ]
-        selected_sm = max(sm for sm in sms if sm < cc)
-        ptxpath = os.path.join(
-            os.path.dirname(__file__), f"shim_{selected_sm}.ptx"
-        )
-
-        if driver_version >= compiler_from_ptx_file(ptxpath):
-            ENABLED = True
-        else:
-            del ptxpath
-
-__version__ = _version.get_versions()["version"]
+    sms = [os.path.basename(f).rstrip(".ptx").lstrip("shim_") for f in files]
+    selected_sm = max(sm for sm in sms if sm < cc)
+    ptxpath = os.path.join(
+        os.path.dirname(__file__), f"shim_{selected_sm}.ptx"
+    )
diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py
@@ -15,9 +15,6 @@
 from strings_udf._lib.cudf_jit_udf import to_string_view_array
 from strings_udf._typing import str_view_arg_handler, string_view
 
-if not strings_udf.ENABLED:
-    pytest.skip("Strings UDF not enabled.", allow_module_level=True)
-
 
 def get_kernel(func, dtype):
     """