Support arbitrary CUDA versions in UDF code (#15950)

This PR eliminates the manual mapping from PTX versions to CUDA versions, to help support CUDA 12.5 and newer without requiring a manual update to `_numba.py` for every CUDA release. This also updates the minimum compute capability PTX file from arch 60 to arch 70, since that is now the minimum required by RAPIDS.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Graham Markall (https://github.com/gmarkall)
  - https://github.com/brandon-b-miller

URL: #15950
rapidsai · Jun 10, 2024 · 9b2c35f · 9b2c35f
1 parent ae12634
commit 9b2c35f
Show file tree

Hide file tree

Showing 6 changed files with 30 additions and 73 deletions.
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd
@@ -18,6 +18,7 @@ cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \
 
 cdef extern from "cudf/strings/udf/udf_apis.hpp"  namespace \
         "cudf::strings::udf" nogil:
+    cdef int get_cuda_build_version() except +
     cdef unique_ptr[device_buffer] to_string_view_array(column_view) except +
     cdef unique_ptr[column] column_from_udf_string_array(
         udf_string* strings, size_type size,

diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx
@@ -22,11 +22,16 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view
 from cudf._lib.pylibcudf.libcudf.strings_udf cimport (
     column_from_udf_string_array as cpp_column_from_udf_string_array,
     free_udf_string_array as cpp_free_udf_string_array,
+    get_cuda_build_version as cpp_get_cuda_build_version,
     to_string_view_array as cpp_to_string_view_array,
     udf_string,
 )
 
 
+def get_cuda_build_version():
+    return cpp_get_cuda_build_version()
+
+
 def column_to_string_view_array(Column strings_col):
     cdef unique_ptr[device_buffer] c_buffer
     cdef column_view input_view = strings_col.view()

diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py
@@ -12,16 +12,14 @@
 # strings_udf. This is the easiest way to break an otherwise circular import
 # loop of _lib.*->cudautils->_numba->_lib.strings_udf
 @lru_cache
-def _get_cc_60_ptx_file():
+def _get_cuda_build_version():
     from cudf._lib import strings_udf
 
-    return os.path.join(
-        os.path.dirname(strings_udf.__file__),
-        "..",
-        "core",
-        "udf",
-        "shim_60.ptx",
-    )
+    # The version is an integer, parsed as 1000 * major + 10 * minor
+    cuda_build_version = strings_udf.get_cuda_build_version()
+    cuda_major_version = cuda_build_version // 1000
+    cuda_minor_version = (cuda_build_version % 1000) // 10
+    return (cuda_major_version, cuda_minor_version)
 
 
 def _get_best_ptx_file(archs, max_compute_capability):
@@ -38,8 +36,8 @@ def _get_best_ptx_file(archs, max_compute_capability):
 
 def _get_ptx_file(path, prefix):
     if "RAPIDS_NO_INITIALIZE" in os.environ:
-        # cc=60 ptx is always built
-        cc = int(os.environ.get("STRINGS_UDF_CC", "60"))
+        # cc=70 ptx is always built
+        cc = int(os.environ.get("STRINGS_UDF_CC", "70"))
     else:
         from numba import cuda
 
@@ -120,15 +118,13 @@ def _setup_numba():
     versions = safe_get_versions()
     if versions != NO_DRIVER:
         driver_version, runtime_version = versions
-        ptx_toolkit_version = _get_cuda_version_from_ptx_file(
-            _get_cc_60_ptx_file()
-        )
+        shim_ptx_cuda_version = _get_cuda_build_version()
 
         # MVC is required whenever any PTX is newer than the driver
-        # This could be the shipped PTX file or the PTX emitted by
-        # the version of NVVM on the user system, the latter aligning
-        # with the runtime version
-        if (driver_version < ptx_toolkit_version) or (
+        # This could be the shipped shim PTX file (determined by the CUDA
+        # version used at build time) or the PTX emitted by the version of NVVM
+        # on the user system (determined by the user's CUDA runtime version)
+        if (driver_version < shim_ptx_cuda_version) or (
             driver_version < runtime_version
         ):
             if driver_version < (12, 0):
@@ -139,60 +135,6 @@ def _setup_numba():
                 patch_numba_linker()
 
 
-def _get_cuda_version_from_ptx_file(path):
-    """
-    https://docs.nvidia.com/cuda/parallel-thread-execution/
-    Each PTX module must begin with a .version
-    directive specifying the PTX language version
-
-    example header:
-    //
-    // Generated by NVIDIA NVVM Compiler
-    //
-    // Compiler Build ID: CL-31057947
-    // Cuda compilation tools, release 11.6, V11.6.124
-    // Based on NVVM 7.0.1
-    //
-
-    .version 7.6
-    .target sm_52
-    .address_size 64
-
-    """
-    with open(path) as ptx_file:
-        for line in ptx_file:
-            if line.startswith(".version"):
-                ver_line = line
-                break
-        else:
-            raise ValueError("Could not read CUDA version from ptx file.")
-    version = ver_line.strip("\n").split(" ")[1]
-    # This dictionary maps from supported versions of NVVM to the
-    # PTX version it produces. The lowest value should be the minimum
-    # CUDA version required to compile the library. Currently CUDA 11.5
-    # or higher is required to build cudf. New CUDA versions should
-    # be added to this dictionary when officially supported.
-    ver_map = {
-        "7.5": (11, 5),
-        "7.6": (11, 6),
-        "7.7": (11, 7),
-        "7.8": (11, 8),
-        "8.0": (12, 0),
-        "8.1": (12, 1),
-        "8.2": (12, 2),
-        "8.3": (12, 3),
-        "8.4": (12, 4),
-    }
-
-    cuda_ver = ver_map.get(version)
-    if cuda_ver is None:
-        raise ValueError(
-            f"Could not map PTX version {version} to a CUDA version"
-        )
-
-    return cuda_ver
-
-
 class _CUDFNumbaConfig:
     def __enter__(self):
         self.CUDA_LOW_OCCUPANCY_WARNINGS = (

diff --git a/python/cudf/udf_cpp/CMakeLists.txt b/python/cudf/udf_cpp/CMakeLists.txt
@@ -60,7 +60,7 @@ set(SHIM_CUDA_FLAGS --expt-relaxed-constexpr -rdc=true)
 
 # always build a default PTX file in case RAPIDS_NO_INITIALIZE is set and the device cc can't be
 # safely queried through a context
-list(INSERT CMAKE_CUDA_ARCHITECTURES 0 "60")
+list(INSERT CMAKE_CUDA_ARCHITECTURES 0 "70")
 
 list(TRANSFORM CMAKE_CUDA_ARCHITECTURES REPLACE "-real" "")
 list(TRANSFORM CMAKE_CUDA_ARCHITECTURES REPLACE "-virtual" "")

diff --git a/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp b/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,13 @@ namespace cudf {
 namespace strings {
 namespace udf {
 
+/**
+ * @brief Get the CUDA version used at build time.
+ *
+ * @return The CUDA version as an integer, parsed as major * 1000 + minor * 10.
+ */
+int get_cuda_build_version();
+
 class udf_string;
 
 /**

diff --git a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
@@ -101,6 +101,8 @@ void free_udf_string_array(cudf::strings::udf::udf_string* d_strings,
 
 // external APIs
 
+int get_cuda_build_version() { return CUDA_VERSION; }
+
 std::unique_ptr<rmm::device_buffer> to_string_view_array(cudf::column_view const input)
 {
   return detail::to_string_view_array(input, cudf::get_default_stream());