From 8ee5f51f971c4994694f34bb52524540f807f7fc Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 8 Nov 2022 14:29:35 -0800 Subject: [PATCH] Enable building against the libarrow contained in pyarrow (#12034) This feature is a prerequisite for wheels. There is no real good reason to do this except to provide interop with a pyarrow wheel, so this option is marked as advanced. In the process of implementing this feature, I have also done some cleanup of `get_arrow.cmake` to try and simplify its logic. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Paul Taylor (https://github.com/trxcllnt) - Robert Maynard (https://github.com/robertmaynard) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/12034 --- cpp/CMakeLists.txt | 2 + cpp/cmake/thirdparty/get_arrow.cmake | 188 ++++++++++++++++++--------- python/cudf/CMakeLists.txt | 16 +++ 3 files changed, 141 insertions(+), 65 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 75de15bdf22..e13b1747a7e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -69,6 +69,8 @@ option(CUDA_ENABLE_LINEINFO option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON) # cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) +option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." 
OFF) +mark_as_advanced(USE_LIBARROW_FROM_PYARROW) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 9fa5b9d1658..94dcdcb5bc2 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -20,43 +20,98 @@ # cmake-lint: disable=R0912,R0913,R0915 +include_guard(GLOBAL) + +# Generate a FindArrow module for the case where we need to search for arrow within a pip install +# pyarrow. +function(find_libarrow_in_python_wheel PYARROW_VERSION) + string(REPLACE "." "" PYARROW_SO_VER "${PYARROW_VERSION}") + set(PYARROW_LIB libarrow.so.${PYARROW_SO_VER}) + + find_package(Python REQUIRED) + execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" + OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") + rapids_find_generate_module( + Arrow NO_CONFIG + VERSION "${PYARROW_VERSION}" + LIBRARY_NAMES "${PYARROW_LIB}" + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports + HEADER_NAMES arrow/python/arrow_to_pandas.h + ) + + find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) + add_library(arrow_shared ALIAS Arrow::Arrow) + + # When using the libarrow inside a wheel we must build libcudf with the old ABI because pyarrow's + # `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old ABI. Note + # that these flags will often be redundant because we build wheels in manylinux containers that + # actually have the old libc++ anyway, but setting them explicitly ensures correct and consistent + # behavior in all other cases such as aarch builds on newer manylinux or testing builds in newer + # containers. 
Note that tests will not build successfully without also propagating these options + # to builds of GTest. Similarly, benchmarks will not work without updating GBench (and possibly + # NVBench) builds. We are currently ignoring these limitations since we don't anticipate using + # this feature except for building wheels. + target_compile_options( + Arrow::Arrow INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-D_GLIBCXX_USE_CXX11_ABI=0>" + "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>" + ) + + rapids_export_package(BUILD Arrow cudf-exports) + rapids_export_package(INSTALL Arrow cudf-exports) + + list(POP_BACK CMAKE_PREFIX_PATH) +endfunction() + # This function finds arrow and sets any additional necessary environment variables. function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON ENABLE_PARQUET ) + if(USE_LIBARROW_FROM_PYARROW) + # Generate a FindArrow.cmake to find pyarrow's libarrow.so + find_libarrow_in_python_wheel(${VERSION}) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + set(ARROW_LIBRARIES + arrow_shared + PARENT_SCOPE + ) + return() + endif() + if(BUILD_STATIC) if(TARGET arrow_static) - list(APPEND ARROW_LIBRARIES arrow_static) set(ARROW_FOUND TRUE PARENT_SCOPE ) set(ARROW_LIBRARIES - ${ARROW_LIBRARIES} + arrow_static PARENT_SCOPE ) return() endif() else() if(TARGET arrow_shared) - list(APPEND ARROW_LIBRARIES arrow_shared) set(ARROW_FOUND TRUE PARENT_SCOPE ) set(ARROW_LIBRARIES - ${ARROW_LIBRARIES} + arrow_shared PARENT_SCOPE ) return() endif() endif() - set(ARROW_BUILD_SHARED ON) - set(ARROW_BUILD_STATIC OFF) - set(CPMAddOrFindPackage CPMFindPackage) - if(NOT ARROW_ARMV8_ARCH) set(ARROW_ARMV8_ARCH "armv8-a") endif() @@ -69,8 +124,11 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ARROW_BUILD_STATIC ON) set(ARROW_BUILD_SHARED OFF) # Turn off CPM using `find_package` so we always download and make sure we get proper static - # library - set(CPM_DOWNLOAD_ALL TRUE) + # library. 
+ set(CPM_DOWNLOAD_Arrow TRUE) + else() + set(ARROW_BUILD_SHARED ON) + set(ARROW_BUILD_STATIC OFF) endif() set(ARROW_PYTHON_OPTIONS "") @@ -91,7 +149,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_cpm_find( Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared + GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared arrow_static parquet_static + arrow_dataset_static CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} @@ -125,61 +184,65 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB "xsimd_SOURCE AUTO" ) - set(ARROW_FOUND TRUE) - set(ARROW_LIBRARIES "") + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) - # Arrow_ADDED: set if CPM downloaded Arrow from Github Arrow_DIR: set if CPM found Arrow on the - # system/conda/etc. - if(Arrow_ADDED OR Arrow_DIR) - if(BUILD_STATIC) - list(APPEND ARROW_LIBRARIES arrow_static) - else() - list(APPEND ARROW_LIBRARIES arrow_shared) - endif() + if(BUILD_STATIC) + set(ARROW_LIBRARIES arrow_static) + else() + set(ARROW_LIBRARIES arrow_shared) + endif() - if(Arrow_DIR) - find_package(Arrow REQUIRED QUIET) - if(ENABLE_PARQUET) - if(NOT Parquet_DIR) - # Set this to enable `find_package(Parquet)` - set(Parquet_DIR "${Arrow_DIR}") - endif() - # Set this to enable `find_package(ArrowDataset)` - set(ArrowDataset_DIR "${Arrow_DIR}") - find_package(ArrowDataset REQUIRED QUIET) + # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. + if(Arrow_DIR) + # This extra find_package is necessary because rapids_cpm_find does not propagate all the + # variables from find_package that we might need. This is especially problematic when + # rapids_cpm_find builds from source. + find_package(Arrow REQUIRED QUIET) + if(ENABLE_PARQUET) + # Setting Parquet_DIR is conditional because parquet may be installed independently of arrow. 
+ if(NOT Parquet_DIR) + # Set this to enable `find_package(Parquet)` + set(Parquet_DIR "${Arrow_DIR}") endif() - elseif(Arrow_ADDED) - # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to - # target_include_directories. That defeats ccache. - file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" + # Set this to enable `find_package(ArrowDataset)` + set(ArrowDataset_DIR "${Arrow_DIR}") + find_package(ArrowDataset REQUIRED QUIET) + endif() + # Arrow_ADDED: set if CPM downloaded Arrow from Github + elseif(Arrow_ADDED) + # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to + # target_include_directories. That defeats ccache. + file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" + ) + if(ENABLE_PARQUET) + file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" ) - if(ENABLE_PARQUET) - file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" - ) - endif() - # - # This shouldn't be necessary! - # - # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` - # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. - # - # This only works because we know exactly which components we're using. Don't forget to update - # this list if we add more! - # - foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) - target_include_directories( - ${ARROW_LIBRARY} - INTERFACE "$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src>" - "$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src/generated>" - "$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/hadoop/include>" - "$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/flatbuffers/include>" - ) - endforeach() endif() + # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` + # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. + # + # This only works because we know exactly which components we're using. Don't forget to update + # this list if we add more! 
+ # + foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) + target_include_directories( + ${ARROW_LIBRARY} + INTERFACE "$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src>" + "$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src/generated>" + "$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/hadoop/include>" + "$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/flatbuffers/include>" + ) + endforeach() else() - set(ARROW_FOUND FALSE) + set(ARROW_FOUND + FALSE + PARENT_SCOPE + ) message(FATAL_ERROR "CUDF: Arrow library not found or downloaded.") endif() @@ -294,15 +357,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) endif() - set(ARROW_FOUND - "${ARROW_FOUND}" - PARENT_SCOPE - ) set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" PARENT_SCOPE ) - endfunction() if(NOT DEFINED CUDF_VERSION_Arrow) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index f8eb3af86d7..8a3224237b6 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -31,9 +31,25 @@ project( option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files" OFF ) +option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF) +mark_as_advanced(USE_LIBARROW_FROM_PYARROW) # If the user requested it we attempt to find CUDF. if(FIND_CUDF_CPP) + if(USE_LIBARROW_FROM_PYARROW) + # We need to find arrow before libcudf since libcudf requires it but doesn't bundle it. TODO: + # These options should probably all become optional since in practice they aren't meaningful + # except in the case where we actually compile Arrow. + set(CUDF_USE_ARROW_STATIC OFF) + set(CUDF_ENABLE_ARROW_S3 OFF) + set(CUDF_ENABLE_ARROW_ORC OFF) + set(CUDF_ENABLE_ARROW_PYTHON OFF) + set(CUDF_ENABLE_ARROW_PARQUET OFF) + include(rapids-find) + include(rapids-export) + include(../../cpp/cmake/thirdparty/get_arrow.cmake) + endif() + find_package(cudf ${cudf_version} REQUIRED) else() set(cudf_FOUND OFF)