Skip to content

Commit

Permalink
Enable building against the libarrow contained in pyarrow (#12034)
Browse files Browse the repository at this point in the history
This feature is a prerequisite for wheels. There is no real good reason to do this except to provide interop with a pyarrow wheel, so this option is marked as advanced. In the process of implementing this feature, I have also done some cleanup of `get_arrow.cmake` to try and simplify its logic.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Paul Taylor (https://github.com/trxcllnt)
  - Robert Maynard (https://github.com/robertmaynard)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #12034
  • Loading branch information
vyasr authored Nov 8, 2022
1 parent c900fed commit 8ee5f51
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 65 deletions.
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ option(CUDA_ENABLE_LINEINFO
option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON)
# cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking
option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF)
mark_as_advanced(USE_LIBARROW_FROM_PYARROW)

message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}")
message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}")
Expand Down
188 changes: 123 additions & 65 deletions cpp/cmake/thirdparty/get_arrow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,43 +20,98 @@

# cmake-lint: disable=R0912,R0913,R0915

include_guard(GLOBAL)

# Generate a FindArrow module for the case where we need to search for arrow within a pip install
# pyarrow.
function(find_libarrow_in_python_wheel PYARROW_VERSION)
string(REPLACE "." "" PYARROW_SO_VER "${PYARROW_VERSION}")
set(PYARROW_LIB libarrow.so.${PYARROW_SO_VER})

find_package(Python REQUIRED)
execute_process(
COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])"
OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR
OUTPUT_STRIP_TRAILING_WHITESPACE
)
list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}")
rapids_find_generate_module(
Arrow NO_CONFIG
VERSION "${PYARROW_VERSION}"
LIBRARY_NAMES "${PYARROW_LIB}"
BUILD_EXPORT_SET cudf-exports
INSTALL_EXPORT_SET cudf-exports
HEADER_NAMES arrow/python/arrow_to_pandas.h
)

find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL)
add_library(arrow_shared ALIAS Arrow::Arrow)

# When using the libarrow inside a wheel we must build libcudf with the old ABI because pyarrow's
# `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old ABI. Note
# that these flags will often be redundant because we build wheels in manylinux containers that
# actually have the old libc++ anyway, but setting them explicitly ensures correct and consistent
# behavior in all other cases such as aarch builds on newer manylinux or testing builds in newer
# containers. Note that tests will not build successfully without also propagating these options
# to builds of GTest. Similarly, benchmarks will not work without updating GBench (and possibly
# NVBench) builds. We are currently ignoring these limitations since we don't anticipate using
# this feature except for building wheels.
target_compile_options(
Arrow::Arrow INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-D_GLIBCXX_USE_CXX11_ABI=0>"
"$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>"
)

rapids_export_package(BUILD Arrow cudf-exports)
rapids_export_package(INSTALL Arrow cudf-exports)

list(POP_BACK CMAKE_PREFIX_PATH)
endfunction()

# This function finds arrow and sets any additional necessary environment variables.
function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON
ENABLE_PARQUET
)

if(USE_LIBARROW_FROM_PYARROW)
# Generate a FindArrow.cmake to find pyarrow's libarrow.so
find_libarrow_in_python_wheel(${VERSION})
set(ARROW_FOUND
TRUE
PARENT_SCOPE
)
set(ARROW_LIBRARIES
arrow_shared
PARENT_SCOPE
)
return()
endif()

if(BUILD_STATIC)
if(TARGET arrow_static)
list(APPEND ARROW_LIBRARIES arrow_static)
set(ARROW_FOUND
TRUE
PARENT_SCOPE
)
set(ARROW_LIBRARIES
${ARROW_LIBRARIES}
arrow_static
PARENT_SCOPE
)
return()
endif()
else()
if(TARGET arrow_shared)
list(APPEND ARROW_LIBRARIES arrow_shared)
set(ARROW_FOUND
TRUE
PARENT_SCOPE
)
set(ARROW_LIBRARIES
${ARROW_LIBRARIES}
arrow_shared
PARENT_SCOPE
)
return()
endif()
endif()

set(ARROW_BUILD_SHARED ON)
set(ARROW_BUILD_STATIC OFF)
set(CPMAddOrFindPackage CPMFindPackage)

if(NOT ARROW_ARMV8_ARCH)
set(ARROW_ARMV8_ARCH "armv8-a")
endif()
Expand All @@ -69,8 +124,11 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
set(ARROW_BUILD_STATIC ON)
set(ARROW_BUILD_SHARED OFF)
# Turn off CPM using `find_package` so we always download and make sure we get proper static
# library
set(CPM_DOWNLOAD_ALL TRUE)
# library.
set(CPM_DOWNLOAD_Arrow TRUE)
else()
set(ARROW_BUILD_SHARED ON)
set(ARROW_BUILD_STATIC OFF)
endif()

set(ARROW_PYTHON_OPTIONS "")
Expand All @@ -91,7 +149,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB

rapids_cpm_find(
Arrow ${VERSION}
GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared
GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared arrow_static parquet_static
arrow_dataset_static
CPM_ARGS
GIT_REPOSITORY https://github.com/apache/arrow.git
GIT_TAG apache-arrow-${VERSION}
Expand Down Expand Up @@ -125,61 +184,65 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
"xsimd_SOURCE AUTO"
)

set(ARROW_FOUND TRUE)
set(ARROW_LIBRARIES "")
set(ARROW_FOUND
TRUE
PARENT_SCOPE
)

# Arrow_ADDED: set if CPM downloaded Arrow from Github Arrow_DIR: set if CPM found Arrow on the
# system/conda/etc.
if(Arrow_ADDED OR Arrow_DIR)
if(BUILD_STATIC)
list(APPEND ARROW_LIBRARIES arrow_static)
else()
list(APPEND ARROW_LIBRARIES arrow_shared)
endif()
if(BUILD_STATIC)
set(ARROW_LIBRARIES arrow_static)
else()
set(ARROW_LIBRARIES arrow_shared)
endif()

if(Arrow_DIR)
find_package(Arrow REQUIRED QUIET)
if(ENABLE_PARQUET)
if(NOT Parquet_DIR)
# Set this to enable `find_package(Parquet)`
set(Parquet_DIR "${Arrow_DIR}")
endif()
# Set this to enable `find_package(ArrowDataset)`
set(ArrowDataset_DIR "${Arrow_DIR}")
find_package(ArrowDataset REQUIRED QUIET)
# Arrow_DIR: set if CPM found Arrow on the system/conda/etc.
if(Arrow_DIR)
# This extra find_package is necessary because rapids_cpm_find does not propagate all the
# variables from find_package that we might need. This is especially problematic when
# rapids_cpm_find builds from source.
find_package(Arrow REQUIRED QUIET)
if(ENABLE_PARQUET)
# Setting Parquet_DIR is conditional because parquet may be installed independently of arrow.
if(NOT Parquet_DIR)
# Set this to enable `find_package(Parquet)`
set(Parquet_DIR "${Arrow_DIR}")
endif()
elseif(Arrow_ADDED)
# Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to
# target_include_directories. That defeats ccache.
file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util"
# Set this to enable `find_package(ArrowDataset)`
set(ArrowDataset_DIR "${Arrow_DIR}")
find_package(ArrowDataset REQUIRED QUIET)
endif()
# Arrow_ADDED: set if CPM downloaded Arrow from Github
elseif(Arrow_ADDED)
# Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to
# target_include_directories. That defeats ccache.
file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util"
)
if(ENABLE_PARQUET)
file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet"
)
if(ENABLE_PARQUET)
file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet"
)
endif()
#
# This shouldn't be necessary!
#
# Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared`
# targets in FindArrow, so for static source-builds, we have to do it after-the-fact.
#
# This only works because we know exactly which components we're using. Don't forget to update
# this list if we add more!
#
foreach(ARROW_LIBRARY ${ARROW_LIBRARIES})
target_include_directories(
${ARROW_LIBRARY}
INTERFACE "$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src/generated>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/hadoop/include>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/flatbuffers/include>"
)
endforeach()
endif()
# Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared`
# targets in FindArrow, so for static source-builds, we have to do it after-the-fact.
#
# This only works because we know exactly which components we're using. Don't forget to update
# this list if we add more!
#
foreach(ARROW_LIBRARY ${ARROW_LIBRARIES})
target_include_directories(
${ARROW_LIBRARY}
INTERFACE "$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src/generated>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/hadoop/include>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/flatbuffers/include>"
)
endforeach()
else()
set(ARROW_FOUND FALSE)
set(ARROW_FOUND
FALSE
PARENT_SCOPE
)
message(FATAL_ERROR "CUDF: Arrow library not found or downloaded.")
endif()

Expand Down Expand Up @@ -294,15 +357,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports)
endif()

set(ARROW_FOUND
"${ARROW_FOUND}"
PARENT_SCOPE
)
set(ARROW_LIBRARIES
"${ARROW_LIBRARIES}"
PARENT_SCOPE
)

endfunction()

if(NOT DEFINED CUDF_VERSION_Arrow)
Expand Down
16 changes: 16 additions & 0 deletions python/cudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,25 @@ project(
option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files"
OFF
)
option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF)
mark_as_advanced(USE_LIBARROW_FROM_PYARROW)

# If the user requested it we attempt to find CUDF.
if(FIND_CUDF_CPP)
if(USE_LIBARROW_FROM_PYARROW)
# We need to find arrow before libcudf since libcudf requires it but doesn't bundle it. TODO:
# These options should probably all become optional since in practice they aren't meaningful
# except in the case where we actually compile Arrow.
set(CUDF_USE_ARROW_STATIC OFF)
set(CUDF_ENABLE_ARROW_S3 OFF)
set(CUDF_ENABLE_ARROW_ORC OFF)
set(CUDF_ENABLE_ARROW_PYTHON OFF)
set(CUDF_ENABLE_ARROW_PARQUET OFF)
include(rapids-find)
include(rapids-export)
include(../../cpp/cmake/thirdparty/get_arrow.cmake)
endif()

find_package(cudf ${cudf_version} REQUIRED)
else()
set(cudf_FOUND OFF)
Expand Down

0 comments on commit 8ee5f51

Please sign in to comment.