Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable building against the libarrow contained in pyarrow #12034

Merged
merged 22 commits into from
Nov 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
59df97b
Minimal set of changes to have CMake find libarrow inside a pip wheel.
vyasr Oct 28, 2022
1efbb8c
Add TODOs indicating potential areas for improvement in the CMake code.
vyasr Oct 28, 2022
9faa72e
Minimal set of changes to get a working cuDF Python against the pyarr…
vyasr Oct 28, 2022
6a44b42
Some more notes.
vyasr Oct 28, 2022
770322d
Remove unused variable.
vyasr Oct 28, 2022
638258e
Find arrow libs using execute_process.
vyasr Oct 28, 2022
d5bb59f
Remove unnecessary nested function.
vyasr Oct 28, 2022
b9d4c7b
Make the usage of pyarrow's libarrow a CMake option and clean up some…
vyasr Oct 31, 2022
fe0aec4
Revert "Make the usage of pyarrow's libarrow a CMake option and clean…
vyasr Oct 31, 2022
7429fb7
Fix case.
vyasr Oct 31, 2022
8cf93a5
Clean up some comments.
vyasr Oct 31, 2022
cb817ba
Make building against pyarrow libarrow an option.
vyasr Nov 1, 2022
e01e458
Merge remote-tracking branch 'origin/branch-22.12' into feature/build…
vyasr Nov 1, 2022
79d6ce7
Remove unnecessary to_arrow change.
vyasr Nov 1, 2022
df53eae
More cleanup of get_arrow.cmake.
vyasr Nov 1, 2022
498d509
Remove one more unnecessary arrow workaround.
vyasr Nov 1, 2022
dcd2e14
Merge remote-tracking branch 'origin/branch-22.12' into feature/build…
vyasr Nov 1, 2022
49cfe3c
Apply cmake-format.
vyasr Nov 1, 2022
713dd2c
Address some reviews.
vyasr Nov 8, 2022
5e3f609
Next round of review.
vyasr Nov 8, 2022
2c9c7bb
Put back accidentally removed propagation of arrow vars and remove no…
vyasr Nov 8, 2022
cfd45ed
Address remaining TODOs.
vyasr Nov 8, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ option(CUDA_ENABLE_LINEINFO
)
# cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking
option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF)
mark_as_advanced(USE_LIBARROW_FROM_PYARROW)

message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}")
message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}")
Expand Down
188 changes: 123 additions & 65 deletions cpp/cmake/thirdparty/get_arrow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,43 +20,98 @@

# cmake-lint: disable=R0912,R0913,R0915

include_guard(GLOBAL)

# Generate a FindArrow module for the case where we need to search for arrow within a pip install
# pyarrow.
function(find_libarrow_in_python_wheel PYARROW_VERSION)
string(REPLACE "." "" PYARROW_SO_VER "${PYARROW_VERSION}")
set(PYARROW_LIB libarrow.so.${PYARROW_SO_VER})

find_package(Python REQUIRED)
execute_process(
COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])"
OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR
OUTPUT_STRIP_TRAILING_WHITESPACE
)
list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}")
rapids_find_generate_module(
Arrow NO_CONFIG
VERSION "${PYARROW_VERSION}"
LIBRARY_NAMES "${PYARROW_LIB}"
BUILD_EXPORT_SET cudf-exports
INSTALL_EXPORT_SET cudf-exports
HEADER_NAMES arrow/python/arrow_to_pandas.h
)

find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL)
add_library(arrow_shared ALIAS Arrow::Arrow)

# When using the libarrow inside a wheel we must build libcudf with the old ABI because pyarrow's
# `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old ABI. Note
# that these flags will often be redundant because we build wheels in manylinux containers that
# actually have the old libc++ anyway, but setting them explicitly ensures correct and consistent
# behavior in all other cases such as aarch builds on newer manylinux or testing builds in newer
# containers. Note that tests will not build successfully without also propagating these options
# to builds of GTest. Similarly, benchmarks will not work without updating GBench (and possibly
# NVBench) builds. We are currently ignoring these limitations since we don't anticipate using
# this feature except for building wheels.
target_compile_options(
Arrow::Arrow INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-D_GLIBCXX_USE_CXX11_ABI=0>"
"$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>"
)

rapids_export_package(BUILD Arrow cudf-exports)
rapids_export_package(INSTALL Arrow cudf-exports)

list(POP_BACK CMAKE_PREFIX_PATH)
endfunction()

# This function finds arrow and sets any additional necessary environment variables.
function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON
ENABLE_PARQUET
)

if(USE_LIBARROW_FROM_PYARROW)
# Generate a FindArrow.cmake to find pyarrow's libarrow.so
find_libarrow_in_python_wheel(${VERSION})
set(ARROW_FOUND
TRUE
PARENT_SCOPE
)
set(ARROW_LIBRARIES
arrow_shared
PARENT_SCOPE
)
return()
endif()

if(BUILD_STATIC)
if(TARGET arrow_static)
list(APPEND ARROW_LIBRARIES arrow_static)
set(ARROW_FOUND
TRUE
PARENT_SCOPE
)
set(ARROW_LIBRARIES
${ARROW_LIBRARIES}
arrow_static
PARENT_SCOPE
)
return()
endif()
else()
if(TARGET arrow_shared)
list(APPEND ARROW_LIBRARIES arrow_shared)
set(ARROW_FOUND
TRUE
PARENT_SCOPE
)
set(ARROW_LIBRARIES
${ARROW_LIBRARIES}
arrow_shared
PARENT_SCOPE
)
return()
endif()
endif()

set(ARROW_BUILD_SHARED ON)
set(ARROW_BUILD_STATIC OFF)
set(CPMAddOrFindPackage CPMFindPackage)

if(NOT ARROW_ARMV8_ARCH)
set(ARROW_ARMV8_ARCH "armv8-a")
endif()
Expand All @@ -69,8 +124,11 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
set(ARROW_BUILD_STATIC ON)
set(ARROW_BUILD_SHARED OFF)
# Turn off CPM using `find_package` so we always download and make sure we get proper static
# library
set(CPM_DOWNLOAD_ALL TRUE)
# library.
set(CPM_DOWNLOAD_Arrow TRUE)
else()
set(ARROW_BUILD_SHARED ON)
set(ARROW_BUILD_STATIC OFF)
endif()

set(ARROW_PYTHON_OPTIONS "")
Expand All @@ -91,7 +149,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB

rapids_cpm_find(
Arrow ${VERSION}
GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared
GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared arrow_static parquet_static
arrow_dataset_static
CPM_ARGS
GIT_REPOSITORY https://github.com/apache/arrow.git
GIT_TAG apache-arrow-${VERSION}
Expand Down Expand Up @@ -125,61 +184,65 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
"xsimd_SOURCE AUTO"
)

set(ARROW_FOUND TRUE)
set(ARROW_LIBRARIES "")
set(ARROW_FOUND
TRUE
PARENT_SCOPE
)

# Arrow_ADDED: set if CPM downloaded Arrow from Github Arrow_DIR: set if CPM found Arrow on the
# system/conda/etc.
if(Arrow_ADDED OR Arrow_DIR)
if(BUILD_STATIC)
list(APPEND ARROW_LIBRARIES arrow_static)
else()
list(APPEND ARROW_LIBRARIES arrow_shared)
endif()
if(BUILD_STATIC)
set(ARROW_LIBRARIES arrow_static)
else()
set(ARROW_LIBRARIES arrow_shared)
endif()

if(Arrow_DIR)
find_package(Arrow REQUIRED QUIET)
if(ENABLE_PARQUET)
if(NOT Parquet_DIR)
# Set this to enable `find_package(Parquet)`
set(Parquet_DIR "${Arrow_DIR}")
endif()
# Set this to enable `find_package(ArrowDataset)`
set(ArrowDataset_DIR "${Arrow_DIR}")
find_package(ArrowDataset REQUIRED QUIET)
# Arrow_DIR: set if CPM found Arrow on the system/conda/etc.
if(Arrow_DIR)
# This extra find_package is necessary because rapids_cpm_find does not propagate all the
# variables from find_package that we might need. This is especially problematic when
# rapids_cpm_find builds from source.
find_package(Arrow REQUIRED QUIET)
if(ENABLE_PARQUET)
# Setting Parquet_DIR is conditional because parquet may be installed independently of arrow.
if(NOT Parquet_DIR)
# Set this to enable `find_package(Parquet)`
set(Parquet_DIR "${Arrow_DIR}")
endif()
elseif(Arrow_ADDED)
# Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to
# target_include_directories. That defeats ccache.
file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util"
# Set this to enable `find_package(ArrowDataset)`
set(ArrowDataset_DIR "${Arrow_DIR}")
find_package(ArrowDataset REQUIRED QUIET)
endif()
# Arrow_ADDED: set if CPM downloaded Arrow from Github
elseif(Arrow_ADDED)
# Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to
# target_include_directories. That defeats ccache.
file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util"
)
if(ENABLE_PARQUET)
file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet"
)
if(ENABLE_PARQUET)
file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet"
)
endif()
#
# This shouldn't be necessary!
#
# Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared`
# targets in FindArrow, so for static source-builds, we have to do it after-the-fact.
#
# This only works because we know exactly which components we're using. Don't forget to update
# this list if we add more!
#
foreach(ARROW_LIBRARY ${ARROW_LIBRARIES})
target_include_directories(
${ARROW_LIBRARY}
INTERFACE "$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src/generated>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/hadoop/include>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/flatbuffers/include>"
)
endforeach()
endif()
# Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared`
# targets in FindArrow, so for static source-builds, we have to do it after-the-fact.
#
# This only works because we know exactly which components we're using. Don't forget to update
# this list if we add more!
#
foreach(ARROW_LIBRARY ${ARROW_LIBRARIES})
target_include_directories(
${ARROW_LIBRARY}
INTERFACE "$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src/generated>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/hadoop/include>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/flatbuffers/include>"
)
endforeach()
else()
set(ARROW_FOUND FALSE)
set(ARROW_FOUND
FALSE
PARENT_SCOPE
)
message(FATAL_ERROR "CUDF: Arrow library not found or downloaded.")
endif()

Expand Down Expand Up @@ -294,15 +357,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports)
endif()

set(ARROW_FOUND
"${ARROW_FOUND}"
PARENT_SCOPE
)
set(ARROW_LIBRARIES
"${ARROW_LIBRARIES}"
PARENT_SCOPE
)

endfunction()

if(NOT DEFINED CUDF_VERSION_Arrow)
Expand Down
16 changes: 16 additions & 0 deletions python/cudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,25 @@ project(
option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files"
OFF
)
option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF)
mark_as_advanced(USE_LIBARROW_FROM_PYARROW)

# If the user requested it we attempt to find CUDF.
if(FIND_CUDF_CPP)
if(USE_LIBARROW_FROM_PYARROW)
# We need to find arrow before libcudf since libcudf requires it but doesn't bundle it. TODO:
# These options should probably all become optional since in practice they aren't meaningful
# except in the case where we actually compile Arrow.
set(CUDF_USE_ARROW_STATIC OFF)
set(CUDF_ENABLE_ARROW_S3 OFF)
set(CUDF_ENABLE_ARROW_ORC OFF)
set(CUDF_ENABLE_ARROW_PYTHON OFF)
set(CUDF_ENABLE_ARROW_PARQUET OFF)
include(rapids-find)
include(rapids-export)
include(../../cpp/cmake/thirdparty/get_arrow.cmake)
endif()

find_package(cudf ${cudf_version} REQUIRED)
else()
set(cudf_FOUND OFF)
Expand Down