Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable building against the libarrow contained in pyarrow #12034

Merged
merged 22 commits into from
Nov 8, 2022
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
59df97b
Minimal set of changes to have CMake find libarrow inside a pip wheel.
vyasr Oct 28, 2022
1efbb8c
Add TODOs indicating potential areas for improvement in the CMake code.
vyasr Oct 28, 2022
9faa72e
Minimal set of changes to get a working cuDF Python against the pyarr…
vyasr Oct 28, 2022
6a44b42
Some more notes.
vyasr Oct 28, 2022
770322d
Remove unused variable.
vyasr Oct 28, 2022
638258e
Find arrow libs using execute_process.
vyasr Oct 28, 2022
d5bb59f
Remove unnecessary nested function.
vyasr Oct 28, 2022
b9d4c7b
Make the usage of pyarrow's libarrow a CMake option and clean up some…
vyasr Oct 31, 2022
fe0aec4
Revert "Make the usage of pyarrow's libarrow a CMake option and clean…
vyasr Oct 31, 2022
7429fb7
Fix case.
vyasr Oct 31, 2022
8cf93a5
Clean up some comments.
vyasr Oct 31, 2022
cb817ba
Make building against pyarrow libarrow an option.
vyasr Nov 1, 2022
e01e458
Merge remote-tracking branch 'origin/branch-22.12' into feature/build…
vyasr Nov 1, 2022
79d6ce7
Remove unnecessary to_arrow change.
vyasr Nov 1, 2022
df53eae
More cleanup of get_arrow.cmake.
vyasr Nov 1, 2022
498d509
Remove one more unnecessary arrow workaround.
vyasr Nov 1, 2022
dcd2e14
Merge remote-tracking branch 'origin/branch-22.12' into feature/build…
vyasr Nov 1, 2022
49cfe3c
Apply cmake-format.
vyasr Nov 1, 2022
713dd2c
Address some reviews.
vyasr Nov 8, 2022
5e3f609
Next round of review.
vyasr Nov 8, 2022
2c9c7bb
Put back accidentally removed propagation of arrow vars and remove no…
vyasr Nov 8, 2022
cfd45ed
Address remaining TODOs.
vyasr Nov 8, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ option(CUDA_ENABLE_LINEINFO
)
# cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking
option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF)
mark_as_advanced(USE_LIBARROW_FROM_PYARROW)

message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}")
message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}")
Expand Down
226 changes: 156 additions & 70 deletions cpp/cmake/thirdparty/get_arrow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,43 +20,109 @@

# cmake-lint: disable=R0912,R0913,R0915

include_guard(GLOBAL)

# Generate a FindArrow module for the case where we need to search for arrow within a pip install
# pyarrow.
function(find_libarrow_in_python_wheel PYARROW_VERSION)
string(REPLACE "." "" PYARROW_SO_VER "${PYARROW_VERSION}")
set(PYARROW_LIB libarrow.so.${PYARROW_SO_VER})

find_package(Python REQUIRED)
execute_process(
COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])"
OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR
OUTPUT_STRIP_TRAILING_WHITESPACE
)
list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}")
rapids_find_generate_module(
Arrow NO_CONFIG
VERSION "${PYARROW_VERSION}"
LIBRARY_NAMES "${PYARROW_LIB}"
BUILD_EXPORT_SET cudf-exports
INSTALL_EXPORT_SET cudf-exports
HEADER_NAMES arrow/python/arrow_to_pandas.h
)

find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL)
add_library(arrow_shared ALIAS Arrow::Arrow)

# TODO: Should these both be here? I think so, the `rapids_find_generate_module` doesn't seems to
# create the necessary `find_dependency` calls in the config files.
robertmaynard marked this conversation as resolved.
Show resolved Hide resolved
rapids_export_package(BUILD Arrow cudf-exports)
rapids_export_package(INSTALL Arrow cudf-exports)

list(POP_BACK CMAKE_PREFIX_PATH)
endfunction()

# This function finds arrow and sets any additional necessary environment variables.
function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON
ENABLE_PARQUET
)

if(USE_LIBARROW_FROM_PYARROW)
# Generate a FindArrow.cmake to find pyarrow's libarrow.so
find_libarrow_in_python_wheel(${VERSION})
set(ARROW_FOUND
TRUE
PARENT_SCOPE
)
set(ARROW_LIBRARIES
arrow_shared
PARENT_SCOPE
)

# When using the libarrow inside a wheel we must build libcudf with the old ABI because
# pyarrow's `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old
vyasr marked this conversation as resolved.
Show resolved Hide resolved
# ABI. Note that these flags will often be redundant because we build wheels in manylinux
# containers that actually have the old libc++ anyway, but setting them explicitly ensures
# correct and consistent behavior in all other cases such as aarch builds on newer manylinux or
# testing builds in newer containers. TODO: Tests will not build successfully now without also
# propagating these options to builds of GTest. Similarly, benchmarks will not work without
# updating GBench (and possibly NVBench) builds. Do we care about that since we don't anticipate
vyasr marked this conversation as resolved.
Show resolved Hide resolved
# using this feature except when building wheels?
list(APPEND CUDF_CXX_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0)
list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0)
set(CUDF_CXX_FLAGS
"${CUDF_CXX_FLAGS}"
PARENT_SCOPE
)
set(CUDF_CUDA_FLAGS
"${CUDF_CUDA_FLAGS}"
PARENT_SCOPE
)

return()
endif()

# TODO: How would the targets have been created before this function is called? Is this just to
# ensure that the function is idempotent? If so, why do we need to set these variables again?
vyasr marked this conversation as resolved.
Show resolved Hide resolved
if(BUILD_STATIC)
if(TARGET arrow_static)
list(APPEND ARROW_LIBRARIES arrow_static)
set(ARROW_FOUND
TRUE
PARENT_SCOPE
)
set(ARROW_LIBRARIES
${ARROW_LIBRARIES}
arrow_static
PARENT_SCOPE
)
return()
endif()
else()
if(TARGET arrow_shared)
list(APPEND ARROW_LIBRARIES arrow_shared)
set(ARROW_FOUND
TRUE
PARENT_SCOPE
)
set(ARROW_LIBRARIES
${ARROW_LIBRARIES}
arrow_shared
PARENT_SCOPE
)
return()
endif()
endif()

set(ARROW_BUILD_SHARED ON)
set(ARROW_BUILD_STATIC OFF)
set(CPMAddOrFindPackage CPMFindPackage)

if(NOT ARROW_ARMV8_ARCH)
set(ARROW_ARMV8_ARCH "armv8-a")
endif()
Expand All @@ -69,8 +135,13 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
set(ARROW_BUILD_STATIC ON)
set(ARROW_BUILD_SHARED OFF)
# Turn off CPM using `find_package` so we always download and make sure we get proper static
# library
# library TODO: Can we use `CPM_DOWNLOAD_<PackageName>` (aka `CPM_DOWNLOAD_ARROW`) instead? This
# is a big hammer, although I guess it is effectively limited which packages will be affected by
# the scope of the variable here.
vyasr marked this conversation as resolved.
Show resolved Hide resolved
set(CPM_DOWNLOAD_ALL TRUE)
else()
set(ARROW_BUILD_SHARED ON)
set(ARROW_BUILD_STATIC OFF)
endif()

set(ARROW_PYTHON_OPTIONS "")
Expand All @@ -90,8 +161,12 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
endif()

rapids_cpm_find(
Arrow ${VERSION}
GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared
Arrow
${VERSION}
# TODO: Should we set the list of global targets conditionally based on whether we're building
# shared/static and whether parquet is enabled?
vyasr marked this conversation as resolved.
Show resolved Hide resolved
GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared arrow_static parquet_static
arrow_dataset_static
CPM_ARGS
GIT_REPOSITORY https://github.com/apache/arrow.git
GIT_TAG apache-arrow-${VERSION}
Expand Down Expand Up @@ -125,61 +200,73 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
"xsimd_SOURCE AUTO"
)

set(ARROW_FOUND TRUE)
set(ARROW_LIBRARIES "")
# TODO: Will this not get set by rapids_cpm_find?
set(ARROW_FOUND
TRUE
PARENT_SCOPE
)

# Arrow_ADDED: set if CPM downloaded Arrow from Github Arrow_DIR: set if CPM found Arrow on the
# system/conda/etc.
if(Arrow_ADDED OR Arrow_DIR)
if(BUILD_STATIC)
list(APPEND ARROW_LIBRARIES arrow_static)
else()
list(APPEND ARROW_LIBRARIES arrow_shared)
endif()
if(BUILD_STATIC)
set(ARROW_LIBRARIES
vyasr marked this conversation as resolved.
Show resolved Hide resolved
arrow_static
PARENT_SCOPE
)
else()
set(ARROW_LIBRARIES
arrow_shared
PARENT_SCOPE
)
endif()

if(Arrow_DIR)
find_package(Arrow REQUIRED QUIET)
if(ENABLE_PARQUET)
if(NOT Parquet_DIR)
# Set this to enable `find_package(Parquet)`
set(Parquet_DIR "${Arrow_DIR}")
endif()
# Set this to enable `find_package(ArrowDataset)`
set(ArrowDataset_DIR "${Arrow_DIR}")
find_package(ArrowDataset REQUIRED QUIET)
# Arrow_DIR: set if CPM found Arrow on the system/conda/etc.
if(Arrow_DIR)
# TODO: Under what circumstances do we need a `find_package` _after_ `rapids_cpm_find`??
vyasr marked this conversation as resolved.
Show resolved Hide resolved
find_package(Arrow REQUIRED QUIET)
if(ENABLE_PARQUET)
if(NOT Parquet_DIR)
# Set this to enable `find_package(Parquet)`
set(Parquet_DIR "${Arrow_DIR}")
endif()
elseif(Arrow_ADDED)
# Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to
# target_include_directories. That defeats ccache.
file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util"
# Set this to enable `find_package(ArrowDataset)` TODO: Should this be conditional like the
# Parquet_DIR setting above?
set(ArrowDataset_DIR "${Arrow_DIR}")
find_package(ArrowDataset REQUIRED QUIET)
endif()
# Arrow_ADDED: set if CPM downloaded Arrow from Github
elseif(Arrow_ADDED)
# Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to
# target_include_directories. That defeats ccache.
file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util"
)
if(ENABLE_PARQUET)
file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet"
)
if(ENABLE_PARQUET)
file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet"
)
endif()
#
# This shouldn't be necessary!
#
# Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared`
# targets in FindArrow, so for static source-builds, we have to do it after-the-fact.
#
# This only works because we know exactly which components we're using. Don't forget to update
# this list if we add more!
#
foreach(ARROW_LIBRARY ${ARROW_LIBRARIES})
target_include_directories(
${ARROW_LIBRARY}
INTERFACE "$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src/generated>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/hadoop/include>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/flatbuffers/include>"
)
endforeach()
endif()
#
# This shouldn't be necessary!
vyasr marked this conversation as resolved.
Show resolved Hide resolved
#
# Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared`
# targets in FindArrow, so for static source-builds, we have to do it after-the-fact.
#
# This only works because we know exactly which components we're using. Don't forget to update
# this list if we add more!
#
foreach(ARROW_LIBRARY ${ARROW_LIBRARIES})
target_include_directories(
${ARROW_LIBRARY}
INTERFACE "$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/src/generated>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/hadoop/include>"
"$<BUILD_INTERFACE:${Arrow_SOURCE_DIR}/cpp/thirdparty/flatbuffers/include>"
)
endforeach()
else()
set(ARROW_FOUND FALSE)
set(ARROW_FOUND
FALSE
PARENT_SCOPE
)
message(FATAL_ERROR "CUDF: Arrow library not found or downloaded.")
endif()

Expand Down Expand Up @@ -230,6 +317,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
BUILD Arrow
VERSION ${VERSION}
EXPORT_SET arrow_targets
# TODO: What about all the other GLOBAL_TARGETS specified in the `rapids_cpm_find` command?
# Also, should this be constructed conditionally based on whether we're using shared or static
# libs for arrow?
vyasr marked this conversation as resolved.
Show resolved Hide resolved
GLOBAL_TARGETS arrow_shared arrow_static
NAMESPACE cudf::
FINAL_CODE_BLOCK arrow_code_string
Expand Down Expand Up @@ -279,6 +369,12 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
endif()
endif()
# We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency`
# TODO: Isn't the second call here redundant? The above `rapids_export` command will also encode a
# `find_dependency` in the INSTALL export set. It's less obvious whether we should still include
# the first `rapids_export_package` below since that will translate to a `find_dependency` call in
# the build config and `rapids-export` will put a `CPMFindPackage` there instead; is our goal to
# add `find_dependency`? Since it'll happen after `CPMFindPackage` I think it'll just be
# redundant.
vyasr marked this conversation as resolved.
Show resolved Hide resolved
rapids_export_package(BUILD Arrow cudf-exports)
rapids_export_package(INSTALL Arrow cudf-exports)

Expand All @@ -293,16 +389,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
rapids_export_find_package_root(BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports)
rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports)
endif()

set(ARROW_FOUND
"${ARROW_FOUND}"
PARENT_SCOPE
)
set(ARROW_LIBRARIES
"${ARROW_LIBRARIES}"
PARENT_SCOPE
)

endfunction()

if(NOT DEFINED CUDF_VERSION_Arrow)
Expand Down
24 changes: 24 additions & 0 deletions python/cudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,25 @@ project(
option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files"
OFF
)
option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF)
mark_as_advanced(USE_LIBARROW_FROM_PYARROW)

# If the user requested it we attempt to find CUDF.
if(FIND_CUDF_CPP)
if(USE_LIBARROW_FROM_PYARROW)
# We need to find arrow before libcudf since libcudf requires it but doesn't bundle it. TODO:
# These options should probably all become optional since in practice they aren't meaningful
# except in the case where we actually compile Arrow.
set(CUDF_USE_ARROW_STATIC OFF)
set(CUDF_ENABLE_ARROW_S3 OFF)
set(CUDF_ENABLE_ARROW_ORC OFF)
set(CUDF_ENABLE_ARROW_PYTHON OFF)
set(CUDF_ENABLE_ARROW_PARQUET OFF)
include(rapids-find)
include(rapids-export)
include(../../cpp/cmake/thirdparty/get_arrow.cmake)
endif()

find_package(cudf ${cudf_version} REQUIRED)
else()
set(cudf_FOUND OFF)
Expand Down Expand Up @@ -62,6 +78,14 @@ if(NOT cudf_FOUND)
install(TARGETS cudf DESTINATION ${cython_lib_dir})
endif()

if(USE_LIBARROW_FROM_PYARROW)
# TODO: It would be nice to avoid setting these globally, but we'd need to manually set this for
# each of the rapids-cython targets otherwise which seems undesirable. For now I'm setting this
# after the `add_subdirectory(cpp)` call to limit the scope, maybe that's good enough.
vyasr marked this conversation as resolved.
Show resolved Hide resolved
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0")
endif()

rapids_cython_init()

add_subdirectory(cudf/_lib)
Expand Down