Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable building against the libarrow contained in pyarrow #12034

Merged
merged 22 commits into from
Nov 8, 2022
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
59df97b
Minimal set of changes to have CMake find libarrow inside a pip wheel.
vyasr Oct 28, 2022
1efbb8c
Add TODOs indicating potential areas for improvement in the CMake code.
vyasr Oct 28, 2022
9faa72e
Minimal set of changes to get a working cuDF Python against the pyarr…
vyasr Oct 28, 2022
6a44b42
Some more notes.
vyasr Oct 28, 2022
770322d
Remove unused variable.
vyasr Oct 28, 2022
638258e
Find arrow libs using execute_process.
vyasr Oct 28, 2022
d5bb59f
Remove unnecessary nested function.
vyasr Oct 28, 2022
b9d4c7b
Make the usage of pyarrow's libarrow a CMake option and clean up some…
vyasr Oct 31, 2022
fe0aec4
Revert "Make the usage of pyarrow's libarrow a CMake option and clean…
vyasr Oct 31, 2022
7429fb7
Fix case.
vyasr Oct 31, 2022
8cf93a5
Clean up some comments.
vyasr Oct 31, 2022
cb817ba
Make building against pyarrow libarrow an option.
vyasr Nov 1, 2022
e01e458
Merge remote-tracking branch 'origin/branch-22.12' into feature/build…
vyasr Nov 1, 2022
79d6ce7
Remove unnecessary to_arrow change.
vyasr Nov 1, 2022
df53eae
More cleanup of get_arrow.cmake.
vyasr Nov 1, 2022
498d509
Remove one more unnecessary arrow workaround.
vyasr Nov 1, 2022
dcd2e14
Merge remote-tracking branch 'origin/branch-22.12' into feature/build…
vyasr Nov 1, 2022
49cfe3c
Apply cmake-format.
vyasr Nov 1, 2022
713dd2c
Address some reviews.
vyasr Nov 8, 2022
5e3f609
Next round of review.
vyasr Nov 8, 2022
2c9c7bb
Put back accidentally removed propagation of arrow vars and remove no…
vyasr Nov 8, 2022
cfd45ed
Address remaining TODOs.
vyasr Nov 8, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ option(CUDA_ENABLE_LINEINFO
)
# cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking
option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF)
mark_as_advanced(USE_LIBARROW_FROM_PYARROW)

message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}")
message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}")
Expand Down
125 changes: 100 additions & 25 deletions cpp/cmake/thirdparty/get_arrow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,43 +20,92 @@

# cmake-lint: disable=R0912,R0913,R0915

include_guard(GLOBAL)

# Generate a FindArrow module for the case where we need to search for arrow
# within a pip install pyarrow.
function(find_libarrow_in_python_wheel PYARROW_VERSION)
string(REPLACE "." "" PYARROW_SO_VER "${PYARROW_VERSION}")
set(PYARROW_LIB libarrow.so.${PYARROW_SO_VER})

find_package(Python REQUIRED)
execute_process(
COMMAND "${Python_EXECUTABLE}"
-c "import pyarrow; print(pyarrow.get_library_dirs()[0])"
OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR
OUTPUT_STRIP_TRAILING_WHITESPACE
)
list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}")
rapids_find_generate_module(
Arrow
NO_CONFIG
VERSION "${PYARROW_VERSION}"
LIBRARY_NAMES "${PYARROW_LIB}"
BUILD_EXPORT_SET cudf-exports
INSTALL_EXPORT_SET cudf-exports
HEADER_NAMES arrow/python/arrow_to_pandas.h
)

find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL)
add_library(arrow_shared ALIAS Arrow::Arrow)

# TODO: Should these both be here? I think so, the
# `rapids_find_generate_module` doesn't seems to create the necessary
# `find_dependency` calls in the config files.
rapids_export_package(BUILD Arrow cudf-exports)
rapids_export_package(INSTALL Arrow cudf-exports)

list(POP_BACK CMAKE_PREFIX_PATH)
endfunction()

# This function finds arrow and sets any additional necessary environment variables.
function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON
ENABLE_PARQUET
)

if(USE_LIBARROW_FROM_PYARROW)
# Generate a FindArrow.cmake to find pyarrow's libarrow.so
find_libarrow_in_python_wheel(${VERSION})
set(ARROW_FOUND TRUE PARENT_SCOPE)
set(ARROW_LIBRARIES arrow_shared PARENT_SCOPE)

# When using the libarrow inside a wheel we must build libcudf with the old
# ABI because pyarrow's `libarrow.so` is compiled for manylinux2014
# (centos7 toolchain) which uses the old ABI. Note that these flags will
# often be redundant because we build wheels in manylinux containers that
# actually have the old libc++ anyway, but setting them explicitly ensures
# correct and consistent behavior in all other cases such as aarch builds
# on newer manylinux or testing builds in newer containers.
# TODO: Tests will not build successfully now without also propagating
# these options to builds of GTest. Similarly, benchmarks will not work
# without updating GBench (and possibly NVBench) builds. Do we care about
# that since we don't anticipate using this feature except when building
# wheels?
list(APPEND CUDF_CXX_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0)
list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0)
set(CUDF_CXX_FLAGS "${CUDF_CXX_FLAGS}" PARENT_SCOPE)
set(CUDF_CUDA_FLAGS "${CUDF_CUDA_FLAGS}" PARENT_SCOPE)

return()
endif()

# TODO: How would the targets have been created before this function is
# called? Is this just to ensure that the function is idempotent? If so, why
# do we need to set these variables again?
if(BUILD_STATIC)
if(TARGET arrow_static)
list(APPEND ARROW_LIBRARIES arrow_static)
set(ARROW_FOUND
TRUE
PARENT_SCOPE
)
set(ARROW_LIBRARIES
${ARROW_LIBRARIES}
PARENT_SCOPE
)
set(ARROW_FOUND TRUE PARENT_SCOPE)
set(ARROW_LIBRARIES arrow_static PARENT_SCOPE)
return()
endif()
else()
if(TARGET arrow_shared)
list(APPEND ARROW_LIBRARIES arrow_shared)
set(ARROW_FOUND
TRUE
PARENT_SCOPE
)
set(ARROW_LIBRARIES
${ARROW_LIBRARIES}
PARENT_SCOPE
)
set(ARROW_FOUND TRUE PARENT_SCOPE)
set(ARROW_LIBRARIES arrow_shared PARENT_SCOPE)
return()
endif()
endif()

set(ARROW_BUILD_SHARED ON)
set(ARROW_BUILD_STATIC OFF)
set(CPMAddOrFindPackage CPMFindPackage)

if(NOT ARROW_ARMV8_ARCH)
set(ARROW_ARMV8_ARCH "armv8-a")
endif()
Expand All @@ -70,7 +119,13 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
set(ARROW_BUILD_SHARED OFF)
# Turn off CPM using `find_package` so we always download and make sure we get proper static
# library
# TODO: Can we use `CPM_DOWNLOAD_<PackageName>` (aka `CPM_DOWNLOAD_ARROW`)
# instead? This is a big hammer, although I guess it is effectively limited
# which packages will be affected by the scope of the variable here.
set(CPM_DOWNLOAD_ALL TRUE)
else()
set(ARROW_BUILD_SHARED ON)
set(ARROW_BUILD_STATIC OFF)
endif()

set(ARROW_PYTHON_OPTIONS "")
Expand All @@ -91,7 +146,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB

rapids_cpm_find(
Arrow ${VERSION}
# TODO: Should we set the list of global targets conditionally based on
# whether we're building shared/static and whether parquet is enabled?
GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared
arrow_static parquet_static arrow_dataset_static
CPM_ARGS
GIT_REPOSITORY https://github.com/apache/arrow.git
GIT_TAG apache-arrow-${VERSION}
Expand Down Expand Up @@ -125,19 +183,25 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
"xsimd_SOURCE AUTO"
)

# TODO: Will this not get set by rapids_cpm_find?
set(ARROW_FOUND TRUE)
set(ARROW_LIBRARIES "")

# Arrow_ADDED: set if CPM downloaded Arrow from Github Arrow_DIR: set if CPM found Arrow on the
# system/conda/etc.
# Arrow_ADDED: set if CPM downloaded Arrow from Github
# Arrow_DIR: set if CPM found Arrow on the system/conda/etc.
if(Arrow_ADDED OR Arrow_DIR)
# TODO: We can set the ARROW_LIBRARIES in a single place based on
# BUILD_STATIC at the very beginning of this function and then get rid of
# the if directly above this one. Then this whole if/else becomes:
# if(Arrow_DIR)... elseif(Arrow_ADDED)...else() message(FATAL_ERROR)
if(BUILD_STATIC)
list(APPEND ARROW_LIBRARIES arrow_static)
else()
list(APPEND ARROW_LIBRARIES arrow_shared)
endif()

if(Arrow_DIR)
# TODO: Under what circumstances do we need a `find_package` _after_ `rapids_cpm_find`??
find_package(Arrow REQUIRED QUIET)
if(ENABLE_PARQUET)
if(NOT Parquet_DIR)
Expand Down Expand Up @@ -183,6 +247,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
message(FATAL_ERROR "CUDF: Arrow library not found or downloaded.")
endif()

# TODO: This can go into the block above for Arrow_ADDED
if(Arrow_ADDED)

set(arrow_code_string
Expand Down Expand Up @@ -230,6 +295,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
BUILD Arrow
VERSION ${VERSION}
EXPORT_SET arrow_targets
# TODO: What about all the other GLOBAL_TARGETS specified in the
# `rapids_cpm_find` command? Also, should this be constructed
# conditionally based on whether we're using shared or static libs for
# arrow?
GLOBAL_TARGETS arrow_shared arrow_static
NAMESPACE cudf::
FINAL_CODE_BLOCK arrow_code_string
Expand Down Expand Up @@ -279,6 +348,13 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
endif()
endif()
# We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency`
# TODO: Isn't the second call here redundant? The above `rapids_export`
# command will also encode a `find_dependency` in the INSTALL export set.
# It's less obvious whether we should still include the first
# `rapids_export_package` below since that will translate to a
# `find_dependency` call in the build config and `rapids-export` will put a
# `CPMFindPackage` there instead; is our goal to add `find_dependency`? Since
# it'll happen after `CPMFindPackage` I think it'll just be redundant.
rapids_export_package(BUILD Arrow cudf-exports)
rapids_export_package(INSTALL Arrow cudf-exports)

Expand All @@ -302,7 +378,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
"${ARROW_LIBRARIES}"
PARENT_SCOPE
)

endfunction()

if(NOT DEFINED CUDF_VERSION_Arrow)
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/interop/to_arrow.cu
Original file line number Diff line number Diff line change
Expand Up @@ -398,9 +398,9 @@ std::shared_ptr<arrow::Table> to_arrow(table_view input,
arrays.end(),
metadata.begin(),
std::back_inserter(fields),
[](auto const& array, auto const& meta) { return arrow::field(meta.name, array->type()); });
[](auto const& array, auto const& meta) { return std::make_shared<arrow::Field>(meta.name, array->type()); });
vyasr marked this conversation as resolved.
Show resolved Hide resolved

auto result = arrow::Table::Make(arrow::schema(fields), arrays);
auto result = arrow::Table::Make(arrow::schema(fields), arrays, input.num_rows());

// synchronize the stream because after the return the data may be accessed from the host before
// the above `cudaMemcpyAsync` calls have completed their copies (especially if pinned host
Expand Down
26 changes: 26 additions & 0 deletions python/cudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,26 @@ project(
option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files"
OFF
)
option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF)
mark_as_advanced(USE_LIBARROW_FROM_PYARROW)

# If the user requested it we attempt to find CUDF.
if(FIND_CUDF_CPP)
if(USE_LIBARROW_FROM_PYARROW)
# We need to find arrow before libcudf since libcudf requires it but
# doesn't bundle it.
# TODO: These options should probably all become optional since in practice
# they aren't meaningful except in the case where we actually compile Arrow.
set(CUDF_USE_ARROW_STATIC OFF)
set(CUDF_ENABLE_ARROW_S3 OFF)
set(CUDF_ENABLE_ARROW_ORC OFF)
set(CUDF_ENABLE_ARROW_PYTHON OFF)
set(CUDF_ENABLE_ARROW_PARQUET OFF)
include(rapids-find)
include(rapids-export)
include(../../cpp/cmake/thirdparty/get_arrow.cmake)
endif()

find_package(cudf ${cudf_version} REQUIRED)
else()
set(cudf_FOUND OFF)
Expand All @@ -59,6 +76,15 @@ if(NOT cudf_FOUND)
install(TARGETS cudf DESTINATION cudf/_lib/cpp)
endif()

if(USE_LIBARROW_FROM_PYARROW)
# TODO: It would be nice to avoid setting these globally, but we'd need to
# manually set this for each of the rapids-cython targets otherwise which
# seems undesirable. For now I'm setting this after the
# `add_subdirectory(cpp)` call to limit the scope, maybe that's good enough.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0")
endif()

include(rapids-cython)
rapids_cython_init()

Expand Down