Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEA] Add options to build Arrow with Python and Parquet support #8670

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ option(BUILD_BENCHMARKS "Configure CMake to build (google & nvbench) benchmarks"
option(BUILD_SHARED_LIBS "Build cuDF shared libraries" ON)
option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON)
option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF)
option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF)
option(CUDF_ENABLE_ARROW_PARQUET "Find (or build) Arrow with Parquet support" OFF)
option(CUDF_ENABLE_ARROW_S3 "Build/Enable AWS S3 Arrow filesystem support" ON)
option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF)
option(DISABLE_DEPRECATION_WARNING "Disable warnings generated from deprecated declarations." OFF)
Expand Down Expand Up @@ -272,7 +274,7 @@ add_library(cudf
src/join/join.cu
src/join/semi_join.cu
src/lists/contains.cu
src/lists/combine/concatenate_list_elements.cu
src/lists/combine/concatenate_list_elements.cu
src/lists/combine/concatenate_rows.cu
src/lists/copying/concatenate.cu
src/lists/copying/copying.cu
Expand Down
39 changes: 32 additions & 7 deletions cpp/cmake/thirdparty/CUDF_GetArrow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,10 @@
# limitations under the License.
#=============================================================================

function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3)
function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON ENABLE_PARQUET)

set(ARROW_BUILD_SHARED ON)
set(ARROW_BUILD_STATIC OFF)
set(ARROW_BUILD_S3 OFF)
set(CPMAddOrFindPackage CPMFindPackage)

if(NOT ARROW_ARMV8_ARCH)
Expand All @@ -36,10 +35,23 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3)
set(CPMAddOrFindPackage CPMAddPackage)
endif()

if(ENABLE_S3)
set(ARROW_BUILD_S3 ON)
set(ARROW_PYTHON_OPTIONS "")
if(ENABLE_PYTHON)
list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON")
# Arrow's logic to build Boost from source is busted, so we have to get it from the system.
list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM")
# Arrow's logic to find Thrift is busted, so we have to build it from
# source. Why can't we use `THRIFT_SOURCE BUNDLED` you might ask?
# Because that's _also_ busted. The only thing that seems to work is to set
# _all_ dependencies to bundled, then optionally override BOOST_SOURCE with
# SYSTEM.
list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE BUNDLED")
endif()

# Set this so Arrow correctly finds the CUDA toolkit when the build machine
# does not have the CUDA driver installed. This must be an env var.
set(ENV{CUDA_LIB_PATH} "${CUDAToolkit_LIBRARY_DIR}/stubs")

cmake_language(CALL ${CPMAddOrFindPackage}
NAME Arrow
VERSION ${VERSION}
Expand All @@ -55,7 +67,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3)
"ARROW_WITH_BACKTRACE ON"
"ARROW_CXXFLAGS -w"
"ARROW_JEMALLOC OFF"
"ARROW_S3 ${ARROW_BUILD_S3}"
"ARROW_S3 ${ENABLE_S3}"
# e.g. needed by blazingsql-io
"ARROW_PARQUET ${ENABLE_PARQUET}"
${ARROW_PYTHON_OPTIONS}
# Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off
"ARROW_USE_CCACHE OFF"
"ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}"
Expand Down Expand Up @@ -98,13 +113,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3)
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util")
file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/gpu/cuda_version.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/gpu")
if(ENABLE_PARQUET)
file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet")
endif()
###
# This shouldn't be necessary!
#
# Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static`
# and `arrow_shared` targets in FindArrow and FindArrowCUDA respectively,
# so for static source-builds, we have to do it after-the-fact.
#
#
# This only works because we know exactly which components we're using.
# Don't forget to update this list if we add more!
###
Expand All @@ -129,4 +148,10 @@ endfunction()

set(CUDF_VERSION_Arrow 4.0.1)

find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3})
find_and_configure_arrow(
${CUDF_VERSION_Arrow}
${CUDF_USE_ARROW_STATIC}
${CUDF_ENABLE_ARROW_S3}
${CUDF_ENABLE_ARROW_PYTHON}
${CUDF_ENABLE_ARROW_PARQUET}
)