From 59df97b0ec0e90d0ef666222a118072e952428a2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 28 Oct 2022 10:25:40 -0700 Subject: [PATCH 01/20] Minimal set of changes to have CMake find libarrow inside a pip wheel. --- cpp/cmake/thirdparty/get_arrow.cmake | 71 ++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 21 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 9fa5b9d1658..64e3600628d 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -20,35 +20,64 @@ # cmake-lint: disable=R0912,R0913,R0915 +include_guard(GLOBAL) + +# Generate a FindArrow module for the case where we need to search for arrow +# within a pip install pyarrow. +function(find_libarrow_in_python_wheel VERSION) + function(find_arrow_lib _name _alias _lib) + # TODO: For now I'm hardcoding the path, that should be changed to use execute_process. + set(CUDF_PYARROW_WHEEL_DIR /home/vyasr/home/miniconda3/envs/cudf_dev/lib/python3.9/site-packages/pyarrow) + if(CUDF_PYARROW_WHEEL_DIR) + list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") + endif() + rapids_find_generate_module( + "${_name}" + NO_CONFIG + VERSION "${VERSION}" + LIBRARY_NAMES "${_lib}" + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports + HEADER_NAMES arrow/python/arrow_to_pandas.h + ) + + find_package(${_name} ${VERSION} MODULE REQUIRED GLOBAL) + add_library(${_alias} ALIAS ${_name}::${_name}) + + if(CUDF_PYARROW_WHEEL_DIR) + list(POP_BACK CMAKE_PREFIX_PATH) + endif() + endfunction() + + string(REPLACE "." "" PYARROW_SO_VER "${VERSION}") + find_arrow_lib(Arrow arrow_shared libarrow.so.${PYARROW_SO_VER}) +endfunction() + # This function finds arrow and sets any additional necessary environment variables. function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON ENABLE_PARQUET ) + # TODO: Hardcoding for now. + set(USE_ARROW_PIP TRUE) + if(USE_ARROW_PIP) + # Generate a FindArrow.cmake to find pyarrow's libarrow.so + find_libarrow_in_python_wheel(${VERSION}) + set(ARROW_FOUND TRUE PARENT_SCOPE) + set(ARROW_LIBRARIES arrow_shared PARENT_SCOPE) + return() + endif() + if(BUILD_STATIC) if(TARGET arrow_static) - list(APPEND ARROW_LIBRARIES arrow_static) - set(ARROW_FOUND - TRUE - PARENT_SCOPE - ) - set(ARROW_LIBRARIES - ${ARROW_LIBRARIES} - PARENT_SCOPE - ) + set(ARROW_FOUND TRUE PARENT_SCOPE) + set(ARROW_LIBRARIES arrow_static PARENT_SCOPE) return() endif() else() if(TARGET arrow_shared) - list(APPEND ARROW_LIBRARIES arrow_shared) - set(ARROW_FOUND - TRUE - PARENT_SCOPE - ) - set(ARROW_LIBRARIES - ${ARROW_LIBRARIES} - PARENT_SCOPE - ) + set(ARROW_FOUND TRUE PARENT_SCOPE) + set(ARROW_LIBRARIES arrow_shared PARENT_SCOPE) return() endif() endif() @@ -92,6 +121,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_cpm_find( Arrow ${VERSION} GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared + arrow_static parquet_static arrow_dataset_static CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} @@ -128,8 +158,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ARROW_FOUND TRUE) set(ARROW_LIBRARIES "") - # Arrow_ADDED: set if CPM downloaded Arrow from Github Arrow_DIR: set if CPM found Arrow on the - # system/conda/etc. + # Arrow_ADDED: set if CPM downloaded Arrow from Github + # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. if(Arrow_ADDED OR Arrow_DIR) if(BUILD_STATIC) list(APPEND ARROW_LIBRARIES arrow_static) @@ -302,7 +332,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB "${ARROW_LIBRARIES}" PARENT_SCOPE ) - endfunction() if(NOT DEFINED CUDF_VERSION_Arrow) From 1efbb8c44e1ff1c8f502eaf115a83d76d126d02e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 28 Oct 2022 10:28:35 -0700 Subject: [PATCH 02/20] Add TODOs indicating potential areas for improvement in the CMake code. --- cpp/cmake/thirdparty/get_arrow.cmake | 29 ++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 64e3600628d..9ed0e98ce21 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -68,6 +68,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB return() endif() + # TODO: How would the targets have been created before this function is + # called? Is that really something we should support? if(BUILD_STATIC) if(TARGET arrow_static) set(ARROW_FOUND TRUE PARENT_SCOPE) @@ -82,8 +84,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() endif() - set(ARROW_BUILD_SHARED ON) - set(ARROW_BUILD_STATIC OFF) + # TODO: AFAICT this variable is not used anywhere. Safe to remove? set(CPMAddOrFindPackage CPMFindPackage) if(NOT ARROW_ARMV8_ARCH) @@ -99,7 +100,11 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ARROW_BUILD_SHARED OFF) # Turn off CPM using `find_package` so we always download and make sure we get proper static # library + # TODO: Can we use `CPM_DOWNLOAD_` (aka `CPM_DOWNLOAD_ARROW`) instead? This is a big hammer. set(CPM_DOWNLOAD_ALL TRUE) + else() + set(ARROW_BUILD_SHARED ON) + set(ARROW_BUILD_STATIC OFF) endif() set(ARROW_PYTHON_OPTIONS "") @@ -120,6 +125,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_cpm_find( Arrow ${VERSION} + # TODO: Should we set the list of global targets conditionally based on + # whether we're building shared/static and whether parquet is enabled? GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared arrow_static parquet_static arrow_dataset_static CPM_ARGS @@ -155,12 +162,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB "xsimd_SOURCE AUTO" ) + # TODO: Will this not get set by rapids_cpm_find? set(ARROW_FOUND TRUE) set(ARROW_LIBRARIES "") # Arrow_ADDED: set if CPM downloaded Arrow from Github # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. if(Arrow_ADDED OR Arrow_DIR) + # TODO: We can set the ARROW_LIBRARIES in a single place based on + # BUILD_STATIC at the very beginning of this function and then get rid of + # the if directly above this one. Then this whole if/else becomes: + # if(Arrow_DIR)... elseif(Arrow_ADDED)...else() message(FATAL_ERROR) if(BUILD_STATIC) list(APPEND ARROW_LIBRARIES arrow_static) else() @@ -168,6 +180,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() if(Arrow_DIR) + # TODO: Under what circumstances do we need a `find_package` _after_ `rapids_cpm_find`?? find_package(Arrow REQUIRED QUIET) if(ENABLE_PARQUET) if(NOT Parquet_DIR) @@ -213,6 +226,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB message(FATAL_ERROR "CUDF: Arrow library not found or downloaded.") endif() + # TODO: This can go into the block above for Arrow_ADDED if(Arrow_ADDED) set(arrow_code_string @@ -260,6 +274,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB BUILD Arrow VERSION ${VERSION} EXPORT_SET arrow_targets + # TODO: What about all the other GLOBAL_TARGETS specified in the + # `rapids_cpm_find` command? Also, should this be constructed + # conditionally based on whether we're using shared or static libs for + # arrow? GLOBAL_TARGETS arrow_shared arrow_static NAMESPACE cudf:: FINAL_CODE_BLOCK arrow_code_string @@ -309,6 +327,13 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() endif() # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` + # TODO: Isn't the second call here redundant? The above `rapids_export` + # command will also encode a `find_dependency` in the INSTALL export set. + # It's less obvious whether we should still include the first + # `rapids_export_package` below since that will translate to a + # `find_dependency` call in the build config and `rapids-export` will put a + # `CPMFindPackage` there instead; is our goal to add `find_dependency`? Since + # it'll happen after `CPMFindPackage` I think it'll just be redundant. rapids_export_package(BUILD Arrow cudf-exports) rapids_export_package(INSTALL Arrow cudf-exports) From 9faa72e883284a8de5f1480d3f7da3a1a2d6a0fb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 28 Oct 2022 12:30:32 -0700 Subject: [PATCH 03/20] Minimal set of changes to get a working cuDF Python against the pyarrow lib. --- cpp/cmake/thirdparty/get_arrow.cmake | 18 ++++++++++++++++++ cpp/src/interop/to_arrow.cu | 4 ++-- python/cudf/CMakeLists.txt | 21 +++++++++++++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 9ed0e98ce21..50987abc6b4 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -44,6 +44,9 @@ function(find_libarrow_in_python_wheel VERSION) find_package(${_name} ${VERSION} MODULE REQUIRED GLOBAL) add_library(${_alias} ALIAS ${_name}::${_name}) + rapids_export_package(BUILD Arrow cudf-exports) + rapids_export_package(INSTALL Arrow cudf-exports) + if(CUDF_PYARROW_WHEEL_DIR) list(POP_BACK CMAKE_PREFIX_PATH) endif() @@ -65,6 +68,21 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB find_libarrow_in_python_wheel(${VERSION}) set(ARROW_FOUND TRUE PARENT_SCOPE) set(ARROW_LIBRARIES arrow_shared PARENT_SCOPE) + + # When using the libarrow inside a wheel we must build libcudf with the old + # ABI because pyarrow's `libarrow.so` is compiled for manylinux2014 + # (centos7 toolchain) which uses the old ABI. + # TODO: Tests will not build successfully now without also propagating + # these options to builds of GTest. Similarly, benchmarks will not work + # without updating GBench (and possibly NVBench) builds. Unclear if we are + # going to test that regularly anyway, though, so we may not need to update + # that. + # TODO: How to propagate these flags up to the Python build? + list(APPEND CUDF_CXX_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0) + list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0) + set(CUDF_CXX_FLAGS "${CUDF_CXX_FLAGS}" PARENT_SCOPE) + set(CUDF_CUDA_FLAGS "${CUDF_CUDA_FLAGS}" PARENT_SCOPE) + return() endif() diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index fb203e6c3c1..9d9c24b1ff1 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -398,9 +398,9 @@ std::shared_ptr to_arrow(table_view input, arrays.end(), metadata.begin(), std::back_inserter(fields), - [](auto const& array, auto const& meta) { return arrow::field(meta.name, array->type()); }); + [](auto const& array, auto const& meta) { return std::make_shared(meta.name, array->type()); }); - auto result = arrow::Table::Make(arrow::schema(fields), arrays); + auto result = arrow::Table::Make(arrow::schema(fields), arrays, input.num_rows()); // synchronize the stream because after the return the data may be accessed from the host before // the above `cudaMemcpyAsync` calls have completed their copies (especially if pinned host diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 0781a38e6ad..f0c19549a4a 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -34,6 +34,27 @@ option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulti # If the user requested it we attempt to find CUDF. if(FIND_CUDF_CPP) + set(USE_ARROW_PIP TRUE) + if(USE_ARROW_PIP) + # TODO: These options should probably all become optional since in practice + # they aren't meaningful except in the case where we actually compile Arrow. + set(CUDF_USE_ARROW_STATIC OFF) + set(CUDF_ENABLE_ARROW_S3 OFF) + set(CUDF_ENABLE_ARROW_ORC OFF) + SET(CUDF_ENABLE_ARROW_PYTHON OFF) + set(CUDF_ENABLE_ARROW_PARQUET OFF) + include(rapids-find) + include(rapids-export) + include(../../cpp/cmake/thirdparty/get_arrow.cmake) + + # TODO: We definitely shouldn't be setting these globally for all + # compilation, this is just a shortcut for now. Also when compiling against + # a static libcudf this shouldn't matter since we'll just get the right ABI + # symbols "for free". + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0") + endif() + find_package(cudf ${cudf_version} REQUIRED) else() set(cudf_FOUND OFF) From 6a44b42cc2c12d52c45b13ba7826f37426965e8c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 28 Oct 2022 13:16:16 -0700 Subject: [PATCH 04/20] Some more notes. --- cpp/cmake/thirdparty/get_arrow.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 50987abc6b4..bb4f7b78cf1 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -44,6 +44,9 @@ function(find_libarrow_in_python_wheel VERSION) find_package(${_name} ${VERSION} MODULE REQUIRED GLOBAL) add_library(${_alias} ALIAS ${_name}::${_name}) + # TODO: Should these both be here? I think so, the + # `rapids_find_generate_module` doesn't seems to create the necessary + # `find_dependency` calls in the config files. rapids_export_package(BUILD Arrow cudf-exports) rapids_export_package(INSTALL Arrow cudf-exports) @@ -78,6 +81,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # going to test that regularly anyway, though, so we may not need to update # that. # TODO: How to propagate these flags up to the Python build? + # Note: These flags will be redundant in some cases where we build wheels + # in manylinux containers that actually have the old libc++, but not in all + # cases e.g. aarch builds on newer manylinux or testing builds in newer + # containers. list(APPEND CUDF_CXX_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0) list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0) set(CUDF_CXX_FLAGS "${CUDF_CXX_FLAGS}" PARENT_SCOPE) From 770322d9c12bbf8282847c45103cf85569a2c081 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 28 Oct 2022 13:16:31 -0700 Subject: [PATCH 05/20] Remove unused variable. --- cpp/cmake/thirdparty/get_arrow.cmake | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index bb4f7b78cf1..05451e74dbe 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -109,9 +109,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() endif() - # TODO: AFAICT this variable is not used anywhere. Safe to remove? - set(CPMAddOrFindPackage CPMFindPackage) - if(NOT ARROW_ARMV8_ARCH) set(ARROW_ARMV8_ARCH "armv8-a") endif() From 638258e1479321bbd38d1717ce796db7633e59b7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 28 Oct 2022 14:16:50 -0700 Subject: [PATCH 06/20] Find arrow libs using execute_process. --- cpp/cmake/thirdparty/get_arrow.cmake | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 05451e74dbe..a52df86ab5d 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -26,11 +26,14 @@ include_guard(GLOBAL) # within a pip install pyarrow. function(find_libarrow_in_python_wheel VERSION) function(find_arrow_lib _name _alias _lib) - # TODO: For now I'm hardcoding the path, that should be changed to use execute_process. - set(CUDF_PYARROW_WHEEL_DIR /home/vyasr/home/miniconda3/envs/cudf_dev/lib/python3.9/site-packages/pyarrow) - if(CUDF_PYARROW_WHEEL_DIR) - list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") - endif() + find_package(Python REQUIRED) + execute_process( + COMMAND "${Python_EXECUTABLE}" + -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" + OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") rapids_find_generate_module( "${_name}" NO_CONFIG @@ -50,9 +53,7 @@ function(find_libarrow_in_python_wheel VERSION) rapids_export_package(BUILD Arrow cudf-exports) rapids_export_package(INSTALL Arrow cudf-exports) - if(CUDF_PYARROW_WHEEL_DIR) - list(POP_BACK CMAKE_PREFIX_PATH) - endif() + list(POP_BACK CMAKE_PREFIX_PATH) endfunction() string(REPLACE "." "" PYARROW_SO_VER "${VERSION}") From d5bb59fe808140bc9d956e501666c13f7a509e94 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 28 Oct 2022 14:49:57 -0700 Subject: [PATCH 07/20] Remove unnecessary nested function. --- cpp/cmake/thirdparty/get_arrow.cmake | 60 ++++++++++++++-------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index a52df86ab5d..7c89e0355a4 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -24,40 +24,38 @@ include_guard(GLOBAL) # Generate a FindArrow module for the case where we need to search for arrow # within a pip install pyarrow. -function(find_libarrow_in_python_wheel VERSION) - function(find_arrow_lib _name _alias _lib) - find_package(Python REQUIRED) - execute_process( - COMMAND "${Python_EXECUTABLE}" - -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" - OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") - rapids_find_generate_module( - "${_name}" - NO_CONFIG - VERSION "${VERSION}" - LIBRARY_NAMES "${_lib}" - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports - HEADER_NAMES arrow/python/arrow_to_pandas.h - ) - - find_package(${_name} ${VERSION} MODULE REQUIRED GLOBAL) - add_library(${_alias} ALIAS ${_name}::${_name}) +function(find_libarrow_in_python_wheel PYARROW_VERSION) + string(REPLACE "." "" PYARROW_SO_VER "${PYARROW_VERSION}") + set(PYARROW_LIB libarrow.so.${PYARROW_SO_VER}) + + find_package(Python REQUIRED) + execute_process( + COMMAND "${Python_EXECUTABLE}" + -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" + OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") + rapids_find_generate_module( + Arrow + NO_CONFIG + VERSION "${PYARROW_VERSION}" + LIBRARY_NAMES "${PYARROW_LIB}" + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports + HEADER_NAMES arrow/python/arrow_to_pandas.h + ) - # TODO: Should these both be here? I think so, the - # `rapids_find_generate_module` doesn't seems to create the necessary - # `find_dependency` calls in the config files. - rapids_export_package(BUILD Arrow cudf-exports) - rapids_export_package(INSTALL Arrow cudf-exports) + find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) + add_library(arrow_shared ALIAS Arrow::Arrow) - list(POP_BACK CMAKE_PREFIX_PATH) - endfunction() + # TODO: Should these both be here? I think so, the + # `rapids_find_generate_module` doesn't seems to create the necessary + # `find_dependency` calls in the config files. + rapids_export_package(BUILD Arrow cudf-exports) + rapids_export_package(INSTALL Arrow cudf-exports) - string(REPLACE "." "" PYARROW_SO_VER "${VERSION}") - find_arrow_lib(Arrow arrow_shared libarrow.so.${PYARROW_SO_VER}) + list(POP_BACK CMAKE_PREFIX_PATH) endfunction() # This function finds arrow and sets any additional necessary environment variables. From b9d4c7b055894fd457aec7180b73b31390265111 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 31 Oct 2022 15:09:01 -0700 Subject: [PATCH 08/20] Make the usage of pyarrow's libarrow a CMake option and clean up some comments. --- cpp/CMakeLists.txt | 2 ++ cpp/cmake/thirdparty/get_arrow.cmake | 27 +++++++++++++-------------- python/cudf/CMakeLists.txt | 5 +++-- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 289c432dea5..09f975ee414 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -68,6 +68,8 @@ option(CUDA_ENABLE_LINEINFO ) # cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) +option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF) +mark_as_advanced(USE_LIBARROW_FROM_PYARROW) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 7c89e0355a4..f9888d85797 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -63,9 +63,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB ENABLE_PARQUET ) - # TODO: Hardcoding for now. - set(USE_ARROW_PIP TRUE) - if(USE_ARROW_PIP) + if(USE_LIBARROW_FROM_PYARROW) # Generate a FindArrow.cmake to find pyarrow's libarrow.so find_libarrow_in_python_wheel(${VERSION}) set(ARROW_FOUND TRUE PARENT_SCOPE) @@ -73,17 +71,16 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # When using the libarrow inside a wheel we must build libcudf with the old # ABI because pyarrow's `libarrow.so` is compiled for manylinux2014 - # (centos7 toolchain) which uses the old ABI. + # (centos7 toolchain) which uses the old ABI. Note that these flags will + # often be redundant because we build wheels in manylinux containers that + # actually have the old libc++ anyway, but setting them explicitly ensures + # correct and consistent behavior in all other cases such as aarch builds + # on newer manylinux or testing builds in newer containers. # TODO: Tests will not build successfully now without also propagating # these options to builds of GTest. Similarly, benchmarks will not work - # without updating GBench (and possibly NVBench) builds. Unclear if we are - # going to test that regularly anyway, though, so we may not need to update - # that. - # TODO: How to propagate these flags up to the Python build? - # Note: These flags will be redundant in some cases where we build wheels - # in manylinux containers that actually have the old libc++, but not in all - # cases e.g. aarch builds on newer manylinux or testing builds in newer - # containers. + # without updating GBench (and possibly NVBench) builds. Do we care about + # that since we don't anticipate using this feature except when building + # wheels? list(APPEND CUDF_CXX_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0) list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0) set(CUDF_CXX_FLAGS "${CUDF_CXX_FLAGS}" PARENT_SCOPE) @@ -93,7 +90,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() # TODO: How would the targets have been created before this function is - # called? Is that really something we should support? + # called? Is this just to ensure that the function is idempotent? if(BUILD_STATIC) if(TARGET arrow_static) set(ARROW_FOUND TRUE PARENT_SCOPE) @@ -121,7 +118,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ARROW_BUILD_SHARED OFF) # Turn off CPM using `find_package` so we always download and make sure we get proper static # library - # TODO: Can we use `CPM_DOWNLOAD_` (aka `CPM_DOWNLOAD_ARROW`) instead? This is a big hammer. + # TODO: Can we use `CPM_DOWNLOAD_` (aka `CPM_DOWNLOAD_ARROW`) + # instead? This is a big hammer, although I guess it is effectively limited + # which packages will be affected by the scope of the variable here. set(CPM_DOWNLOAD_ALL TRUE) else() set(ARROW_BUILD_SHARED ON) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index f0c19549a4a..2fee9d898fe 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -31,11 +31,12 @@ project( option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files" OFF ) +option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF) +mark_as_advanced(USE_LIBARROW_FROM_PYARROW) # If the user requested it we attempt to find CUDF. if(FIND_CUDF_CPP) - set(USE_ARROW_PIP TRUE) - if(USE_ARROW_PIP) + if(USE_LIBARROW_FROM_PYARROW) # TODO: These options should probably all become optional since in practice # they aren't meaningful except in the case where we actually compile Arrow. set(CUDF_USE_ARROW_STATIC OFF) From fe0aec4684e756cf8eff71d279e61c974a1f26fa Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 31 Oct 2022 16:24:42 -0700 Subject: [PATCH 09/20] Revert "Make the usage of pyarrow's libarrow a CMake option and clean up some comments." This reverts commit b9d4c7b055894fd457aec7180b73b31390265111. --- cpp/CMakeLists.txt | 2 -- cpp/cmake/thirdparty/get_arrow.cmake | 27 ++++++++++++++------------- python/cudf/CMakeLists.txt | 5 ++--- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 09f975ee414..289c432dea5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -68,8 +68,6 @@ option(CUDA_ENABLE_LINEINFO ) # cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) -option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF) -mark_as_advanced(USE_LIBARROW_FROM_PYARROW) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index f9888d85797..7c89e0355a4 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -63,7 +63,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB ENABLE_PARQUET ) - if(USE_LIBARROW_FROM_PYARROW) + # TODO: Hardcoding for now. + set(USE_ARROW_PIP TRUE) + if(USE_ARROW_PIP) # Generate a FindArrow.cmake to find pyarrow's libarrow.so find_libarrow_in_python_wheel(${VERSION}) set(ARROW_FOUND TRUE PARENT_SCOPE) @@ -71,16 +73,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # When using the libarrow inside a wheel we must build libcudf with the old # ABI because pyarrow's `libarrow.so` is compiled for manylinux2014 - # (centos7 toolchain) which uses the old ABI. Note that these flags will - # often be redundant because we build wheels in manylinux containers that - # actually have the old libc++ anyway, but setting them explicitly ensures - # correct and consistent behavior in all other cases such as aarch builds - # on newer manylinux or testing builds in newer containers. + # (centos7 toolchain) which uses the old ABI. # TODO: Tests will not build successfully now without also propagating # these options to builds of GTest. Similarly, benchmarks will not work - # without updating GBench (and possibly NVBench) builds. Do we care about - # that since we don't anticipate using this feature except when building - # wheels? + # without updating GBench (and possibly NVBench) builds. Unclear if we are + # going to test that regularly anyway, though, so we may not need to update + # that. + # TODO: How to propagate these flags up to the Python build? + # Note: These flags will be redundant in some cases where we build wheels + # in manylinux containers that actually have the old libc++, but not in all + # cases e.g. aarch builds on newer manylinux or testing builds in newer + # containers. list(APPEND CUDF_CXX_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0) list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0) set(CUDF_CXX_FLAGS "${CUDF_CXX_FLAGS}" PARENT_SCOPE) @@ -90,7 +93,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() # TODO: How would the targets have been created before this function is - # called? Is this just to ensure that the function is idempotent? + # called? Is that really something we should support? if(BUILD_STATIC) if(TARGET arrow_static) set(ARROW_FOUND TRUE PARENT_SCOPE) @@ -118,9 +121,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ARROW_BUILD_SHARED OFF) # Turn off CPM using `find_package` so we always download and make sure we get proper static # library - # TODO: Can we use `CPM_DOWNLOAD_` (aka `CPM_DOWNLOAD_ARROW`) - # instead? This is a big hammer, although I guess it is effectively limited - # which packages will be affected by the scope of the variable here. + # TODO: Can we use `CPM_DOWNLOAD_` (aka `CPM_DOWNLOAD_ARROW`) instead? This is a big hammer. set(CPM_DOWNLOAD_ALL TRUE) else() set(ARROW_BUILD_SHARED ON) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 2fee9d898fe..f0c19549a4a 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -31,12 +31,11 @@ project( option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files" OFF ) -option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF) -mark_as_advanced(USE_LIBARROW_FROM_PYARROW) # If the user requested it we attempt to find CUDF. if(FIND_CUDF_CPP) - if(USE_LIBARROW_FROM_PYARROW) + set(USE_ARROW_PIP TRUE) + if(USE_ARROW_PIP) # TODO: These options should probably all become optional since in practice # they aren't meaningful except in the case where we actually compile Arrow. set(CUDF_USE_ARROW_STATIC OFF) From 7429fb7c845cc3e8a3b6976d7f24b6877f229e78 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 31 Oct 2022 16:28:16 -0700 Subject: [PATCH 10/20] Fix case. --- python/cudf/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index f0c19549a4a..7ab32180dc0 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -41,7 +41,7 @@ if(FIND_CUDF_CPP) set(CUDF_USE_ARROW_STATIC OFF) set(CUDF_ENABLE_ARROW_S3 OFF) set(CUDF_ENABLE_ARROW_ORC OFF) - SET(CUDF_ENABLE_ARROW_PYTHON OFF) + set(CUDF_ENABLE_ARROW_PYTHON OFF) set(CUDF_ENABLE_ARROW_PARQUET OFF) include(rapids-find) include(rapids-export) From 8cf93a5ee50f73a07553c2ba532147b34cb68e48 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 31 Oct 2022 16:39:52 -0700 Subject: [PATCH 11/20] Clean up some comments. --- cpp/cmake/thirdparty/get_arrow.cmake | 27 ++++++++++++++------------- python/cudf/CMakeLists.txt | 4 ++-- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 7c89e0355a4..585ace44112 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -64,8 +64,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB ) # TODO: Hardcoding for now. - set(USE_ARROW_PIP TRUE) - if(USE_ARROW_PIP) + set(USE_LIBARROW_FROM_PYARROW TRUE) + if(USE_LIBARROW_FROM_PYARROW) # Generate a FindArrow.cmake to find pyarrow's libarrow.so find_libarrow_in_python_wheel(${VERSION}) set(ARROW_FOUND TRUE PARENT_SCOPE) @@ -73,17 +73,16 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # When using the libarrow inside a wheel we must build libcudf with the old # ABI because pyarrow's `libarrow.so` is compiled for manylinux2014 - # (centos7 toolchain) which uses the old ABI. + # (centos7 toolchain) which uses the old ABI. Note that these flags will + # often be redundant because we build wheels in manylinux containers that + # actually have the old libc++ anyway, but setting them explicitly ensures + # correct and consistent behavior in all other cases such as aarch builds + # on newer manylinux or testing builds in newer containers. # TODO: Tests will not build successfully now without also propagating # these options to builds of GTest. Similarly, benchmarks will not work - # without updating GBench (and possibly NVBench) builds. Unclear if we are - # going to test that regularly anyway, though, so we may not need to update - # that. - # TODO: How to propagate these flags up to the Python build? - # Note: These flags will be redundant in some cases where we build wheels - # in manylinux containers that actually have the old libc++, but not in all - # cases e.g. aarch builds on newer manylinux or testing builds in newer - # containers. + # without updating GBench (and possibly NVBench) builds. Do we care about + # that since we don't anticipate using this feature except when building + # wheels? list(APPEND CUDF_CXX_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0) list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0) set(CUDF_CXX_FLAGS "${CUDF_CXX_FLAGS}" PARENT_SCOPE) @@ -93,7 +92,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() # TODO: How would the targets have been created before this function is - # called? Is that really something we should support? + # called? Is this just to ensure that the function is idempotent? if(BUILD_STATIC) if(TARGET arrow_static) set(ARROW_FOUND TRUE PARENT_SCOPE) @@ -121,7 +120,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ARROW_BUILD_SHARED OFF) # Turn off CPM using `find_package` so we always download and make sure we get proper static # library - # TODO: Can we use `CPM_DOWNLOAD_` (aka `CPM_DOWNLOAD_ARROW`) instead? This is a big hammer. + # TODO: Can we use `CPM_DOWNLOAD_` (aka `CPM_DOWNLOAD_ARROW`) + # instead? This is a big hammer, although I guess it is effectively limited + # which packages will be affected by the scope of the variable here. set(CPM_DOWNLOAD_ALL TRUE) else() set(ARROW_BUILD_SHARED ON) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 7ab32180dc0..a5043b7870b 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -34,8 +34,8 @@ option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulti # If the user requested it we attempt to find CUDF. if(FIND_CUDF_CPP) - set(USE_ARROW_PIP TRUE) - if(USE_ARROW_PIP) + set(USE_LIBARROW_FROM_PYARROW TRUE) + if(USE_LIBARROW_FROM_PYARROW) # TODO: These options should probably all become optional since in practice # they aren't meaningful except in the case where we actually compile Arrow. set(CUDF_USE_ARROW_STATIC OFF) From cb817ba9c302b4df3a0eb642030fa3a4e3e382a4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 31 Oct 2022 18:31:12 -0700 Subject: [PATCH 12/20] Make building against pyarrow libarrow an option. --- cpp/CMakeLists.txt | 2 ++ cpp/cmake/thirdparty/get_arrow.cmake | 5 ++--- python/cudf/CMakeLists.txt | 21 +++++++++++++-------- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 289c432dea5..09f975ee414 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -68,6 +68,8 @@ option(CUDA_ENABLE_LINEINFO ) # cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) +option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF) +mark_as_advanced(USE_LIBARROW_FROM_PYARROW) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 585ace44112..785ee7f9cc1 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -63,8 +63,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB ENABLE_PARQUET ) - # TODO: Hardcoding for now. - set(USE_LIBARROW_FROM_PYARROW TRUE) if(USE_LIBARROW_FROM_PYARROW) # Generate a FindArrow.cmake to find pyarrow's libarrow.so find_libarrow_in_python_wheel(${VERSION}) @@ -92,7 +90,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() # TODO: How would the targets have been created before this function is - # called? Is this just to ensure that the function is idempotent? + # called? Is this just to ensure that the function is idempotent? If so, why + # do we need to set these variables again? if(BUILD_STATIC) if(TARGET arrow_static) set(ARROW_FOUND TRUE PARENT_SCOPE) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index a5043b7870b..246f386d377 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -31,11 +31,14 @@ project( option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files" OFF ) +option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF) +mark_as_advanced(USE_LIBARROW_FROM_PYARROW) # If the user requested it we attempt to find CUDF. if(FIND_CUDF_CPP) - set(USE_LIBARROW_FROM_PYARROW TRUE) if(USE_LIBARROW_FROM_PYARROW) + # We need to find arrow before libcudf since libcudf requires it but + # doesn't bundle it. # TODO: These options should probably all become optional since in practice # they aren't meaningful except in the case where we actually compile Arrow. set(CUDF_USE_ARROW_STATIC OFF) @@ -46,13 +49,6 @@ if(FIND_CUDF_CPP) include(rapids-find) include(rapids-export) include(../../cpp/cmake/thirdparty/get_arrow.cmake) - - # TODO: We definitely shouldn't be setting these globally for all - # compilation, this is just a shortcut for now. Also when compiling against - # a static libcudf this shouldn't matter since we'll just get the right ABI - # symbols "for free". - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0") endif() find_package(cudf ${cudf_version} REQUIRED) @@ -80,6 +76,15 @@ if(NOT cudf_FOUND) install(TARGETS cudf DESTINATION cudf/_lib/cpp) endif() +if(USE_LIBARROW_FROM_PYARROW) + # TODO: It would be nice to avoid setting these globally, but we'd need to + # manually set this for each of the rapids-cython targets otherwise which + # seems undesirable. For now I'm setting this after the + # `add_subdirectory(cpp)` call to limit the scope, maybe that's good enough. + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0") +endif() + include(rapids-cython) rapids_cython_init() From 79d6ce703d62820ce87234f3ed2c193c0295b291 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 1 Nov 2022 13:53:06 -0700 Subject: [PATCH 13/20] Remove unnecessary to_arrow change. --- cpp/src/interop/to_arrow.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 9d9c24b1ff1..1d85f6b98e4 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -398,7 +398,7 @@ std::shared_ptr to_arrow(table_view input, arrays.end(), metadata.begin(), std::back_inserter(fields), - [](auto const& array, auto const& meta) { return std::make_shared(meta.name, array->type()); }); + [](auto const& array, auto const& meta) { return arrow::field(meta.name, array->type()); }); auto result = arrow::Table::Make(arrow::schema(fields), arrays, input.num_rows()); From df53eae887e5cc7607ed3388b6479182b4f567ae Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 1 Nov 2022 14:11:13 -0700 Subject: [PATCH 14/20] More cleanup of get_arrow.cmake. --- cpp/cmake/thirdparty/get_arrow.cmake | 112 ++++++++++++--------------- 1 file changed, 48 insertions(+), 64 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 785ee7f9cc1..b59fa290e22 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -184,70 +184,63 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB ) # TODO: Will this not get set by rapids_cpm_find? - set(ARROW_FOUND TRUE) - set(ARROW_LIBRARIES "") + set(ARROW_FOUND TRUE PARENT_SCOPE) - # Arrow_ADDED: set if CPM downloaded Arrow from Github - # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. - if(Arrow_ADDED OR Arrow_DIR) - # TODO: We can set the ARROW_LIBRARIES in a single place based on - # BUILD_STATIC at the very beginning of this function and then get rid of - # the if directly above this one. Then this whole if/else becomes: - # if(Arrow_DIR)... elseif(Arrow_ADDED)...else() message(FATAL_ERROR) - if(BUILD_STATIC) - list(APPEND ARROW_LIBRARIES arrow_static) - else() - list(APPEND ARROW_LIBRARIES arrow_shared) - endif() + if(BUILD_STATIC) + set(ARROW_LIBRARIES arrow_static PARENT_SCOPE) + else() + set(ARROW_LIBRARIES arrow_shared PARENT_SCOPE) + endif() - if(Arrow_DIR) - # TODO: Under what circumstances do we need a `find_package` _after_ `rapids_cpm_find`?? - find_package(Arrow REQUIRED QUIET) - if(ENABLE_PARQUET) - if(NOT Parquet_DIR) - # Set this to enable `find_package(Parquet)` - set(Parquet_DIR "${Arrow_DIR}") - endif() - # Set this to enable `find_package(ArrowDataset)` - set(ArrowDataset_DIR "${Arrow_DIR}") - find_package(ArrowDataset REQUIRED QUIET) + # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. + if(Arrow_DIR) + # TODO: Under what circumstances do we need a `find_package` _after_ `rapids_cpm_find`?? + find_package(Arrow REQUIRED QUIET) + if(ENABLE_PARQUET) + if(NOT Parquet_DIR) + # Set this to enable `find_package(Parquet)` + set(Parquet_DIR "${Arrow_DIR}") endif() - elseif(Arrow_ADDED) - # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to - # target_include_directories. That defeats ccache. - file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" + # Set this to enable `find_package(ArrowDataset)` + # TODO: Should this be conditional like the Parquet_DIR setting above? + set(ArrowDataset_DIR "${Arrow_DIR}") + find_package(ArrowDataset REQUIRED QUIET) + endif() + # Arrow_ADDED: set if CPM downloaded Arrow from Github + elseif(Arrow_ADDED) + # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to + # target_include_directories. That defeats ccache. + file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" + ) + if(ENABLE_PARQUET) + file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" ) - if(ENABLE_PARQUET) - file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" - ) - endif() - # - # This shouldn't be necessary! - # - # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` - # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. - # - # This only works because we know exactly which components we're using. Don't forget to update - # this list if we add more! - # - foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) - target_include_directories( - ${ARROW_LIBRARY} - INTERFACE "$" - "$" - "$" - "$" - ) - endforeach() endif() + # + # This shouldn't be necessary! + # + # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` + # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. + # + # This only works because we know exactly which components we're using. Don't forget to update + # this list if we add more! + # + foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) + target_include_directories( + ${ARROW_LIBRARY} + INTERFACE "$" + "$" + "$" + "$" + ) + endforeach() else() - set(ARROW_FOUND FALSE) + set(ARROW_FOUND FALSE PARENT_SCOPE) message(FATAL_ERROR "CUDF: Arrow library not found or downloaded.") endif() - # TODO: This can go into the block above for Arrow_ADDED if(Arrow_ADDED) set(arrow_code_string @@ -369,15 +362,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_export_find_package_root(BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) endif() - - set(ARROW_FOUND - "${ARROW_FOUND}" - PARENT_SCOPE - ) - set(ARROW_LIBRARIES - "${ARROW_LIBRARIES}" - PARENT_SCOPE - ) endfunction() if(NOT DEFINED CUDF_VERSION_Arrow) From 498d50903cd12c6a050d360fc49263a806cf20f5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 1 Nov 2022 14:14:09 -0700 Subject: [PATCH 15/20] Remove one more unnecessary arrow workaround. --- cpp/src/interop/to_arrow.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 1d85f6b98e4..fb203e6c3c1 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -400,7 +400,7 @@ std::shared_ptr to_arrow(table_view input, std::back_inserter(fields), [](auto const& array, auto const& meta) { return arrow::field(meta.name, array->type()); }); - auto result = arrow::Table::Make(arrow::schema(fields), arrays, input.num_rows()); + auto result = arrow::Table::Make(arrow::schema(fields), arrays); // synchronize the stream because after the return the data may be accessed from the host before // the above `cudaMemcpyAsync` calls have completed their copies (especially if pinned host From 49cfe3cbf39fd32c8ac707caa3d3315f346ec9d2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 1 Nov 2022 14:45:27 -0700 Subject: [PATCH 16/20] Apply cmake-format. --- cpp/cmake/thirdparty/get_arrow.cmake | 155 ++++++++++++++++----------- python/cudf/CMakeLists.txt | 14 ++- 2 files changed, 97 insertions(+), 72 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index b59fa290e22..332d40352d0 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -22,23 +22,21 @@ include_guard(GLOBAL) -# Generate a FindArrow module for the case where we need to search for arrow -# within a pip install pyarrow. +# Generate a FindArrow module for the case where we need to search for arrow within a pip install +# pyarrow. function(find_libarrow_in_python_wheel PYARROW_VERSION) string(REPLACE "." "" PYARROW_SO_VER "${PYARROW_VERSION}") set(PYARROW_LIB libarrow.so.${PYARROW_SO_VER}) find_package(Python REQUIRED) execute_process( - COMMAND "${Python_EXECUTABLE}" - -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" - OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR - OUTPUT_STRIP_TRAILING_WHITESPACE - ) + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" + OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") rapids_find_generate_module( - Arrow - NO_CONFIG + Arrow NO_CONFIG VERSION "${PYARROW_VERSION}" LIBRARY_NAMES "${PYARROW_LIB}" BUILD_EXPORT_SET cudf-exports @@ -49,9 +47,8 @@ function(find_libarrow_in_python_wheel PYARROW_VERSION) find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) add_library(arrow_shared ALIAS Arrow::Arrow) - # TODO: Should these both be here? I think so, the - # `rapids_find_generate_module` doesn't seems to create the necessary - # `find_dependency` calls in the config files. + # TODO: Should these both be here? I think so, the `rapids_find_generate_module` doesn't seems to + # create the necessary `find_dependency` calls in the config files. rapids_export_package(BUILD Arrow cudf-exports) rapids_export_package(INSTALL Arrow cudf-exports) @@ -66,42 +63,62 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB if(USE_LIBARROW_FROM_PYARROW) # Generate a FindArrow.cmake to find pyarrow's libarrow.so find_libarrow_in_python_wheel(${VERSION}) - set(ARROW_FOUND TRUE PARENT_SCOPE) - set(ARROW_LIBRARIES arrow_shared PARENT_SCOPE) - - # When using the libarrow inside a wheel we must build libcudf with the old - # ABI because pyarrow's `libarrow.so` is compiled for manylinux2014 - # (centos7 toolchain) which uses the old ABI. Note that these flags will - # often be redundant because we build wheels in manylinux containers that - # actually have the old libc++ anyway, but setting them explicitly ensures - # correct and consistent behavior in all other cases such as aarch builds - # on newer manylinux or testing builds in newer containers. - # TODO: Tests will not build successfully now without also propagating - # these options to builds of GTest. Similarly, benchmarks will not work - # without updating GBench (and possibly NVBench) builds. Do we care about - # that since we don't anticipate using this feature except when building - # wheels? + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + set(ARROW_LIBRARIES + arrow_shared + PARENT_SCOPE + ) + + # When using the libarrow inside a wheel we must build libcudf with the old ABI because + # pyarrow's `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old + # ABI. Note that these flags will often be redundant because we build wheels in manylinux + # containers that actually have the old libc++ anyway, but setting them explicitly ensures + # correct and consistent behavior in all other cases such as aarch builds on newer manylinux or + # testing builds in newer containers. TODO: Tests will not build successfully now without also + # propagating these options to builds of GTest. Similarly, benchmarks will not work without + # updating GBench (and possibly NVBench) builds. Do we care about that since we don't anticipate + # using this feature except when building wheels? list(APPEND CUDF_CXX_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0) list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0) - set(CUDF_CXX_FLAGS "${CUDF_CXX_FLAGS}" PARENT_SCOPE) - set(CUDF_CUDA_FLAGS "${CUDF_CUDA_FLAGS}" PARENT_SCOPE) + set(CUDF_CXX_FLAGS + "${CUDF_CXX_FLAGS}" + PARENT_SCOPE + ) + set(CUDF_CUDA_FLAGS + "${CUDF_CUDA_FLAGS}" + PARENT_SCOPE + ) return() endif() - # TODO: How would the targets have been created before this function is - # called? Is this just to ensure that the function is idempotent? If so, why - # do we need to set these variables again? + # TODO: How would the targets have been created before this function is called? Is this just to + # ensure that the function is idempotent? If so, why do we need to set these variables again? if(BUILD_STATIC) if(TARGET arrow_static) - set(ARROW_FOUND TRUE PARENT_SCOPE) - set(ARROW_LIBRARIES arrow_static PARENT_SCOPE) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + set(ARROW_LIBRARIES + arrow_static + PARENT_SCOPE + ) return() endif() else() if(TARGET arrow_shared) - set(ARROW_FOUND TRUE PARENT_SCOPE) - set(ARROW_LIBRARIES arrow_shared PARENT_SCOPE) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + set(ARROW_LIBRARIES + arrow_shared + PARENT_SCOPE + ) return() endif() endif() @@ -118,10 +135,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ARROW_BUILD_STATIC ON) set(ARROW_BUILD_SHARED OFF) # Turn off CPM using `find_package` so we always download and make sure we get proper static - # library - # TODO: Can we use `CPM_DOWNLOAD_` (aka `CPM_DOWNLOAD_ARROW`) - # instead? This is a big hammer, although I guess it is effectively limited - # which packages will be affected by the scope of the variable here. + # library TODO: Can we use `CPM_DOWNLOAD_` (aka `CPM_DOWNLOAD_ARROW`) instead? This + # is a big hammer, although I guess it is effectively limited which packages will be affected by + # the scope of the variable here. set(CPM_DOWNLOAD_ALL TRUE) else() set(ARROW_BUILD_SHARED ON) @@ -145,11 +161,12 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() rapids_cpm_find( - Arrow ${VERSION} - # TODO: Should we set the list of global targets conditionally based on - # whether we're building shared/static and whether parquet is enabled? - GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared - arrow_static parquet_static arrow_dataset_static + Arrow + ${VERSION} + # TODO: Should we set the list of global targets conditionally based on whether we're building + # shared/static and whether parquet is enabled? + GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared arrow_static parquet_static + arrow_dataset_static CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} @@ -184,12 +201,21 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB ) # TODO: Will this not get set by rapids_cpm_find? - set(ARROW_FOUND TRUE PARENT_SCOPE) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) if(BUILD_STATIC) - set(ARROW_LIBRARIES arrow_static PARENT_SCOPE) - else() - set(ARROW_LIBRARIES arrow_shared PARENT_SCOPE) + set(ARROW_LIBRARIES + arrow_static + PARENT_SCOPE + ) + else() + set(ARROW_LIBRARIES + arrow_shared + PARENT_SCOPE + ) endif() # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. @@ -201,12 +227,12 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # Set this to enable `find_package(Parquet)` set(Parquet_DIR "${Arrow_DIR}") endif() - # Set this to enable `find_package(ArrowDataset)` - # TODO: Should this be conditional like the Parquet_DIR setting above? + # Set this to enable `find_package(ArrowDataset)` TODO: Should this be conditional like the + # Parquet_DIR setting above? set(ArrowDataset_DIR "${Arrow_DIR}") find_package(ArrowDataset REQUIRED QUIET) endif() - # Arrow_ADDED: set if CPM downloaded Arrow from Github + # Arrow_ADDED: set if CPM downloaded Arrow from Github elseif(Arrow_ADDED) # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to # target_include_directories. That defeats ccache. @@ -237,7 +263,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB ) endforeach() else() - set(ARROW_FOUND FALSE PARENT_SCOPE) + set(ARROW_FOUND + FALSE + PARENT_SCOPE + ) message(FATAL_ERROR "CUDF: Arrow library not found or downloaded.") endif() @@ -288,10 +317,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB BUILD Arrow VERSION ${VERSION} EXPORT_SET arrow_targets - # TODO: What about all the other GLOBAL_TARGETS specified in the - # `rapids_cpm_find` command? Also, should this be constructed - # conditionally based on whether we're using shared or static libs for - # arrow? + # TODO: What about all the other GLOBAL_TARGETS specified in the `rapids_cpm_find` command? + # Also, should this be constructed conditionally based on whether we're using shared or static + # libs for arrow? GLOBAL_TARGETS arrow_shared arrow_static NAMESPACE cudf:: FINAL_CODE_BLOCK arrow_code_string @@ -341,13 +369,12 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() endif() # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` - # TODO: Isn't the second call here redundant? The above `rapids_export` - # command will also encode a `find_dependency` in the INSTALL export set. - # It's less obvious whether we should still include the first - # `rapids_export_package` below since that will translate to a - # `find_dependency` call in the build config and `rapids-export` will put a - # `CPMFindPackage` there instead; is our goal to add `find_dependency`? Since - # it'll happen after `CPMFindPackage` I think it'll just be redundant. + # TODO: Isn't the second call here redundant? The above `rapids_export` command will also encode a + # `find_dependency` in the INSTALL export set. It's less obvious whether we should still include + # the first `rapids_export_package` below since that will translate to a `find_dependency` call in + # the build config and `rapids-export` will put a `CPMFindPackage` there instead; is our goal to + # add `find_dependency`? Since it'll happen after `CPMFindPackage` I think it'll just be + # redundant. rapids_export_package(BUILD Arrow cudf-exports) rapids_export_package(INSTALL Arrow cudf-exports) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 2e42d2d890e..fca9523f371 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -37,10 +37,9 @@ mark_as_advanced(USE_LIBARROW_FROM_PYARROW) # If the user requested it we attempt to find CUDF. if(FIND_CUDF_CPP) if(USE_LIBARROW_FROM_PYARROW) - # We need to find arrow before libcudf since libcudf requires it but - # doesn't bundle it. - # TODO: These options should probably all become optional since in practice - # they aren't meaningful except in the case where we actually compile Arrow. + # We need to find arrow before libcudf since libcudf requires it but doesn't bundle it. TODO: + # These options should probably all become optional since in practice they aren't meaningful + # except in the case where we actually compile Arrow. set(CUDF_USE_ARROW_STATIC OFF) set(CUDF_ENABLE_ARROW_S3 OFF) set(CUDF_ENABLE_ARROW_ORC OFF) @@ -80,10 +79,9 @@ if(NOT cudf_FOUND) endif() if(USE_LIBARROW_FROM_PYARROW) - # TODO: It would be nice to avoid setting these globally, but we'd need to - # manually set this for each of the rapids-cython targets otherwise which - # seems undesirable. For now I'm setting this after the - # `add_subdirectory(cpp)` call to limit the scope, maybe that's good enough. + # TODO: It would be nice to avoid setting these globally, but we'd need to manually set this for + # each of the rapids-cython targets otherwise which seems undesirable. For now I'm setting this + # after the `add_subdirectory(cpp)` call to limit the scope, maybe that's good enough. set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0") endif() From 713dd2c4451087b25839f6123034e47aa4d053f4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 7 Nov 2022 17:18:54 -0800 Subject: [PATCH 17/20] Address some reviews. --- cpp/cmake/thirdparty/get_arrow.cmake | 36 ++++++++-------------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 332d40352d0..abea5fa5bcf 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -63,24 +63,15 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB if(USE_LIBARROW_FROM_PYARROW) # Generate a FindArrow.cmake to find pyarrow's libarrow.so find_libarrow_in_python_wheel(${VERSION}) - set(ARROW_FOUND - TRUE - PARENT_SCOPE - ) - set(ARROW_LIBRARIES - arrow_shared - PARENT_SCOPE - ) - # When using the libarrow inside a wheel we must build libcudf with the old ABI because # pyarrow's `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old # ABI. Note that these flags will often be redundant because we build wheels in manylinux # containers that actually have the old libc++ anyway, but setting them explicitly ensures # correct and consistent behavior in all other cases such as aarch builds on newer manylinux or - # testing builds in newer containers. TODO: Tests will not build successfully now without also + # testing builds in newer containers. Note that tests will not build successfully without also # propagating these options to builds of GTest. Similarly, benchmarks will not work without - # updating GBench (and possibly NVBench) builds. Do we care about that since we don't anticipate - # using this feature except when building wheels? + # updating GBench (and possibly NVBench) builds. We are currently ignoring these limitations + # since we don't anticipate using this feature except for building wheels. list(APPEND CUDF_CXX_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0) list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0) set(CUDF_CXX_FLAGS @@ -95,8 +86,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB return() endif() - # TODO: How would the targets have been created before this function is called? Is this just to - # ensure that the function is idempotent? If so, why do we need to set these variables again? if(BUILD_STATIC) if(TARGET arrow_static) set(ARROW_FOUND @@ -200,22 +189,15 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB "xsimd_SOURCE AUTO" ) - # TODO: Will this not get set by rapids_cpm_find? set(ARROW_FOUND TRUE PARENT_SCOPE ) if(BUILD_STATIC) - set(ARROW_LIBRARIES - arrow_static - PARENT_SCOPE - ) + set(ARROW_LIBRARIES arrow_static) else() - set(ARROW_LIBRARIES - arrow_shared - PARENT_SCOPE - ) + set(ARROW_LIBRARIES arrow_shared) endif() # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. @@ -244,9 +226,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" ) endif() - # - # This shouldn't be necessary! - # # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. # @@ -389,6 +368,11 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_export_find_package_root(BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) endif() + + set(ARROW_LIBRARIES + "${ARROW_LIBRARIES}" + PARENT_SCOPE + ) endfunction() if(NOT DEFINED CUDF_VERSION_Arrow) From 5e3f609b59b893113718da0e389dfa3718c7a6fd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 8 Nov 2022 08:50:34 -0800 Subject: [PATCH 18/20] Next round of review. --- cpp/cmake/thirdparty/get_arrow.cmake | 48 ++++++++++------------------ 1 file changed, 17 insertions(+), 31 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index abea5fa5bcf..9b5f50749c8 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -47,6 +47,20 @@ function(find_libarrow_in_python_wheel PYARROW_VERSION) find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) add_library(arrow_shared ALIAS Arrow::Arrow) + # When using the libarrow inside a wheel we must build libcudf with the old ABI because pyarrow's + # `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old ABI. Note + # that these flags will often be redundant because we build wheels in manylinux containers that + # actually have the old libc++ anyway, but setting them explicitly ensures correct and consistent + # behavior in all other cases such as aarch builds on newer manylinux or testing builds in newer + # containers. Note that tests will not build successfully without also propagating these options + # to builds of GTest. Similarly, benchmarks will not work without updating GBench (and possibly + # NVBench) builds. We are currently ignoring these limitations since we don't anticipate using + # this feature except for building wheels. + target_compile_options( + Arrow::Arrow INTERFACE "$<$:-D_GLIBCXX_USE_CXX11_ABI=0>" + "$<$:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>" + ) + # TODO: Should these both be here? I think so, the `rapids_find_generate_module` doesn't seems to # create the necessary `find_dependency` calls in the config files. rapids_export_package(BUILD Arrow cudf-exports) @@ -63,26 +77,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB if(USE_LIBARROW_FROM_PYARROW) # Generate a FindArrow.cmake to find pyarrow's libarrow.so find_libarrow_in_python_wheel(${VERSION}) - # When using the libarrow inside a wheel we must build libcudf with the old ABI because - # pyarrow's `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old - # ABI. Note that these flags will often be redundant because we build wheels in manylinux - # containers that actually have the old libc++ anyway, but setting them explicitly ensures - # correct and consistent behavior in all other cases such as aarch builds on newer manylinux or - # testing builds in newer containers. Note that tests will not build successfully without also - # propagating these options to builds of GTest. Similarly, benchmarks will not work without - # updating GBench (and possibly NVBench) builds. We are currently ignoring these limitations - # since we don't anticipate using this feature except for building wheels. - list(APPEND CUDF_CXX_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0) - list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0) - set(CUDF_CXX_FLAGS - "${CUDF_CXX_FLAGS}" - PARENT_SCOPE - ) - set(CUDF_CUDA_FLAGS - "${CUDF_CUDA_FLAGS}" - PARENT_SCOPE - ) - return() endif() @@ -124,10 +118,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ARROW_BUILD_STATIC ON) set(ARROW_BUILD_SHARED OFF) # Turn off CPM using `find_package` so we always download and make sure we get proper static - # library TODO: Can we use `CPM_DOWNLOAD_` (aka `CPM_DOWNLOAD_ARROW`) instead? This - # is a big hammer, although I guess it is effectively limited which packages will be affected by - # the scope of the variable here. - set(CPM_DOWNLOAD_ALL TRUE) + # library. + set(CPM_DOWNLOAD_Arrow TRUE) else() set(ARROW_BUILD_SHARED ON) set(ARROW_BUILD_STATIC OFF) @@ -150,10 +142,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() rapids_cpm_find( - Arrow - ${VERSION} - # TODO: Should we set the list of global targets conditionally based on whether we're building - # shared/static and whether parquet is enabled? + Arrow ${VERSION} GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared arrow_static parquet_static arrow_dataset_static CPM_ARGS @@ -296,9 +285,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB BUILD Arrow VERSION ${VERSION} EXPORT_SET arrow_targets - # TODO: What about all the other GLOBAL_TARGETS specified in the `rapids_cpm_find` command? - # Also, should this be constructed conditionally based on whether we're using shared or static - # libs for arrow? GLOBAL_TARGETS arrow_shared arrow_static NAMESPACE cudf:: FINAL_CODE_BLOCK arrow_code_string From 2c9c7bbcff1e7731b0034c4291868cff00968a83 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 8 Nov 2022 09:21:50 -0800 Subject: [PATCH 19/20] Put back accidentally removed propagation of arrow vars and remove now extraneous setting of compile options. --- cpp/cmake/thirdparty/get_arrow.cmake | 8 ++++++++ python/cudf/CMakeLists.txt | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 9b5f50749c8..0115bc772c6 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -77,6 +77,14 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB if(USE_LIBARROW_FROM_PYARROW) # Generate a FindArrow.cmake to find pyarrow's libarrow.so find_libarrow_in_python_wheel(${VERSION}) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + set(ARROW_LIBRARIES + arrow_shared + PARENT_SCOPE + ) return() endif() diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index fca9523f371..8a3224237b6 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -78,14 +78,6 @@ if(NOT cudf_FOUND) install(TARGETS cudf DESTINATION ${cython_lib_dir}) endif() -if(USE_LIBARROW_FROM_PYARROW) - # TODO: It would be nice to avoid setting these globally, but we'd need to manually set this for - # each of the rapids-cython targets otherwise which seems undesirable. For now I'm setting this - # after the `add_subdirectory(cpp)` call to limit the scope, maybe that's good enough. - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0") -endif() - rapids_cython_init() add_subdirectory(cudf/_lib) From cfd45ed6ea2aef146a0b2348de6023be1d52d5b0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 8 Nov 2022 11:05:36 -0800 Subject: [PATCH 20/20] Address remaining TODOs. --- cpp/cmake/thirdparty/get_arrow.cmake | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 0115bc772c6..94dcdcb5bc2 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -61,8 +61,6 @@ function(find_libarrow_in_python_wheel PYARROW_VERSION) "$<$:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>" ) - # TODO: Should these both be here? I think so, the `rapids_find_generate_module` doesn't seems to - # create the necessary `find_dependency` calls in the config files. rapids_export_package(BUILD Arrow cudf-exports) rapids_export_package(INSTALL Arrow cudf-exports) @@ -199,15 +197,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. if(Arrow_DIR) - # TODO: Under what circumstances do we need a `find_package` _after_ `rapids_cpm_find`?? + # This extra find_package is necessary because rapids_cpm_find does not propagate all the + # variables from find_package that we might need. This is especially problematic when + # rapids_cpm_find builds from source. find_package(Arrow REQUIRED QUIET) if(ENABLE_PARQUET) + # Setting Parquet_DIR is conditional because parquet may be installed independently of arrow. if(NOT Parquet_DIR) # Set this to enable `find_package(Parquet)` set(Parquet_DIR "${Arrow_DIR}") endif() - # Set this to enable `find_package(ArrowDataset)` TODO: Should this be conditional like the - # Parquet_DIR setting above? + # Set this to enable `find_package(ArrowDataset)` set(ArrowDataset_DIR "${Arrow_DIR}") find_package(ArrowDataset REQUIRED QUIET) endif() @@ -342,12 +342,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() endif() # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` - # TODO: Isn't the second call here redundant? The above `rapids_export` command will also encode a - # `find_dependency` in the INSTALL export set. It's less obvious whether we should still include - # the first `rapids_export_package` below since that will translate to a `find_dependency` call in - # the build config and `rapids-export` will put a `CPMFindPackage` there instead; is our goal to - # add `find_dependency`? Since it'll happen after `CPMFindPackage` I think it'll just be - # redundant. rapids_export_package(BUILD Arrow cudf-exports) rapids_export_package(INSTALL Arrow cudf-exports)