Skip to content

Commit

Permalink
Add options to build Arrow with Python and Parquet support (#8670)
Browse files Browse the repository at this point in the history
This PR adds two options that apply when CPM builds Arrow during a libcudf source build: `CUDF_ENABLE_ARROW_PYTHON` and `CUDF_ENABLE_ARROW_PARQUET`.

These options enable building `libarrow.so` with Python and Parquet support, so that we can build the pyarrow and cuDF Cython after building libcudf. For example:

```shell
export PARALLEL_LEVEL=$(nproc --ignore=2)

# Clone cuDF
git clone --depth 1 --branch branch-21.08 https://github.com/rapidsai/cudf.git /opt/rapids/cudf

# Build and install libcudf (also builds libarrow/libarrow_cuda)
cmake -GNinja \
          -S /opt/rapids/cudf/cpp \
          -B /opt/rapids/cudf/cpp/build \
          -D CUDF_ENABLE_ARROW_S3=OFF \
          -D CUDF_ENABLE_ARROW_PYTHON=ON \
          -D CUDF_ENABLE_ARROW_PARQUET=ON \
 && cmake --build /opt/rapids/cudf/cpp/build -j${PARALLEL_LEVEL} -v --target install

# Build and install pyarrow
cd /opt/rapids/cudf/cpp/build/_deps/arrow-src/python \
 && ARROW_HOME=/usr/local \
    PYARROW_WITH_S3=OFF \
    PYARROW_WITH_ORC=ON \
    PYARROW_WITH_CUDA=ON \
    PYARROW_WITH_HDFS=OFF \
    PYARROW_WITH_FLIGHT=OFF \
    PYARROW_WITH_PLASMA=OFF \
    PYARROW_WITH_DATASET=ON \
    PYARROW_WITH_GANDIVA=OFF \
    PYARROW_WITH_PARQUET=ON \
    PYARROW_BUILD_TYPE=Release \
    PYARROW_CMAKE_GENERATOR=Ninja \
    PYARROW_PARALLEL=${PARALLEL_LEVEL} \
    ARROW_PYTHON_DIR=/opt/rapids/cudf/cpp/build/_deps/arrow-src/python \
 && python setup.py install --single-version-externally-managed --record=record.txt

# Build and install cudf python
cd /opt/rapids/cudf/python/cudf \
 && pip install --upgrade \
    "nvtx>=0.2.1" \
    "numba>=0.53.1" \
    "fsspec>=0.6.0" \
    "protobuf>=3.0.0" \
    "fastavro>=0.22.9" \
    "transformers>=4.8" \
    "pandas>=1.0,<1.3.0dev0" \
    "cmake-setuptools>=0.1.3" \
    "cupy-cuda112>7.1.0,<10.0.0a0" \
    "git+https://github.com/dask/dask.git@main" \
    "git+https://github.com/dask/distributed.git@main" \
    "git+https://github.com/rapidsai/dask-cuda.git@branch-21.08" \
 && python setup.py build_ext -j${PARALLEL_LEVEL} --inplace \
 && python setup.py install --single-version-externally-managed --record=record.txt

# Build and install dask_cudf python
cd /opt/rapids/cudf/python/dask_cudf \
 && python setup.py build_ext -j${PARALLEL_LEVEL} --inplace \
 && python setup.py install --single-version-externally-managed --record=record.txt
```

Authors:
  - Paul Taylor (https://github.com/trxcllnt)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)

URL: #8670
  • Loading branch information
trxcllnt authored Jul 8, 2021
1 parent 569282b commit 73df850
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 7 deletions.
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ option(BUILD_BENCHMARKS "Configure CMake to build (google & nvbench) benchmarks"
option(BUILD_SHARED_LIBS "Build cuDF shared libraries" ON)
option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON)
option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF)
option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF)
option(CUDF_ENABLE_ARROW_PARQUET "Find (or build) Arrow with Parquet support" OFF)
option(CUDF_ENABLE_ARROW_S3 "Build/Enable AWS S3 Arrow filesystem support" ON)
option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF)
option(DISABLE_DEPRECATION_WARNING "Disable warnings generated from deprecated declarations." OFF)
Expand Down
39 changes: 32 additions & 7 deletions cpp/cmake/thirdparty/CUDF_GetArrow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,10 @@
# limitations under the License.
#=============================================================================

function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3)
function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON ENABLE_PARQUET)

set(ARROW_BUILD_SHARED ON)
set(ARROW_BUILD_STATIC OFF)
set(ARROW_BUILD_S3 OFF)
set(CPMAddOrFindPackage CPMFindPackage)

if(NOT ARROW_ARMV8_ARCH)
Expand All @@ -36,10 +35,23 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3)
set(CPMAddOrFindPackage CPMAddPackage)
endif()

if(ENABLE_S3)
set(ARROW_BUILD_S3 ON)
set(ARROW_PYTHON_OPTIONS "")
if(ENABLE_PYTHON)
list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON")
# Arrow's logic to build Boost from source is busted, so we have to get it from the system.
list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM")
# Arrow's logic to find Thrift is busted, so we have to build it from
# source. Why can't we use `THRIFT_SOURCE BUNDLED` you might ask?
# Because that's _also_ busted. The only thing that seems to work is to
# set _all_ dependencies to bundled, then optionally override
# BOOST_SOURCE to SYSTEM.
list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE BUNDLED")
endif()

# Set this so Arrow correctly finds the CUDA toolkit when the build machine
# does not have the CUDA driver installed. This must be an env var.
set(ENV{CUDA_LIB_PATH} "${CUDAToolkit_LIBRARY_DIR}/stubs")

cmake_language(CALL ${CPMAddOrFindPackage}
NAME Arrow
VERSION ${VERSION}
Expand All @@ -55,7 +67,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3)
"ARROW_WITH_BACKTRACE ON"
"ARROW_CXXFLAGS -w"
"ARROW_JEMALLOC OFF"
"ARROW_S3 ${ARROW_BUILD_S3}"
"ARROW_S3 ${ENABLE_S3}"
# e.g. needed by blazingsql-io
"ARROW_PARQUET ${ENABLE_PARQUET}"
${ARROW_PYTHON_OPTIONS}
# Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off
"ARROW_USE_CCACHE OFF"
"ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}"
Expand Down Expand Up @@ -98,13 +113,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3)
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util")
file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/gpu/cuda_version.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/gpu")
if(ENABLE_PARQUET)
file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h"
DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet")
endif()
###
# This shouldn't be necessary!
#
# Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static`
# and `arrow_shared` targets in FindArrow and FindArrowCUDA respectively,
# so for static source-builds, we have to do it after-the-fact.
#
#
# This only works because we know exactly which components we're using.
# Don't forget to update this list if we add more!
###
Expand All @@ -129,4 +148,10 @@ endfunction()

set(CUDF_VERSION_Arrow 4.0.1)

find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3})
find_and_configure_arrow(
${CUDF_VERSION_Arrow}
${CUDF_USE_ARROW_STATIC}
${CUDF_ENABLE_ARROW_S3}
${CUDF_ENABLE_ARROW_PYTHON}
${CUDF_ENABLE_ARROW_PARQUET}
)

0 comments on commit 73df850

Please sign in to comment.