Skip to content

Commit

Permalink
Use C++ to parse and filter parquet footers. (#199)
Browse files Browse the repository at this point in the history
Signed-off-by: Robert (Bobby) Evans <[email protected]>
  • Loading branch information
revans2 authored May 6, 2022
1 parent f39885d commit 4b8d8f8
Show file tree
Hide file tree
Showing 6 changed files with 776 additions and 5 deletions.
1 change: 1 addition & 0 deletions build-libcudf.xml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
<arg value="-DCMAKE_INSTALL_PREFIX=${libcudf.install.path}"/>
<arg value="-DCUDA_STATIC_RUNTIME=ON"/>
<arg value="-DCUDF_ENABLE_ARROW_S3=OFF"/>
<arg value="-DCUDF_ENABLE_ARROW_PARQUET=ON"/>
<arg value="-DCUDF_USE_ARROW_STATIC=ON"/>
<arg value="-DPER_THREAD_DEFAULT_STREAM=${PER_THREAD_DEFAULT_STREAM}" />
<arg value="-DRMM_LOGGING_LEVEL=${RMM_LOGGING_LEVEL}" />
Expand Down
10 changes: 5 additions & 5 deletions build/build-in-docker
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ LOCAL_MAVEN_REPO=${LOCAL_MAVEN_REPO:-"$HOME/.m2/repository"}
PER_THREAD_DEFAULT_STREAM=${PER_THREAD_DEFAULT_STREAM:-ON}
USE_GDS=${USE_GDS:-ON}

IMAGE_NAME="cudf-build:${CUDA_VERSION}-devel-centos7"
SPARK_IMAGE_NAME="spark-rapids-jni-build:${CUDA_VERSION}-devel-centos7"
WORKSPACE_DIR=/rapids
WORKSPACE_REPODIR="$WORKSPACE_DIR/spark-rapids-jni"
WORKSPACE_MAVEN_REPODIR="$WORKSPACE_DIR/.m2/repository"
Expand All @@ -41,10 +41,10 @@ if (( $# == 0 )); then
exit 1
fi

$DOCKER_CMD build -f $REPODIR/thirdparty/cudf/java/ci/Dockerfile.centos7 \
$DOCKER_CMD build -f $REPODIR/ci/Dockerfile \
--build-arg CUDA_VERSION=$CUDA_VERSION \
-t $IMAGE_NAME \
$REPODIR/thirdparty/cudf/java/ci
-t $SPARK_IMAGE_NAME \
$REPODIR/build

$DOCKER_CMD run -it -u $(id -u):$(id -g) --rm \
-v "/etc/group:/etc/group:ro" \
Expand All @@ -58,7 +58,7 @@ $DOCKER_CMD run -it -u $(id -u):$(id -g) --rm \
-e CUDA_VISIBLE_DEVICES \
-e PARALLEL_LEVEL \
-e VERBOSE \
$IMAGE_NAME \
$SPARK_IMAGE_NAME \
scl enable devtoolset-9 "mvn \
-Dmaven.repo.local=$WORKSPACE_MAVEN_REPODIR \
-DPER_THREAD_DEFAULT_STREAM=$PER_THREAD_DEFAULT_STREAM \
Expand Down
10 changes: 10 additions & 0 deletions ci/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,13 @@ RUN cd /usr/local/ && wget --quiet https://github.com/Kitware/CMake/releases/dow
tar zxf cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \
rm cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz
ENV PATH /usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:$PATH

## Install the Boost libraries (filesystem and system) that arrow/parquet needs to work.
## The source archive is fetched from archives.boost.io, the current home of Boost
## release downloads; the former boostorg.jfrog.io host no longer serves them.
## The source tree is removed again once the libraries are installed under /usr/local.
RUN cd /usr/local && wget --quiet https://archives.boost.io/release/1.79.0/source/boost_1_79_0.tar.gz && \
    tar -xzf boost_1_79_0.tar.gz && \
    rm boost_1_79_0.tar.gz && \
    cd boost_1_79_0 && \
    ./bootstrap.sh --prefix=/usr/local && \
    ./b2 install --prefix=/usr/local --with-filesystem --with-system && \
    cd /usr/local && \
    rm -rf boost_1_79_0
21 changes: 21 additions & 0 deletions src/main/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,27 @@ find_library(CUDFJNI_LIB "libcudfjni.a" REQUIRED NO_DEFAULT_PATH
HINTS "${PROJECT_BINARY_DIR}/../libcudfjni"
)

# Locate the static Parquet archive (libparquet.a) inside the libcudf install tree.
# NO_DEFAULT_PATH restricts the search to the hinted directory, and REQUIRED makes
# configuration fail when the archive is not found there.
find_library(
  PARQUET_LIB
  NAMES "libparquet.a"
  HINTS "${PROJECT_BINARY_DIR}/../libcudf-install/lib64"
  NO_DEFAULT_PATH
  REQUIRED
)

# Directory that provides Parquet's internal and generated thrift headers, taken from the
# Arrow sources inside the cudf build tree. The entry is cached as PATH rather than STRING
# because it names a directory: a relative value supplied with -D is then resolved to an
# absolute path, and CMake GUIs present a directory chooser for it.
set(GENERATED_PARQUET_INCLUDE
  "${CUDF_DIR}/cpp/build/_deps/arrow-src/cpp/src/"
  CACHE PATH "generated parquet thrift headers"
)

# Locate the static Thrift archive (libthrift.a) produced by Arrow's bundled thrift_ep
# build. Only the hinted directory is searched (NO_DEFAULT_PATH), and a missing archive
# aborts configuration (REQUIRED).
find_library(
  THRIFT_LIB
  NAMES "libthrift.a"
  HINTS "${CUDF_DIR}/cpp/build/_deps/arrow-build/thrift_ep-install/lib/"
  NO_DEFAULT_PATH
  REQUIRED
)

# List of additional include directories, in order:
#   1. the cudf Java native "include" directory,
#   2. the cudf Java native "src" directory,
#   3. the parquet header directory named by GENERATED_PARQUET_INCLUDE,
#   4. the thrift headers installed by Arrow's thrift_ep build.
# NOTE(review): presumably consumed by the include-directory setup of the JNI target,
# which is outside this view - confirm.
set(CUDFJNI_INCLUDE_DIRS
"${CUDF_DIR}/java/src/main/native/include"
"${CUDF_DIR}/java/src/main/native/src"
"${GENERATED_PARQUET_INCLUDE}"
"${CUDF_DIR}/cpp/build/_deps/arrow-build/thrift_ep-install/include/"
)

# ##################################################################################################
Expand All @@ -113,6 +131,7 @@ set(CUDFJNI_INCLUDE_DIRS
# Shared library target built from the row-conversion sources (C++ JNI glue and CUDA
# implementation) and the native Parquet JNI source. The target is declared first and
# its sources are attached separately so each file sits on its own line.
add_library(spark_rapids_jni SHARED)
target_sources(
  spark_rapids_jni
  PRIVATE src/RowConversionJni.cpp
          src/NativeParquetJni.cpp
          src/row_conversion.cu
)

Expand Down Expand Up @@ -159,6 +178,8 @@ target_link_libraries(
-Wl,--whole-archive
${CUDFJNI_LIB}
cudf::cudf
${PARQUET_LIB}
${THRIFT_LIB}
-Wl,--no-whole-archive
cudf::cudf
)
Expand Down
Loading

0 comments on commit 4b8d8f8

Please sign in to comment.