Skip to content

Commit

Permalink
ARROW-12699: [CI][Packaging][Java] Generate a jar compatible with Lin…
Browse files Browse the repository at this point in the history
…ux and MacOS for all Arrow components

Change the build to generate Arrow library jar files that contain the C++ shared libraries for both Linux and macOS.

**Note**: It only generates the artifact jars for the components that depend on C++ libraries at the end of the build: gandiva, adapter/orc, and dataset.

Closes #10300 from anthonylouisbsb/feature/generate-single-jar-for-all-jar-libraries

Authored-by: Anthony Louis <[email protected]>
Signed-off-by: Krisztián Szűcs <[email protected]>
  • Loading branch information
anthonylouisbsb authored and kszucs committed May 14, 2021
1 parent b01bcf2 commit 527c346
Show file tree
Hide file tree
Showing 11 changed files with 363 additions and 209 deletions.
33 changes: 18 additions & 15 deletions dev/tasks/gandiva-jars/build-java.sh → ci/docker/java-bundled-jars.dockerfile
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
Expand All @@ -16,19 +14,24 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
ARG base
FROM ${base}

set -e

CPP_BUILD_DIR=$GITHUB_WORKSPACE/arrow/dist/
# Install the libraries required by Gandiva to run
RUN vcpkg install --clean-after-build \
llvm \
boost-system \
boost-date-time \
boost-regex \
boost-predef \
boost-algorithm \
boost-locale \
boost-format \
boost-variant \
boost-multiprecision

pushd java
# build the entire project
mvn clean install -q -DskipTests -P arrow-jni -Darrow.cpp.build.dir=$CPP_BUILD_DIR
# test only gandiva
mvn test -q -P arrow-jni -pl gandiva -Dgandiva.cpp.build.dir=$CPP_BUILD_DIR
# Install dependencies
ARG java=1.8.0
RUN yum install -y java-$java-openjdk-devel && yum clean all

if [[ $COPY_JAR_TO_DISTRIBUTION_FOLDER ]] ; then
# copy the jars to distribution folder
find gandiva/target/ -name "*.jar" -not -name "*tests*" -exec cp {} $CPP_BUILD_DIR \;
fi
popd
ENV JAVA_HOME=/usr/lib/jvm/java-$java-openjdk/
Original file line number Diff line number Diff line change
Expand Up @@ -19,40 +19,34 @@

set -e

CPP_BUILD_DIR=$GITHUB_WORKSPACE/arrow/dist/
function check_dynamic_dependencies(){
local so_dep=$1
local library=$2
shift 2
local whitelist=("$@")

if [[ $OS_NAME == "linux" ]]; then
SO_DEP=ldd
GANDIVA_LIB="$CPP_BUILD_DIR"libgandiva_jni.so
WHITELIST=(linux-vdso libz librt libdl libpthread libstdc++ libm libgcc_s libc ld-linux-x86-64)
else
SO_DEP="otool -L"
GANDIVA_LIB="$CPP_BUILD_DIR"libgandiva_jni.dylib
WHITELIST=(libgandiva_jni libz libncurses libSystem libc++)
fi
# print the shared library dependencies
$so_dep "$library" | tee dependencies_temp_file.txt

# print the shared library dependencies
$SO_DEP "$GANDIVA_LIB" | tee dependencies_temp_file.txt

if [[ $CHECK_SHARED_DEPENDENCIES ]] ; then
# exit if any shared library not in whitelisted set is found
echo "Checking shared dependencies"

awk '{print $1}' dependencies_temp_file.txt | \
while read -r line
do
found=false
for item in "${WHITELIST[@]}"

for item in "${whitelist[@]}"
do
if [[ "$line" == *"$item"* ]] ; then
found=true
found=true
fi
done
done

if [[ "$found" == false ]] ; then
echo "Unexpected shared dependency found $line"
exit 1
fi
if [[ "$found" == false ]] ; then
echo "Unexpected shared dependency found in $library : $line"
exit 1
fi
done
fi

rm dependencies_temp_file.txt
}
41 changes: 41 additions & 0 deletions ci/scripts/java_bundled_jars_java_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

set -e

# Builds the Arrow Java project with the arrow-jni profile, runs the tests of
# the modules that depend on C++ libraries (adapter/orc, gandiva, dataset) and
# optionally copies their jars into the C++ build directory.
#
# Arguments:
#   $1 - root of the Arrow source tree; ${1}/java is built and
#        ${1}/testing/data is exported as ARROW_TEST_DATA
#   $2 - directory passed to Maven as the C++ build dir; the jars are copied
#        here as well
#   $3 - optional, defaults to "true"; pass false/0/no/off to skip copying
#        the jars
arrow_dir=${1:?arrow_dir (argument 1) is required}
cpp_build_dir=${2:?cpp_build_dir (argument 2) is required}
copy_jar_to_distribution_folder=${3:-true}
java_dir="${arrow_dir}/java"

export ARROW_TEST_DATA="${arrow_dir}/testing/data"

# The flag defaults to "true" and is therefore never empty, so a bare
# `[[ $flag ]]` non-emptiness test could not be switched off: "false" would
# still copy. Only explicit negative values disable the copy; any other value
# keeps the previous behavior.
case "${copy_jar_to_distribution_folder}" in
  false | False | FALSE | 0 | no | NO | off | OFF)
    copy_jars=false
    ;;
  *)
    copy_jars=true
    ;;
esac

pushd "${java_dir}"
# build the entire project
mvn clean install -DskipTests -P arrow-jni \
  "-Darrow.cpp.build.dir=${cpp_build_dir}"
# test jars that have cpp dependencies
# NOTE(review): the build passes arrow.cpp.build.dir while the tests pass
# gandiva.cpp.build.dir; confirm adapter/orc and dataset read the latter.
mvn test -P arrow-jni -pl adapter/orc,gandiva,dataset \
  "-Dgandiva.cpp.build.dir=${cpp_build_dir}"

if [[ "${copy_jars}" == "true" ]]; then
  # copy the jars that have cpp dependencies to the distribution folder,
  # leaving out the test jars
  for module in gandiva adapter/orc dataset; do
    find "${module}/target/" -name "*.jar" -not -name "*tests*" \
      -exec cp {} "${cpp_build_dir}" \;
  done
fi
popd
92 changes: 92 additions & 0 deletions ci/scripts/java_bundled_jars_macos_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

set -ex

# Builds and tests the Arrow C++ libraries needed by the Java JNI modules on
# macOS, copies the three JNI libraries to the distribution directory and
# checks their shared-library dependencies against a whitelist.
#
# Arguments:
#   $1 - root of the Arrow source tree (${1}/cpp is the CMake source dir)
#   $2 - CMake build directory; also used as the install prefix
#   $3 - directory where the final binaries are stored when the script finishes
arrow_dir=${1:?arrow_dir (argument 1) is required}
build_dir=${2:?build_dir (argument 2) is required}
# The directory where the final binaries will be stored when scripts finish
distribution_dir=${3:?distribution_dir (argument 3) is required}
source_dir="${arrow_dir}/cpp"

export ARROW_TEST_DATA="${arrow_dir}/testing/data"
export PARQUET_TEST_DATA="${source_dir}/submodules/parquet-testing/data"
export AWS_EC2_METADATA_DISABLED=TRUE

# Builds arrow + gandiva and tests the same.
mkdir -p "${build_dir}"
pushd "${build_dir}"
# Kept as an array so that each flag stays a single argument even when
# build_dir contains whitespace.
CMAKE_FLAGS=(
  -DCMAKE_BUILD_TYPE=Release
  -DARROW_GANDIVA=ON
  -DARROW_GANDIVA_JAVA=ON
  -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON
  -DARROW_ORC=ON
  -DARROW_JNI=ON
  -DARROW_PLASMA=ON
  -DARROW_PLASMA_JAVA_CLIENT=ON
  -DARROW_BUILD_TESTS=ON
  -DARROW_BUILD_UTILITIES=OFF
  -DPARQUET_REQUIRE_ENCRYPTION=OFF
  -DARROW_PARQUET=ON
  -DPARQUET_BUILD_EXAMPLES=OFF
  -DPARQUET_BUILD_EXECUTABLES=OFF
  -DARROW_FILESYSTEM=ON
  -DARROW_DATASET=ON
  -DARROW_BOOST_USE_SHARED=OFF
  -DARROW_PROTOBUF_USE_SHARED=OFF
  -DARROW_GFLAGS_USE_SHARED=OFF
  -DARROW_OPENSSL_USE_SHARED=OFF
  -DARROW_BROTLI_USE_SHARED=OFF
  -DARROW_BZ2_USE_SHARED=OFF
  -DARROW_GRPC_USE_SHARED=OFF
  -DARROW_LZ4_USE_SHARED=OFF
  -DARROW_SNAPPY_USE_SHARED=OFF
  -DARROW_THRIFT_USE_SHARED=OFF
  -DARROW_UTF8PROC_USE_SHARED=OFF
  -DARROW_ZSTD_USE_SHARED=OFF
  "-DCMAKE_INSTALL_PREFIX=${build_dir}"
  -DCMAKE_INSTALL_LIBDIR=lib
)

cmake "${CMAKE_FLAGS[@]}" "${source_dir}"
make -j4
make install
ctest

# Copy all generated libraries to the distribution folder
# (-L dereferences symlinks so the real library file is copied)
mkdir -p "${distribution_dir}"
cp -L "${build_dir}/lib/libgandiva_jni.dylib" "${distribution_dir}"
cp -L "${build_dir}/lib/libarrow_dataset_jni.dylib" "${distribution_dir}"
cp -L "${build_dir}/lib/libarrow_orc_jni.dylib" "${distribution_dir}"
popd

# Check if any library contains an unwhitelisted shared dependency
source "${arrow_dir}/ci/scripts/java_bundled_jars_check_dependencies.sh"
SO_DEP="otool -L"

GANDIVA_LIB="${distribution_dir}/libgandiva_jni.dylib"
DATASET_LIB="${distribution_dir}/libarrow_dataset_jni.dylib"
ORC_LIB="${distribution_dir}/libarrow_orc_jni.dylib"
LIBRARIES=("${GANDIVA_LIB}" "${ORC_LIB}" "${DATASET_LIB}")

WHITELIST=(libgandiva_jni libarrow_orc_jni libarrow_dataset_jni libz libncurses libSystem libc++)

for library in "${LIBRARIES[@]}"
do
  # SO_DEP must be passed quoted: check_dynamic_dependencies takes the tool
  # as its first argument and the library as its second. Unquoted,
  # "otool -L" would be split in two, making "-L" the library and pushing
  # the real library path into the whitelist.
  check_dynamic_dependencies "${SO_DEP}" "${library}" "${WHITELIST[@]}"
done
124 changes: 124 additions & 0 deletions ci/scripts/java_bundled_jars_manylinux_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Quit on failure
set -e

# Builds and tests the Arrow C++ libraries needed by the Java JNI modules with
# vcpkg-provided dependencies, copies the three JNI libraries to the
# distribution directory and checks their shared-library dependencies against
# a whitelist.
#
# Arguments:
#   $1 - root of the Arrow source tree (${1}/cpp is the CMake source dir)
#   $2 - CMake build directory; it is deleted and recreated, and is also used
#        as the install prefix
#   $3 - directory where the final binaries are stored when the script finishes
arrow_dir=${1:?arrow_dir (argument 1) is required}
build_dir=${2:?build_dir (argument 2) is required}
# The directory where the final binaries will be stored when scripts finish
distribution_dir=${3:?distribution_dir (argument 3) is required}
source_dir="${arrow_dir}/cpp"

# Build options; each one can be overridden from the environment. They are
# assigned before the first banner because the banners print PYTHON_VERSION.
# NOTE(review): CMAKE_UNITY_BUILD, CMAKE_GENERATOR and VCPKG_FEATURE_FLAGS are
# given defaults here but are neither exported nor passed to cmake below;
# presumably they only matter when already exported by the caller - confirm.
: "${ARROW_DATASET:=ON}"
: "${ARROW_GANDIVA:=ON}"
: "${ARROW_GANDIVA_JAVA:=ON}"
: "${ARROW_FILESYSTEM:=ON}"
: "${ARROW_JEMALLOC:=ON}"
: "${ARROW_RPATH_ORIGIN:=ON}"
: "${ARROW_ORC:=ON}"
: "${ARROW_PARQUET:=ON}"
: "${ARROW_PLASMA:=ON}"
: "${ARROW_PLASMA_JAVA_CLIENT:=ON}"
: "${ARROW_PYTHON:=OFF}"
: "${ARROW_JNI:=ON}"
: "${ARROW_BUILD_TESTS:=ON}"
: "${CMAKE_BUILD_TYPE:=Release}"
: "${CMAKE_UNITY_BUILD:=ON}"
: "${CMAKE_GENERATOR:=Ninja}"
: "${VCPKG_FEATURE_FLAGS:=-manifests}"
: "${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-linux-static-${CMAKE_BUILD_TYPE}}}"
: "${PYTHON_VERSION:=3.7}"
: "${GANDIVA_CXX_FLAGS:=-isystem;/opt/rh/devtoolset-9/root/usr/include/c++/9;-isystem;/opt/rh/devtoolset-9/root/usr/include/c++/9/x86_64-redhat-linux;-isystem;-lpthread}"

echo "=== (${PYTHON_VERSION}) Clear output directories and leftovers ==="
# Clear output directories and leftovers
rm -rf -- "${build_dir}"

echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ==="
mkdir -p "${build_dir}"
pushd "${build_dir}"
export ARROW_TEST_DATA="${arrow_dir}/testing/data"
export PARQUET_TEST_DATA="${source_dir}/submodules/parquet-testing/data"
export AWS_EC2_METADATA_DISABLED=TRUE

cmake "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" \
  -DARROW_DEPENDENCY_SOURCE="VCPKG" \
  "-DCMAKE_INSTALL_PREFIX=${build_dir}" \
  -DCMAKE_INSTALL_LIBDIR=lib \
  "-DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS}" \
  -DARROW_BUILD_SHARED=ON \
  -DARROW_BOOST_USE_SHARED=OFF \
  -DARROW_PROTOBUF_USE_SHARED=OFF \
  -DARROW_OPENSSL_USE_SHARED=OFF \
  -DARROW_BROTLI_USE_SHARED=OFF \
  -DARROW_BZ2_USE_SHARED=OFF \
  -DARROW_GRPC_USE_SHARED=OFF \
  -DARROW_LZ4_USE_SHARED=OFF \
  -DARROW_SNAPPY_USE_SHARED=OFF \
  -DARROW_THRIFT_USE_SHARED=OFF \
  -DARROW_UTF8PROC_USE_SHARED=OFF \
  -DARROW_ZSTD_USE_SHARED=OFF \
  "-DARROW_GANDIVA_PC_CXX_FLAGS=${GANDIVA_CXX_FLAGS}" \
  "-DARROW_JEMALLOC=${ARROW_JEMALLOC}" \
  "-DARROW_RPATH_ORIGIN=${ARROW_RPATH_ORIGIN}" \
  "-DARROW_PYTHON=${ARROW_PYTHON}" \
  "-DARROW_PARQUET=${ARROW_PARQUET}" \
  "-DARROW_DATASET=${ARROW_DATASET}" \
  "-DARROW_FILESYSTEM=${ARROW_FILESYSTEM}" \
  -DPARQUET_REQUIRE_ENCRYPTION=OFF \
  -DPARQUET_BUILD_EXAMPLES=OFF \
  -DPARQUET_BUILD_EXECUTABLES=OFF \
  -DPythonInterp_FIND_VERSION=ON \
  -DPythonInterp_FIND_VERSION_MAJOR=3 \
  "-DARROW_GANDIVA=${ARROW_GANDIVA}" \
  "-DARROW_GANDIVA_JAVA=${ARROW_GANDIVA_JAVA}" \
  "-DARROW_ORC=${ARROW_ORC}" \
  "-DARROW_JNI=${ARROW_JNI}" \
  "-DARROW_PLASMA=${ARROW_PLASMA}" \
  "-DARROW_PLASMA_JAVA_CLIENT=${ARROW_PLASMA_JAVA_CLIENT}" \
  -DARROW_BUILD_UTILITIES=OFF \
  -DVCPKG_MANIFEST_MODE=OFF \
  "-DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET}" \
  -GNinja "${source_dir}"
ninja install
CTEST_OUTPUT_ON_FAILURE=1 ninja test
popd

echo "=== (${PYTHON_VERSION}) Copying libraries to the distribution folder ==="
# -L dereferences symlinks so the real library file is copied
mkdir -p "${distribution_dir}"
cp -L "${build_dir}/lib/libgandiva_jni.so" "${distribution_dir}"
cp -L "${build_dir}/lib/libarrow_dataset_jni.so" "${distribution_dir}"
cp -L "${build_dir}/lib/libarrow_orc_jni.so" "${distribution_dir}"

echo "=== (${PYTHON_VERSION}) Checking shared dependencies for libraries ==="
source "${arrow_dir}/ci/scripts/java_bundled_jars_check_dependencies.sh"
SO_DEP=ldd

GANDIVA_LIB="${distribution_dir}/libgandiva_jni.so"
DATASET_LIB="${distribution_dir}/libarrow_dataset_jni.so"
ORC_LIB="${distribution_dir}/libarrow_orc_jni.so"
LIBRARIES=("${GANDIVA_LIB}" "${ORC_LIB}" "${DATASET_LIB}")

WHITELIST=(linux-vdso libz librt libdl libpthread libstdc++ libm libgcc_s libc ld-linux-x86-64)

for library in "${LIBRARIES[@]}"
do
  # The tool is the first argument and the library the second; both are
  # quoted so that neither can be split into extra arguments.
  check_dynamic_dependencies "${SO_DEP}" "${library}" "${WHITELIST[@]}"
done
Loading

0 comments on commit 527c346

Please sign in to comment.