Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into test-pyarrow-device…
Browse files Browse the repository at this point in the history
…-interface
  • Loading branch information
jorisvandenbossche committed Feb 28, 2024
2 parents d64f0e0 + 5ce060a commit 6e0870f
Show file tree
Hide file tree
Showing 197 changed files with 6,340 additions and 1,390 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/dev_pr/link.js
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ async function commentJIRAURL(github, context, pullRequestNumber, jiraID) {
async function commentGitHubURL(github, context, pullRequestNumber, issueID) {
// Make the call to ensure issue exists before adding comment
const issueInfo = await helpers.getGitHubInfo(github, context, issueID, pullRequestNumber);
const message = "* Closes: #" + issueInfo.number
const message = "* GitHub Issue: #" + issueInfo.number
if (issueInfo) {
const body = context.payload.pull_request.body || "";
if (body.includes(message)) {
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ jobs:
timeout-minutes: 60
env:
ARROW_HOME: /usr/local
ARROW_AZURE: ON
ARROW_DATASET: ON
ARROW_FLIGHT: ON
ARROW_GANDIVA: ON
Expand Down
2 changes: 1 addition & 1 deletion ci/conda_env_archery.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jira
pygit2
pygithub
ruamel.yaml
setuptools_scm<8.0.0
setuptools_scm
toolz

# benchmark
Expand Down
6 changes: 6 additions & 0 deletions ci/conda_env_cpp.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
# under the License.

aws-sdk-cpp=1.11.68
azure-core-cpp>=1.10.3
azure-identity-cpp>=1.6.0
azure-storage-blobs-cpp>=12.10.0
azure-storage-common-cpp>=12.5.0
azure-storage-files-datalake-cpp>=12.9.0
benchmark>=1.6.0
boost-cpp>=1.68.0
brotli
Expand All @@ -34,6 +39,7 @@ libutf8proc
lz4-c
make
ninja
nodejs
orc
pkg-config
python
Expand Down
2 changes: 1 addition & 1 deletion ci/conda_env_crossbow.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,5 @@ jinja2
jira
pygit2
ruamel.yaml
setuptools_scm<8.0.0
setuptools_scm
toolz
2 changes: 1 addition & 1 deletion ci/conda_env_python.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,4 @@ pytest<8
pytest-faulthandler
s3fs>=2023.10.0
setuptools
setuptools_scm<8.0.0
setuptools_scm
8 changes: 8 additions & 0 deletions ci/docker/conda-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ RUN mamba install -q -y \
valgrind && \
mamba clean --all

# Ensure npm, node and azurite are on path. npm and node are required to install azurite, which will then need to
# be on the path for the tests to run.
ENV PATH=/opt/conda/envs/arrow/bin:$PATH

COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/
RUN /arrow/ci/scripts/install_azurite.sh

# We want to install the GCS testbench using the same Python binary that the Conda code will use.
COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts
RUN /arrow/ci/scripts/install_gcs_testbench.sh default
Expand All @@ -50,6 +57,7 @@ COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/
RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin

ENV ARROW_ACERO=ON \
ARROW_AZURE=ON \
ARROW_BUILD_TESTS=ON \
ARROW_DATASET=ON \
ARROW_DEPENDENCY_SOURCE=CONDA \
Expand Down
2 changes: 1 addition & 1 deletion ci/docker/conda-python.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ COPY ci/conda_env_python.txt \
RUN mamba install -q -y \
--file arrow/ci/conda_env_python.txt \
$([ "$python" == $(gdb --batch --eval-command 'python import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') ] && echo "gdb") \
python=${python} \
"python=${python}.*=*_cpython" \
nomkl && \
mamba clean --all

Expand Down
1 change: 1 addition & 0 deletions ci/scripts/cpp_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ ctest \
--label-regex unittest \
--output-on-failure \
--parallel ${n_jobs} \
--repeat until-pass:3 \
--timeout ${ARROW_CTEST_TIMEOUT:-300} \
"${ctest_options[@]}" \
"$@"
Expand Down
1 change: 1 addition & 0 deletions ci/scripts/python_sdist_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ export PARQUET_TEST_DATA=${arrow_dir}/cpp/submodules/parquet-testing/data
export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR:-Ninja}
export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE:-debug}
export PYARROW_WITH_ACERO=${ARROW_ACERO:-ON}
export PYARROW_WITH_AZURE=${ARROW_AZURE:-OFF}
export PYARROW_WITH_S3=${ARROW_S3:-OFF}
export PYARROW_WITH_ORC=${ARROW_ORC:-OFF}
export PYARROW_WITH_CUDA=${ARROW_CUDA:-OFF}
Expand Down
1 change: 1 addition & 0 deletions ci/scripts/python_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ fi
: ${PYARROW_TEST_S3:=${ARROW_S3:-ON}}

export PYARROW_TEST_ACERO
export PYARROW_TEST_AZURE
export PYARROW_TEST_CUDA
export PYARROW_TEST_DATASET
export PYARROW_TEST_FLIGHT
Expand Down
6 changes: 2 additions & 4 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -784,7 +784,6 @@ if(ARROW_USE_GLOG)
if(GLOG_SOURCE STREQUAL "SYSTEM")
list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS glog::glog)
endif()
add_definitions("-DARROW_USE_GLOG")
endif()

if(ARROW_S3)
Expand Down Expand Up @@ -848,8 +847,8 @@ if(ARROW_WITH_RE2)
endif()

if(ARROW_WITH_RAPIDJSON)
list(APPEND ARROW_SHARED_LINK_LIBS rapidjson::rapidjson)
list(APPEND ARROW_STATIC_LINK_LIBS rapidjson::rapidjson)
list(APPEND ARROW_SHARED_LINK_LIBS RapidJSON)
list(APPEND ARROW_STATIC_LINK_LIBS RapidJSON)
endif()

if(ARROW_USE_XSIMD)
Expand Down Expand Up @@ -953,7 +952,6 @@ if(ARROW_JEMALLOC)
endif()

if(ARROW_MIMALLOC)
add_definitions(-DARROW_MIMALLOC)
list(APPEND ARROW_SHARED_LINK_LIBS mimalloc::mimalloc)
list(APPEND ARROW_STATIC_LINK_LIBS mimalloc::mimalloc)
endif()
Expand Down
8 changes: 7 additions & 1 deletion cpp/cmake_modules/FindGLOG.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
#
# find_package(GLOG)

find_package(glog CONFIG)
if(glog_FOUND)
return()
endif()

if(GLOG_FOUND)
return()
endif()
Expand Down Expand Up @@ -56,5 +61,6 @@ if(GLOG_FOUND)
add_library(glog::glog UNKNOWN IMPORTED)
set_target_properties(glog::glog
PROPERTIES IMPORTED_LOCATION "${GLOG_LIB}"
INTERFACE_INCLUDE_DIRECTORIES "${GLOG_INCLUDE_DIR}")
INTERFACE_INCLUDE_DIRECTORIES "${GLOG_INCLUDE_DIR}"
INTERFACE_COMPILE_DEFINITIONS "GLOG_USE_GLOG_EXPORT")
endif()
20 changes: 19 additions & 1 deletion cpp/cmake_modules/FindRapidJSONAlt.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,14 @@ endif()
find_package(RapidJSON ${find_package_args})
if(RapidJSON_FOUND)
set(RapidJSONAlt_FOUND TRUE)
set(RAPIDJSON_INCLUDE_DIR ${RAPIDJSON_INCLUDE_DIRS})
if(NOT TARGET RapidJSON)
add_library(RapidJSON INTERFACE IMPORTED)
if(RapidJSON_INCLUDE_DIRS)
target_include_directories(RapidJSON INTERFACE "${RapidJSON_INCLUDE_DIRS}")
else()
target_include_directories(RapidJSON INTERFACE "${RAPIDJSON_INCLUDE_DIRS}")
endif()
endif()
return()
endif()

Expand Down Expand Up @@ -74,3 +81,14 @@ find_package_handle_standard_args(
RapidJSONAlt
REQUIRED_VARS RAPIDJSON_INCLUDE_DIR
VERSION_VAR RAPIDJSON_VERSION)

if(RapidJSONAlt_FOUND)
if(WIN32 AND "${RAPIDJSON_INCLUDE_DIR}" MATCHES "^/")
# MSYS2
execute_process(COMMAND "cygpath" "--windows" "${RAPIDJSON_INCLUDE_DIR}"
OUTPUT_VARIABLE RAPIDJSON_INCLUDE_DIR
OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()
add_library(RapidJSON INTERFACE IMPORTED)
target_include_directories(RapidJSON INTERFACE "${RAPIDJSON_INCLUDE_DIR}")
endif()
22 changes: 3 additions & 19 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
include(ProcessorCount)
processorcount(NPROC)

add_custom_target(rapidjson)
add_custom_target(toolchain)
add_custom_target(toolchain-benchmarks)
add_custom_target(toolchain-tests)
Expand Down Expand Up @@ -2328,9 +2327,9 @@ macro(build_rapidjson)
# The include directory must exist before it is referenced by a target.
file(MAKE_DIRECTORY "${RAPIDJSON_INCLUDE_DIR}")

add_dependencies(toolchain rapidjson_ep)
add_dependencies(toolchain-tests rapidjson_ep)
add_dependencies(rapidjson rapidjson_ep)
add_library(RapidJSON INTERFACE IMPORTED)
target_include_directories(RapidJSON INTERFACE "${RAPIDJSON_INCLUDE_DIR}")
add_dependencies(RapidJSON rapidjson_ep)

set(RAPIDJSON_VENDORED TRUE)
endmacro()
Expand All @@ -2344,19 +2343,6 @@ if(ARROW_WITH_RAPIDJSON)
${ARROW_RAPIDJSON_REQUIRED_VERSION}
IS_RUNTIME_DEPENDENCY
FALSE)

if(RapidJSON_INCLUDE_DIR)
set(RAPIDJSON_INCLUDE_DIR "${RapidJSON_INCLUDE_DIR}")
endif()
if(WIN32 AND "${RAPIDJSON_INCLUDE_DIR}" MATCHES "^/")
# MSYS2
execute_process(COMMAND "cygpath" "--windows" "${RAPIDJSON_INCLUDE_DIR}"
OUTPUT_VARIABLE RAPIDJSON_INCLUDE_DIR
OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()

add_library(rapidjson::rapidjson INTERFACE IMPORTED)
target_include_directories(rapidjson::rapidjson INTERFACE "${RAPIDJSON_INCLUDE_DIR}")
endif()

macro(build_xsimd)
Expand Down Expand Up @@ -2599,7 +2585,6 @@ if(ARROW_WITH_RE2)
TRUE
PC_PACKAGE_NAMES
re2)
add_definitions(-DARROW_WITH_RE2)
endif()

macro(build_bzip2)
Expand Down Expand Up @@ -2707,7 +2692,6 @@ if(ARROW_WITH_UTF8PROC)
libutf8proc
REQUIRED_VERSION
"2.2.0")
add_definitions(-DARROW_WITH_UTF8PROC)
endif()

macro(build_cares)
Expand Down
6 changes: 4 additions & 2 deletions cpp/gdb_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -956,10 +956,12 @@ def storage_type(self):

def to_string(self):
"""
The result of calling ToString().
The result of calling ToString(show_metadata=True).
"""
# XXX `show_metadata` is an optional argument, but gdb doesn't allow
# omitting it.
return StdString(gdb.parse_and_eval(
f"{for_evaluation(self.val)}.ToString()"))
f"{for_evaluation(self.val)}.ToString(true)"))


class Schema:
Expand Down
14 changes: 2 additions & 12 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -304,17 +304,14 @@ set_source_files_properties(vendored/datetime/tz.cpp
add_definitions(-DURI_STATIC_BUILD)

if(ARROW_WITH_BROTLI)
add_definitions(-DARROW_WITH_BROTLI)
list(APPEND ARROW_SRCS util/compression_brotli.cc)
endif()

if(ARROW_WITH_BZ2)
add_definitions(-DARROW_WITH_BZ2)
list(APPEND ARROW_SRCS util/compression_bz2.cc)
endif()

if(ARROW_WITH_LZ4)
add_definitions(-DARROW_WITH_LZ4)
list(APPEND ARROW_SRCS util/compression_lz4.cc)
endif()

Expand All @@ -323,23 +320,20 @@ if(ARROW_WITH_OPENTELEMETRY)
endif()

if(ARROW_WITH_SNAPPY)
add_definitions(-DARROW_WITH_SNAPPY)
list(APPEND ARROW_SRCS util/compression_snappy.cc)
endif()

if(ARROW_WITH_ZLIB)
add_definitions(-DARROW_WITH_ZLIB)
list(APPEND ARROW_SRCS util/compression_zlib.cc)
endif()

if(ARROW_WITH_ZSTD)
add_definitions(-DARROW_WITH_ZSTD)
list(APPEND ARROW_SRCS util/compression_zstd.cc)
endif()

set(ARROW_TESTING_SHARED_LINK_LIBS arrow::flatbuffers rapidjson::rapidjson arrow_shared
set(ARROW_TESTING_SHARED_LINK_LIBS arrow::flatbuffers RapidJSON arrow_shared
${ARROW_GTEST_GTEST})
set(ARROW_TESTING_STATIC_LINK_LIBS arrow::flatbuffers rapidjson::rapidjson arrow_static
set(ARROW_TESTING_STATIC_LINK_LIBS arrow::flatbuffers RapidJSON arrow_static
${ARROW_GTEST_GTEST})

set(ARROW_TESTING_SRCS
Expand Down Expand Up @@ -493,10 +487,6 @@ if(ARROW_COMPUTE)
endif()

if(ARROW_FILESYSTEM)
if(ARROW_HDFS)
add_definitions(-DARROW_HDFS)
endif()

list(APPEND
ARROW_SRCS
filesystem/filesystem.cc
Expand Down
5 changes: 2 additions & 3 deletions cpp/src/arrow/acero/asof_join_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,8 @@ static void AsOfJoinOverhead(benchmark::State& state) {
// this generates the set of right hand tables to test on.
void SetArgs(benchmark::internal::Benchmark* bench) {
bench
->ArgNames({"left_freq", "left_cols", "left_ids", "left_batch_size",
"num_right_tables", "right_freq", "right_cols", "right_ids",
"right_batch_size"})
->ArgNames({"left_freq", "left_cols", "left_ids", "batch_size", "num_right_tables",
"right_freq", "right_cols", "right_ids"})
->UseRealTime();

int default_freq = 400;
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/acero/asof_join_node.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1098,7 +1098,7 @@ class AsofJoinNode : public ExecNode {
auto inputs = this->inputs();
for (size_t i = 0; i < inputs.size(); i++) {
RETURN_NOT_OK(key_hashers_[i]->Init(plan()->query_context()->exec_context(),
output_schema()));
inputs[i]->output_schema()));
ARROW_ASSIGN_OR_RAISE(
auto input_state,
InputState::Make(i, tolerance_, must_hash_, may_rehash_, key_hashers_[i].get(),
Expand Down
Loading

0 comments on commit 6e0870f

Please sign in to comment.