diff --git a/.env b/.env index eb87dc62bdd8c..00c238421d301 100644 --- a/.env +++ b/.env @@ -49,7 +49,7 @@ ULIMIT_CORE=-1 ALMALINUX=8 ALPINE_LINUX=3.16 DEBIAN=11 -FEDORA=38 +FEDORA=39 UBUNTU=20.04 # Default versions for various dependencies diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index fc939693c369c..dbcbbff54953c 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -51,7 +51,7 @@ jobs: ARROW_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} CROSSBOW_GITHUB_TOKEN: ${{ secrets.CROSSBOW_GITHUB_TOKEN }} run: | - archery trigger-bot \ + archery --debug trigger-bot \ --event-name ${{ github.event_name }} \ --event-payload ${{ github.event_path }} diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 3d1f75ede4bb5..8c47915b7b6d3 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -54,6 +54,63 @@ env: DOCKER_VOLUME_PREFIX: ".docker/" jobs: + ubuntu-minimum-cpp-version: + name: Check minimum supported Arrow C++ Version (${{ matrix.cpp_version }}) + runs-on: ubuntu-latest + strategy: + matrix: + include: + - cpp_version: "13.0.0" + steps: + - name: Checkout Arrow + uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + with: + path: src + submodules: recursive + + - name: Install Arrow C++ (${{ matrix.cpp_version }}) + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + # We have to list all packages to avoid version conflicts. + sudo apt install -y -V libarrow-dev=${{ matrix.cpp_version }}-1 \ + libarrow-acero-dev=${{ matrix.cpp_version }}-1 \ + libparquet-dev=${{ matrix.cpp_version }}-1 \ + libarrow-dataset-dev=${{ matrix.cpp_version }}-1 + + - name: Install checkbashisms + run: | + sudo apt-get install devscripts + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + install-r: false + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + working-directory: src/r + + - uses: r-lib/actions/check-r-package@v2 + with: + working-directory: src/r + env: + LIBARROW_BINARY: "false" + LIBARROW_BUILD: "false" + ARROW_R_VERBOSE_TEST: "true" + ARROW_R_ALLOW_CPP_VERSION_MISMATCH: "true" + + - name: Show install output + if: always() + run: find src/r/check -name '00install.out*' -exec cat '{}' \; || true + shell: bash + + ubuntu: name: AMD64 Ubuntu ${{ matrix.ubuntu }} R ${{ matrix.r }} Force-Tests ${{ matrix.force-tests }} runs-on: ubuntu-latest diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 5e561a0461ea3..ab85032fe9924 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -26,7 +26,10 @@ git submodule update --init || exit /B set ARROW_TEST_DATA=%CD%\testing\data set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data -set ARROW_DEBUG_MEMORY_POOL=trap +@rem Enable memory debug checks if the env is not set already +IF "%ARROW_DEBUG_MEMORY_POOL%"=="" ( + set ARROW_DEBUG_MEMORY_POOL=trap +) set CMAKE_BUILD_PARALLEL_LEVEL=%NUMBER_OF_PROCESSORS% set CTEST_PARALLEL_LEVEL=%NUMBER_OF_PROCESSORS% diff --git a/ci/docker/fedora-38-cpp.dockerfile b/ci/docker/fedora-39-cpp.dockerfile similarity index 92% rename from ci/docker/fedora-38-cpp.dockerfile rename to 
ci/docker/fedora-39-cpp.dockerfile index 2dcc094ee20c5..c8e98bdd00b11 100644 --- a/ci/docker/fedora-38-cpp.dockerfile +++ b/ci/docker/fedora-39-cpp.dockerfile @@ -16,7 +16,7 @@ # under the License. ARG arch -FROM ${arch}/fedora:38 +FROM ${arch}/fedora:39 ARG arch # install dependencies @@ -76,6 +76,8 @@ RUN /arrow/ci/scripts/install_gcs_testbench.sh default COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin +# PYARROW_TEST_GANDIVA=OFF: GH-39695: We need to make LLVM symbols visible in +# Python process explicitly if we use LLVM 17 or later. ENV absl_SOURCE=BUNDLED \ ARROW_ACERO=ON \ ARROW_BUILD_TESTS=ON \ @@ -103,4 +105,5 @@ ENV absl_SOURCE=BUNDLED \ google_cloud_cpp_storage_SOURCE=BUNDLED \ PARQUET_BUILD_EXAMPLES=ON \ PARQUET_BUILD_EXECUTABLES=ON \ - PATH=/usr/lib/ccache/:$PATH + PATH=/usr/lib/ccache/:$PATH \ + PYARROW_TEST_GANDIVA=OFF diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index c51600a1e5920..3d102796b8c00 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -60,7 +60,7 @@ RUN apt-get update -y && \ ENV JAVA_HOME=/usr/lib/jvm/java-${jdk}-openjdk-amd64 -ARG maven=3.5.4 +ARG maven=3.6.3 COPY ci/scripts/util_download_apache.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/util_download_apache.sh \ "maven/maven-3/${maven}/binaries/apache-maven-${maven}-bin.tar.gz" /opt diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index 2831440d5a967..b1d9ed5ab88d9 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -82,6 +82,7 @@ RUN vcpkg install \ --clean-after-build \ --x-install-root=${VCPKG_ROOT}/installed \ --x-manifest-root=/arrow/ci/vcpkg \ + --x-feature=azure \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ diff --git a/ci/scripts/c_glib_test.sh b/ci/scripts/c_glib_test.sh index cea600191ae05..f8083c7759d8a 100755 --- a/ci/scripts/c_glib_test.sh +++ b/ci/scripts/c_glib_test.sh @@ -28,8 +28,10 @@ export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig export GI_TYPELIB_PATH=${ARROW_HOME}/lib/girepository-1.0 -# Enable memory debug checks. -export ARROW_DEBUG_MEMORY_POOL=trap +# Enable memory debug checks if the env is not set already +if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then + export ARROW_DEBUG_MEMORY_POOL=trap +fi pushd ${source_dir} diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 69d86e871ac5f..60cab1a9feaba 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -54,6 +54,7 @@ if [ "${GITHUB_ACTIONS:-false}" = "true" ]; then fi if [ "${ARROW_ENABLE_THREADING:-ON}" = "OFF" ]; then + ARROW_AZURE=OFF ARROW_FLIGHT=OFF ARROW_FLIGHT_SQL=OFF ARROW_GCS=OFF diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index 0c6e1c6ef7057..1d685c51a9326 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -37,8 +37,10 @@ export LD_LIBRARY_PATH=${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib}:${LD_LIBRARY_P # to retrieve metadata. Disable this so that S3FileSystem tests run faster. export AWS_EC2_METADATA_DISABLED=TRUE -# Enable memory debug checks. 
-export ARROW_DEBUG_MEMORY_POOL=trap +# Enable memory debug checks if the env is not set already +if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then + export ARROW_DEBUG_MEMORY_POOL=trap +fi ctest_options=() case "$(uname)" in diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index c0a27e6e705e9..9bdcc4d687584 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -55,6 +55,7 @@ export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR:-Ninja} export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE:-debug} export PYARROW_WITH_ACERO=${ARROW_ACERO:-OFF} +export PYARROW_WITH_AZURE=${ARROW_AZURE:-OFF} export PYARROW_WITH_CUDA=${ARROW_CUDA:-OFF} export PYARROW_WITH_DATASET=${ARROW_DATASET:-ON} export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT:-OFF} diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index 341c2dd0577ef..20ca3300c0538 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -32,11 +32,14 @@ export ARROW_GDB_SCRIPT=${arrow_dir}/cpp/gdb_arrow.py # Enable some checks inside Python itself export PYTHONDEVMODE=1 -# Enable memory debug checks. -export ARROW_DEBUG_MEMORY_POOL=trap +# Enable memory debug checks if the env is not set already +if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then + export ARROW_DEBUG_MEMORY_POOL=trap +fi # By default, force-test all optional components : ${PYARROW_TEST_ACERO:=${ARROW_ACERO:-ON}} +: ${PYARROW_TEST_AZURE:=${ARROW_AZURE:-ON}} : ${PYARROW_TEST_CUDA:=${ARROW_CUDA:-ON}} : ${PYARROW_TEST_DATASET:=${ARROW_DATASET:-ON}} : ${PYARROW_TEST_FLIGHT:=${ARROW_FLIGHT:-ON}} diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index 8123a9fdf1c48..bea5409100770 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -63,6 +63,7 @@ pip install "delocate>=0.10.3" echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_ACERO:=ON} +: ${ARROW_AZURE:=ON} : ${ARROW_DATASET:=ON} : ${ARROW_FLIGHT:=ON} : ${ARROW_GANDIVA:=OFF} @@ -95,6 +96,7 @@ pushd ${build_dir}/build cmake \ -DARROW_ACERO=${ARROW_ACERO} \ + -DARROW_AZURE=${ARROW_AZURE} \ -DARROW_BUILD_SHARED=ON \ -DARROW_BUILD_STATIC=OFF \ -DARROW_BUILD_TESTS=OFF \ @@ -148,6 +150,7 @@ export PYARROW_BUNDLE_ARROW_CPP=1 export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR} export PYARROW_INSTALL_TESTS=1 export PYARROW_WITH_ACERO=${ARROW_ACERO} +export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT} export PYARROW_WITH_GANDIVA=${ARROW_GANDIVA} diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index 58e42fea88088..4d4d4fb694e0b 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -49,6 +49,7 @@ rm -rf /arrow/python/pyarrow/*.so.* echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_ACERO:=ON} +: ${ARROW_AZURE:=ON} : ${ARROW_DATASET:=ON} : ${ARROW_FLIGHT:=ON} : ${ARROW_GANDIVA:=OFF} @@ -87,6 +88,7 @@ pushd /tmp/arrow-build cmake \ -DARROW_ACERO=${ARROW_ACERO} \ + -DARROW_AZURE=${ARROW_AZURE} \ -DARROW_BUILD_SHARED=ON \ -DARROW_BUILD_STATIC=OFF \ -DARROW_BUILD_TESTS=OFF \ @@ -141,6 +143,7 @@ export PYARROW_BUNDLE_ARROW_CPP=1 export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR} export PYARROW_INSTALL_TESTS=1 export PYARROW_WITH_ACERO=${ARROW_ACERO} +export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT} export 
PYARROW_WITH_GANDIVA=${ARROW_GANDIVA} diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index 01250ff7ef40c..a25e5c51bddbc 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -28,15 +28,17 @@ fi source_dir=${1} +: ${ARROW_AZURE:=ON} : ${ARROW_FLIGHT:=ON} -: ${ARROW_SUBSTRAIT:=ON} -: ${ARROW_S3:=ON} : ${ARROW_GCS:=ON} +: ${ARROW_S3:=ON} +: ${ARROW_SUBSTRAIT:=ON} : ${CHECK_IMPORTS:=ON} : ${CHECK_UNITTESTS:=ON} : ${INSTALL_PYARROW:=ON} export PYARROW_TEST_ACERO=ON +export PYARROW_TEST_AZURE=${ARROW_AZURE} export PYARROW_TEST_CYTHON=OFF export PYARROW_TEST_DATASET=ON export PYARROW_TEST_FLIGHT=${ARROW_FLIGHT} diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index 22ec551edb9fa..72078ab3c06c2 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -72,8 +72,10 @@ export _R_CHECK_STOP_ON_INVALID_NUMERIC_VERSION_INPUTS_=TRUE # to retrieve metadata. Disable this so that S3FileSystem tests run faster. export AWS_EC2_METADATA_DISABLED=TRUE -# Enable memory debug checks. -export ARROW_DEBUG_MEMORY_POOL=trap +# Enable memory debug checks if the env is not set already +if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then + export ARROW_DEBUG_MEMORY_POOL=trap +fi # Hack so that texlive2020 doesn't pollute the home dir export TEXMFCONFIG=/tmp/texmf-config diff --git a/ci/scripts/ruby_test.sh b/ci/scripts/ruby_test.sh index 4fd6a85fe3966..56c33a4d6378a 100755 --- a/ci/scripts/ruby_test.sh +++ b/ci/scripts/ruby_test.sh @@ -26,7 +26,9 @@ export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig export GI_TYPELIB_PATH=${ARROW_HOME}/lib/girepository-1.0 -# Enable memory debug checks. -export ARROW_DEBUG_MEMORY_POOL=trap +# Enable memory debug checks if the env is not set already +if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then + export ARROW_DEBUG_MEMORY_POOL=trap +fi rake -f ${source_dir}/Rakefile BUILD_DIR=${build_dir} USE_BUNDLER=yes diff --git a/ci/vcpkg/vcpkg.json b/ci/vcpkg/vcpkg.json index 99771728ecf18..e86479a7c32fc 100644 --- a/ci/vcpkg/vcpkg.json +++ b/ci/vcpkg/vcpkg.json @@ -105,6 +105,16 @@ } ] }, + "azure": { + "description": "Azure blob storage support", + "dependencies": [ + "azure-core-cpp", + "azure-identity-cpp", + "azure-storage-blobs-cpp", + "azure-storage-common-cpp", + "azure-storage-files-datalake-cpp" + ] + }, "orc": { "description": "ORC support", "dependencies": [ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 50a85b33d5489..7f2f7812e3cd5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -774,8 +774,7 @@ if(ARROW_ORC) list(APPEND ARROW_SHARED_LINK_LIBS orc::orc ${ARROW_PROTOBUF_LIBPROTOBUF}) list(APPEND ARROW_STATIC_LINK_LIBS orc::orc ${ARROW_PROTOBUF_LIBPROTOBUF}) if(ORC_SOURCE STREQUAL "SYSTEM") - list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS orc::orc - ${ARROW_PROTOBUF_LIBPROTOBUF}) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS orc::orc) endif() endif() @@ -824,9 +823,6 @@ if(ARROW_WITH_OPENTELEMETRY) opentelemetry-cpp::ostream_span_exporter opentelemetry-cpp::otlp_http_exporter) endif() - if(Protobuf_SOURCE STREQUAL "SYSTEM") - list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${ARROW_PROTOBUF_LIBPROTOBUF}) - endif() list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS CURL::libcurl) endif() @@ -861,6 +857,14 @@ if(ARROW_USE_XSIMD) list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_XSIMD}) endif() +# This should be done after if(ARROW_ORC) and if(ARROW_WITH_OPENTELEMETRY) +# because they depend on Protobuf. 
+if(ARROW_WITH_PROTOBUF) + if(Protobuf_SOURCE STREQUAL "SYSTEM") + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${ARROW_PROTOBUF_LIBPROTOBUF}) + endif() +endif() + add_custom_target(arrow_dependencies) add_custom_target(arrow_benchmark_dependencies) add_custom_target(arrow_test_dependencies) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index e9d478f108584..21ac1a09f56e7 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -905,6 +905,29 @@ TEST_F(TestArray, TestAppendArraySlice) { } } +// GH-39976: Test out-of-line data size calculation in +// BinaryViewBuilder::AppendArraySlice. +TEST_F(TestArray, TestBinaryViewAppendArraySlice) { + BinaryViewBuilder src_builder(pool_); + ASSERT_OK(src_builder.AppendNull()); + ASSERT_OK(src_builder.Append("long string; not inlined")); + ASSERT_EQ(2, src_builder.length()); + ASSERT_OK_AND_ASSIGN(auto src, src_builder.Finish()); + ASSERT_OK(src->ValidateFull()); + + ArraySpan span; + span.SetMembers(*src->data()); + BinaryViewBuilder dst_builder(pool_); + ASSERT_OK(dst_builder.AppendArraySlice(span, 0, 1)); + ASSERT_EQ(1, dst_builder.length()); + ASSERT_OK(dst_builder.AppendArraySlice(span, 1, 1)); + ASSERT_EQ(2, dst_builder.length()); + ASSERT_OK_AND_ASSIGN(auto dst, dst_builder.Finish()); + ASSERT_OK(dst->ValidateFull()); + + AssertArraysEqual(*src, *dst); +} + TEST_F(TestArray, ValidateBuffersPrimitive) { auto empty_buffer = std::make_shared(""); auto null_buffer = Buffer::FromString("\xff"); diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index f85852fa0eda6..7e5721917f3a0 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -54,7 +54,7 @@ Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offse int64_t out_of_line_total = 0, i = 0; VisitNullBitmapInline( - array.buffers[0].data, array.offset, array.length, array.null_count, + array.buffers[0].data, array.offset + offset, length, array.null_count, [&] { if (!values[i].is_inline()) { out_of_line_total += static_cast(values[i].size()); diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index 1ac25e290a814..e981fafe8e780 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -445,16 +445,20 @@ class BlockParsingOperator { num_rows_seen_ += parser->total_num_rows(); } - RETURN_NOT_OK(block.consume_bytes(parsed_size)); + if (block.consume_bytes) { + RETURN_NOT_OK(block.consume_bytes(parsed_size)); + } return ParsedBlock{std::move(parser), block.block_index, static_cast(parsed_size) + block.bytes_skipped}; } + int num_csv_cols() const { return num_csv_cols_; } + private: io::IOContext io_context_; - ParseOptions parse_options_; - int num_csv_cols_; - bool count_rows_; + const ParseOptions parse_options_; + const int num_csv_cols_; + const bool count_rows_; int64_t num_rows_seen_; }; @@ -570,7 +574,6 @@ class ReaderMixin { parse_options_(parse_options), convert_options_(convert_options), count_rows_(count_rows), - num_rows_seen_(count_rows_ ? 
1 : -1), input_(std::move(input)) {} protected: @@ -581,6 +584,7 @@ class ReaderMixin { const uint8_t* data = buf->data(); const auto data_end = data + buf->size(); DCHECK_GT(data_end - data, 0); + int64_t num_rows_seen = 1; if (read_options_.skip_rows) { // Skip initial rows (potentially invalid CSV data) @@ -593,14 +597,14 @@ class ReaderMixin { "either file is too short or header is larger than block size"); } if (count_rows_) { - num_rows_seen_ += num_skipped_rows; + num_rows_seen += num_skipped_rows; } } if (read_options_.column_names.empty()) { // Parse one row (either to read column names or to know the number of columns) - BlockParser parser(io_context_.pool(), parse_options_, num_csv_cols_, - num_rows_seen_, 1); + BlockParser parser(io_context_.pool(), parse_options_, /*num_cols=*/-1, + /*first_row=*/num_rows_seen, /*max_num_rows=*/1); uint32_t parsed_size = 0; RETURN_NOT_OK(parser.Parse( std::string_view(reinterpret_cast(data), data_end - data), @@ -627,7 +631,7 @@ class ReaderMixin { // Skip parsed header row data += parsed_size; if (count_rows_) { - ++num_rows_seen_; + ++num_rows_seen; } } } else { @@ -636,14 +640,17 @@ class ReaderMixin { if (count_rows_) { // increase rows seen to skip past rows which will be skipped - num_rows_seen_ += read_options_.skip_rows_after_names; + num_rows_seen += read_options_.skip_rows_after_names; } auto bytes_consumed = data - buf->data(); *rest = SliceBuffer(buf, bytes_consumed); - num_csv_cols_ = static_cast(column_names_.size()); - DCHECK_GT(num_csv_cols_, 0); + int32_t num_csv_cols = static_cast(column_names_.size()); + DCHECK_GT(num_csv_cols, 0); + // Since we know the number of columns, we can instantiate the BlockParsingOperator + parsing_operator_.emplace(io_context_, parse_options_, num_csv_cols, + count_rows_ ? num_rows_seen : -1); RETURN_NOT_OK(MakeConversionSchema()); return bytes_consumed; @@ -691,7 +698,7 @@ class ReaderMixin { if (convert_options_.include_columns.empty()) { // Include all columns in CSV file order - for (int32_t col_index = 0; col_index < num_csv_cols_; ++col_index) { + for (int32_t col_index = 0; col_index < num_csv_cols(); ++col_index) { append_csv_column(column_names_[col_index], col_index); } } else { @@ -719,66 +726,25 @@ class ReaderMixin { return Status::OK(); } - struct ParseResult { - std::shared_ptr parser; - int64_t parsed_bytes; - }; - - Result Parse(const std::shared_ptr& partial, - const std::shared_ptr& completion, - const std::shared_ptr& block, int64_t block_index, - bool is_final) { - static constexpr int32_t max_num_rows = std::numeric_limits::max(); - auto parser = std::make_shared( - io_context_.pool(), parse_options_, num_csv_cols_, num_rows_seen_, max_num_rows); - - std::shared_ptr straddling; - std::vector views; - if (partial->size() != 0 || completion->size() != 0) { - if (partial->size() == 0) { - straddling = completion; - } else if (completion->size() == 0) { - straddling = partial; - } else { - ARROW_ASSIGN_OR_RAISE( - straddling, ConcatenateBuffers({partial, completion}, io_context_.pool())); - } - views = {std::string_view(*straddling), std::string_view(*block)}; - } else { - views = {std::string_view(*block)}; - } - uint32_t parsed_size; - if (is_final) { - RETURN_NOT_OK(parser->ParseFinal(views, &parsed_size)); - } else { - RETURN_NOT_OK(parser->Parse(views, &parsed_size)); - } - // See BlockParsingOperator for explanation. 
- const int64_t bytes_before_buffer = partial->size() + completion->size(); - if (static_cast(parsed_size) < bytes_before_buffer) { - return Status::Invalid( - "CSV parser got out of sync with chunker. This can mean the data file " - "contains cell values spanning multiple lines; please consider enabling " - "the option 'newlines_in_values'."); - } + Result Parse(const CSVBlock& block) { + DCHECK(parsing_operator_.has_value()); + return (*parsing_operator_)(block); + } - if (count_rows_) { - num_rows_seen_ += parser->total_num_rows(); - } - return ParseResult{std::move(parser), static_cast(parsed_size)}; + int num_csv_cols() const { + DCHECK(parsing_operator_.has_value()); + return parsing_operator_->num_csv_cols(); } io::IOContext io_context_; - ReadOptions read_options_; - ParseOptions parse_options_; - ConvertOptions convert_options_; - - // Number of columns in the CSV file - int32_t num_csv_cols_ = -1; - // Whether num_rows_seen_ tracks the number of rows seen in the CSV being parsed - bool count_rows_; - // Number of rows seen in the csv. Not used if count_rows is false - int64_t num_rows_seen_; + const ReadOptions read_options_; + const ParseOptions parse_options_; + const ConvertOptions convert_options_; + // Whether to track the number of rows seen in the CSV being parsed + const bool count_rows_; + + std::optional parsing_operator_; + // Column names in the CSV file std::vector column_names_; ConversionSchema conversion_schema_; @@ -822,14 +788,10 @@ class BaseTableReader : public ReaderMixin, public csv::TableReader { return Status::OK(); } - Result ParseAndInsert(const std::shared_ptr& partial, - const std::shared_ptr& completion, - const std::shared_ptr& block, - int64_t block_index, bool is_final) { - ARROW_ASSIGN_OR_RAISE(auto result, - Parse(partial, completion, block, block_index, is_final)); - RETURN_NOT_OK(ProcessData(result.parser, block_index)); - return result.parsed_bytes; + Status ParseAndInsert(const CSVBlock& block) { + ARROW_ASSIGN_OR_RAISE(auto result, Parse(block)); + RETURN_NOT_OK(ProcessData(result.parser, result.block_index)); + return Status::OK(); } // Trigger conversion of parsed block data @@ -921,8 +883,6 @@ class StreamingReaderImpl : public ReaderMixin, ProcessHeader(first_buffer, &after_header)); bytes_decoded_->fetch_add(header_bytes_consumed); - auto parser_op = - BlockParsingOperator(io_context_, parse_options_, num_csv_cols_, num_rows_seen_); ARROW_ASSIGN_OR_RAISE( auto decoder_op, BlockDecodingOperator::Make(io_context_, convert_options_, conversion_schema_)); @@ -930,8 +890,7 @@ class StreamingReaderImpl : public ReaderMixin, auto block_gen = SerialBlockReader::MakeAsyncIterator( std::move(buffer_generator), MakeChunker(parse_options_), std::move(after_header), read_options_.skip_rows_after_names); - auto parsed_block_gen = - MakeMappedGenerator(std::move(block_gen), std::move(parser_op)); + auto parsed_block_gen = MakeMappedGenerator(std::move(block_gen), *parsing_operator_); auto rb_gen = MakeMappedGenerator(std::move(parsed_block_gen), std::move(decoder_op)); auto self = shared_from_this(); @@ -1035,11 +994,7 @@ class SerialTableReader : public BaseTableReader { // EOF break; } - ARROW_ASSIGN_OR_RAISE( - int64_t parsed_bytes, - ParseAndInsert(maybe_block.partial, maybe_block.completion, maybe_block.buffer, - maybe_block.block_index, maybe_block.is_final)); - RETURN_NOT_OK(maybe_block.consume_bytes(parsed_bytes)); + RETURN_NOT_OK(ParseAndInsert(maybe_block)); } // Finish conversion, create schema and table 
RETURN_NOT_OK(task_group_->Finish()); @@ -1110,13 +1065,8 @@ class AsyncThreadedTableReader DCHECK(!maybe_block.consume_bytes); // Launch parse task - self->task_group_->Append([self, maybe_block] { - return self - ->ParseAndInsert(maybe_block.partial, maybe_block.completion, - maybe_block.buffer, maybe_block.block_index, - maybe_block.is_final) - .status(); - }); + self->task_group_->Append( + [self, maybe_block] { return self->ParseAndInsert(maybe_block); }); return Status::OK(); }; @@ -1239,12 +1189,8 @@ class CSVRowCounter : public ReaderMixin, // IterationEnd. std::function>(const CSVBlock&)> count_cb = [self](const CSVBlock& maybe_block) -> Result> { - ARROW_ASSIGN_OR_RAISE( - auto parser, - self->Parse(maybe_block.partial, maybe_block.completion, maybe_block.buffer, - maybe_block.block_index, maybe_block.is_final)); - RETURN_NOT_OK(maybe_block.consume_bytes(parser.parsed_bytes)); - int32_t total_row_count = parser.parser->total_num_rows(); + ARROW_ASSIGN_OR_RAISE(auto parsed_block, self->Parse(maybe_block)); + int32_t total_row_count = parsed_block.parser->total_num_rows(); self->row_count_ += total_row_count; return total_row_count; }; diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index af8305a025291..02e0a890e4b3d 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -19,6 +19,8 @@ #include #include "arrow/extension/fixed_shape_tensor.h" +#include "arrow/extension/tensor_internal.h" +#include "arrow/scalar.h" #include "arrow/array/array_nested.h" #include "arrow/array/array_primitive.h" @@ -86,7 +88,7 @@ bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const { if (extension_name() != other.extension_name()) { return false; } - const auto& other_ext = static_cast(other); + const auto& other_ext = internal::checked_cast(other); auto is_permutation_trivial = [](const std::vector& permutation) { for (size_t i = 1; i < permutation.size(); ++i) { @@ -143,7 +145,7 @@ std::string FixedShapeTensorType::Serialize() const { if (!dim_names_.empty()) { rj::Value dim_names(rj::kArrayType); - for (std::string v : dim_names_) { + for (const std::string& v : dim_names_) { dim_names.PushBack(rj::Value{}.SetString(v.c_str(), allocator), allocator); } document.AddMember(rj::Value("dim_names", allocator), dim_names, allocator); @@ -199,10 +201,52 @@ std::shared_ptr FixedShapeTensorType::MakeArray( std::shared_ptr data) const { DCHECK_EQ(data->type->id(), Type::EXTENSION); DCHECK_EQ("arrow.fixed_shape_tensor", - static_cast(*data->type).extension_name()); + internal::checked_cast(*data->type).extension_name()); return std::make_shared(data); } +Result> FixedShapeTensorType::MakeTensor( + const std::shared_ptr& scalar) { + const auto ext_scalar = internal::checked_pointer_cast(scalar); + const auto ext_type = + internal::checked_pointer_cast(scalar->type); + if (!is_fixed_width(*ext_type->value_type())) { + return Status::TypeError("Cannot convert non-fixed-width values to Tensor."); + } + const auto array = + internal::checked_pointer_cast(ext_scalar->value)->value; + if (array->null_count() > 0) { + return Status::Invalid("Cannot convert data with nulls to Tensor."); + } + const auto value_type = + internal::checked_pointer_cast(ext_type->value_type()); + const auto byte_width = value_type->byte_width(); + + std::vector permutation = ext_type->permutation(); + if (permutation.empty()) { + permutation.resize(ext_type->ndim()); + std::iota(permutation.begin(), 
permutation.end(), 0); + } + + std::vector shape = ext_type->shape(); + internal::Permute(permutation, &shape); + + std::vector dim_names = ext_type->dim_names(); + if (!dim_names.empty()) { + internal::Permute(permutation, &dim_names); + } + + std::vector strides; + RETURN_NOT_OK(ComputeStrides(*value_type.get(), shape, permutation, &strides)); + const auto start_position = array->offset() * byte_width; + const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), + std::multiplies<>()); + const auto buffer = + SliceBuffer(array->data()->buffers[1], start_position, size * byte_width); + + return Tensor::Make(ext_type->value_type(), buffer, shape, strides, dim_names); +} + Result> FixedShapeTensorArray::FromTensor( const std::shared_ptr& tensor) { auto permutation = internal::ArgSort(tensor->strides(), std::greater<>()); @@ -293,53 +337,71 @@ const Result> FixedShapeTensorArray::ToTensor() const { // To convert an array of n dimensional tensors to a n+1 dimensional tensor we // interpret the array's length as the first dimension the new tensor. - auto ext_arr = std::static_pointer_cast(this->storage()); - auto ext_type = internal::checked_pointer_cast(this->type()); - ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()), - Status::Invalid(ext_arr->value_type()->ToString(), - " is not valid data type for a tensor")); - auto permutation = ext_type->permutation(); - - std::vector dim_names; - if (!ext_type->dim_names().empty()) { - for (auto i : permutation) { - dim_names.emplace_back(ext_type->dim_names()[i]); - } - dim_names.insert(dim_names.begin(), 1, ""); + const auto ext_type = + internal::checked_pointer_cast(this->type()); + const auto value_type = ext_type->value_type(); + ARROW_RETURN_IF( + !is_fixed_width(*value_type), + Status::TypeError(value_type->ToString(), " is not valid data type for a tensor")); + + // ext_type->permutation() gives us permutation for a single row with values in + // range [0, ndim). Here want to create a ndim + 1 dimensional tensor from the entire + // array and we assume the first dimension will always have the greatest stride, so it + // will get permutation index 0 and remaining values from ext_type->permutation() need + // to be shifted to fill the [1, ndim+1) range. Computed permutation will be used to + // generate the new tensor's shape, strides and dim_names. 
+ std::vector permutation = ext_type->permutation(); + if (permutation.empty()) { + permutation.resize(ext_type->ndim() + 1); + std::iota(permutation.begin(), permutation.end(), 0); } else { - dim_names = {}; + for (auto i = 0; i < static_cast(ext_type->ndim()); i++) { + permutation[i] += 1; + } + permutation.insert(permutation.begin(), 1, 0); } - std::vector shape; - for (int64_t& i : permutation) { - shape.emplace_back(ext_type->shape()[i]); - ++i; + std::vector dim_names = ext_type->dim_names(); + if (!dim_names.empty()) { + dim_names.insert(dim_names.begin(), 1, ""); + internal::Permute(permutation, &dim_names); } + + std::vector shape = ext_type->shape(); + auto cell_size = std::accumulate(shape.begin(), shape.end(), static_cast(1), + std::multiplies<>()); shape.insert(shape.begin(), 1, this->length()); - permutation.insert(permutation.begin(), 1, 0); + internal::Permute(permutation, &shape); std::vector tensor_strides; - auto value_type = internal::checked_pointer_cast(ext_arr->value_type()); + const auto fw_value_type = internal::checked_pointer_cast(value_type); ARROW_RETURN_NOT_OK( - ComputeStrides(*value_type.get(), shape, permutation, &tensor_strides)); - ARROW_ASSIGN_OR_RAISE(auto buffers, ext_arr->Flatten()); + ComputeStrides(*fw_value_type.get(), shape, permutation, &tensor_strides)); + + const auto raw_buffer = this->storage()->data()->child_data[0]->buffers[1]; ARROW_ASSIGN_OR_RAISE( - auto tensor, Tensor::Make(ext_arr->value_type(), buffers->data()->buffers[1], shape, - tensor_strides, dim_names)); - return tensor; + const auto buffer, + SliceBufferSafe(raw_buffer, this->offset() * cell_size * value_type->byte_width())); + + return Tensor::Make(value_type, buffer, shape, tensor_strides, dim_names); } Result> FixedShapeTensorType::Make( const std::shared_ptr& value_type, const std::vector& shape, const std::vector& permutation, const std::vector& dim_names) { - if (!permutation.empty() && shape.size() != permutation.size()) { - return Status::Invalid("permutation size must match shape size. Expected: ", - shape.size(), " Got: ", permutation.size()); + const auto ndim = shape.size(); + if (!permutation.empty() && ndim != permutation.size()) { + return Status::Invalid("permutation size must match shape size. Expected: ", ndim, + " Got: ", permutation.size()); + } + if (!dim_names.empty() && ndim != dim_names.size()) { + return Status::Invalid("dim_names size must match shape size. Expected: ", ndim, + " Got: ", dim_names.size()); } - if (!dim_names.empty() && shape.size() != dim_names.size()) { - return Status::Invalid("dim_names size must match shape size. 
Expected: ", - shape.size(), " Got: ", dim_names.size()); + if (!permutation.empty()) { + RETURN_NOT_OK(internal::IsPermutationValid(permutation)); } + const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), std::multiplies<>()); return std::make_shared(value_type, static_cast(size), diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index fcfb1ebbab96a..591a7cee32a34 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -64,7 +64,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { std::string ToString() const override; /// Number of dimensions of tensor elements - size_t ndim() { return shape_.size(); } + size_t ndim() const { return shape_.size(); } /// Shape of tensor elements const std::vector shape() const { return shape_; } @@ -94,6 +94,15 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { /// Create a FixedShapeTensorArray from ArrayData std::shared_ptr MakeArray(std::shared_ptr data) const override; + /// \brief Create a Tensor from an ExtensionScalar from a FixedShapeTensorArray + /// + /// This method will return a Tensor from ExtensionScalar with strides + /// derived from shape and permutation of FixedShapeTensorType. Shape and + /// dim_names will be permuted according to permutation stored in the + /// FixedShapeTensorType metadata. + static Result> MakeTensor( + const std::shared_ptr& scalar); + /// \brief Create a FixedShapeTensorType instance static Result> Make( const std::shared_ptr& value_type, const std::vector& shape, diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 2b8e703d3c66e..3fd39a11ff50d 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -28,6 +28,7 @@ #include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/key_value_metadata.h" +#include "arrow/util/sort.h" namespace arrow { @@ -39,34 +40,34 @@ class TestExtensionType : public ::testing::Test { public: void SetUp() override { shape_ = {3, 3, 4}; - cell_shape_ = {3, 4}; + element_shape_ = {3, 4}; value_type_ = int64(); - cell_type_ = fixed_size_list(value_type_, 12); + element_type_ = fixed_size_list(value_type_, 12); dim_names_ = {"x", "y"}; ext_type_ = internal::checked_pointer_cast( - fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_)); + fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_)); values_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; values_partial_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; shape_partial_ = {2, 3, 4}; tensor_strides_ = {96, 32, 8}; - cell_strides_ = {32, 8}; + element_strides_ = {32, 8}; serialized_ = R"({"shape":[3,4],"dim_names":["x","y"]})"; } protected: std::vector shape_; std::vector shape_partial_; - std::vector cell_shape_; + std::vector element_shape_; std::shared_ptr value_type_; - std::shared_ptr cell_type_; + std::shared_ptr element_type_; std::vector dim_names_; std::shared_ptr ext_type_; std::vector values_; std::vector values_partial_; std::vector tensor_strides_; - std::vector cell_strides_; + std::vector element_strides_; std::string serialized_; }; @@ -96,8 +97,8 @@ TEST_F(TestExtensionType, CreateExtensionType) { // Test ExtensionType methods 
ASSERT_EQ(ext_type_->extension_name(), "arrow.fixed_shape_tensor"); ASSERT_TRUE(ext_type_->Equals(*exact_ext_type)); - ASSERT_FALSE(ext_type_->Equals(*cell_type_)); - ASSERT_TRUE(ext_type_->storage_type()->Equals(*cell_type_)); + ASSERT_FALSE(ext_type_->Equals(*element_type_)); + ASSERT_TRUE(ext_type_->storage_type()->Equals(*element_type_)); ASSERT_EQ(ext_type_->Serialize(), serialized_); ASSERT_OK_AND_ASSIGN(auto ds, ext_type_->Deserialize(ext_type_->storage_type(), serialized_)); @@ -106,18 +107,28 @@ TEST_F(TestExtensionType, CreateExtensionType) { // Test FixedShapeTensorType methods ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); - ASSERT_EQ(exact_ext_type->ndim(), cell_shape_.size()); - ASSERT_EQ(exact_ext_type->shape(), cell_shape_); + ASSERT_EQ(exact_ext_type->ndim(), element_shape_.size()); + ASSERT_EQ(exact_ext_type->shape(), element_shape_); ASSERT_EQ(exact_ext_type->value_type(), value_type_); - ASSERT_EQ(exact_ext_type->strides(), cell_strides_); + ASSERT_EQ(exact_ext_type->strides(), element_strides_); ASSERT_EQ(exact_ext_type->dim_names(), dim_names_); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("Invalid: permutation size must match shape size."), - FixedShapeTensorType::Make(value_type_, cell_shape_, {0})); + FixedShapeTensorType::Make(value_type_, element_shape_, {0})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("Invalid: dim_names size must match shape size."), - FixedShapeTensorType::Make(value_type_, cell_shape_, {}, {"x"})); + FixedShapeTensorType::Make(value_type_, element_shape_, {}, {"x"})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr("Invalid: Permutation indices for 2 dimensional tensors must be " + "unique and within [0, 1] range. Got: [3,0]"), + FixedShapeTensorType::Make(value_type_, {5, 6}, {3, 0})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr("Invalid: Permutation indices for 3 dimensional tensors must be " + "unique and within [0, 2] range. 
Got: [0,1,1]"), + FixedShapeTensorType::Make(value_type_, {1, 2, 3}, {0, 1, 1})); } TEST_F(TestExtensionType, EqualsCases) { @@ -148,7 +159,7 @@ TEST_F(TestExtensionType, CreateFromArray) { std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); - ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type_)); auto ext_arr = ExtensionType::WrapArray(ext_type_, fsla_arr); ASSERT_EQ(ext_arr->length(), shape_[0]); ASSERT_EQ(ext_arr->null_count(), 0); @@ -200,7 +211,7 @@ TEST_F(TestExtensionType, RoundtripBatch) { std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); - ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type_)); auto ext_arr = ExtensionType::WrapArray(ext_type_, fsla_arr); // Pass extension array, expect getting back extension array @@ -215,7 +226,7 @@ TEST_F(TestExtensionType, RoundtripBatch) { auto ext_metadata = key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, {"ARROW:extension:metadata", serialized_}}); - ext_field = field(/*name=*/"f0", /*type=*/cell_type_, /*nullable=*/true, + ext_field = field(/*name=*/"f0", /*type=*/element_type_, /*nullable=*/true, /*metadata=*/ext_metadata); auto batch2 = RecordBatch::Make(schema({ext_field}), fsla_arr->length(), {fsla_arr}); RoundtripBatch(batch2, &read_batch2); @@ -270,7 +281,7 @@ TEST_F(TestExtensionType, CreateFromTensor) { auto ext_arr_5 = std::static_pointer_cast( ExtensionType::WrapArray(ext_type_5, fsla_arr)); EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("binary is not valid data type for a tensor"), + TypeError, testing::HasSubstr("binary is not valid data type for a tensor"), ext_arr_5->ToTensor()); auto ext_type_6 = internal::checked_pointer_cast( @@ -278,6 +289,10 @@ TEST_F(TestExtensionType, CreateFromTensor) { auto arr_with_null = ArrayFromJSON(int64(), "[1, 0, null, null, 1, 2]"); ASSERT_OK_AND_ASSIGN(auto fsla_arr_6, FixedSizeListArray::FromArrays( arr_with_null, fixed_size_list(int64(), 2))); + + auto ext_type_7 = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), {3, 4}, {})); + ASSERT_OK_AND_ASSIGN(auto ext_arr_7, FixedShapeTensorArray::FromTensor(tensor)); } void CheckFromTensorType(const std::shared_ptr& tensor, @@ -308,7 +323,7 @@ TEST_F(TestExtensionType, TestFromTensorType) { auto dim_names = std::vector>{ {"y", "z"}, {"z", "y"}, {"y", "z"}, {"z", "y"}, {"y", "z"}, {"y", "z"}, {"y", "z"}, {"y", "z"}}; - auto cell_shapes = std::vector>{{3, 4}, {4, 3}, {4, 3}, {3, 4}}; + auto element_shapes = std::vector>{{3, 4}, {4, 3}, {4, 3}, {3, 4}}; auto permutations = std::vector>{{0, 1}, {1, 0}, {0, 1}, {1, 0}}; for (size_t i = 0; i < shapes.size(); i++) { @@ -316,11 +331,82 @@ TEST_F(TestExtensionType, TestFromTensorType) { strides[i], tensor_dim_names[i])); ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); auto ext_type = - fixed_shape_tensor(value_type_, cell_shapes[i], permutations[i], dim_names[i]); + fixed_shape_tensor(value_type_, element_shapes[i], permutations[i], dim_names[i]); CheckFromTensorType(tensor, ext_type); } } +template +void CheckToTensor(const 
std::vector& values, const std::shared_ptr typ, + const int32_t& element_size, const std::vector& element_shape, + const std::vector& element_permutation, + const std::vector& element_dim_names, + const std::vector& tensor_shape, + const std::vector& tensor_dim_names, + const std::vector& tensor_strides) { + auto buffer = Buffer::Wrap(values); + const std::shared_ptr element_type = fixed_size_list(typ, element_size); + std::vector> buffers = {nullptr, buffer}; + auto arr_data = std::make_shared(typ, values.size(), buffers); + auto arr = std::make_shared(arr_data); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type)); + + ASSERT_OK_AND_ASSIGN( + auto expected_tensor, + Tensor::Make(typ, buffer, tensor_shape, tensor_strides, tensor_dim_names)); + const auto ext_type = + fixed_shape_tensor(typ, element_shape, element_permutation, element_dim_names); + + auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr); + const auto tensor_array = std::static_pointer_cast(ext_arr); + ASSERT_OK_AND_ASSIGN(const auto actual_tensor, tensor_array->ToTensor()); + ASSERT_OK(actual_tensor->Validate()); + + ASSERT_EQ(actual_tensor->type(), expected_tensor->type()); + ASSERT_EQ(actual_tensor->shape(), expected_tensor->shape()); + ASSERT_EQ(actual_tensor->strides(), expected_tensor->strides()); + ASSERT_EQ(actual_tensor->dim_names(), expected_tensor->dim_names()); + ASSERT_TRUE(actual_tensor->data()->Equals(*expected_tensor->data())); + ASSERT_TRUE(actual_tensor->Equals(*expected_tensor)); +} + +TEST_F(TestExtensionType, ToTensor) { + std::vector float_values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; + + auto element_sizes = std::vector{6, 6, 18, 18, 18, 18}; + + auto element_shapes = std::vector>{{2, 3}, {3, 2}, {3, 6}, + {6, 3}, {3, 2, 3}, {3, 2, 3}}; + auto tensor_shapes = std::vector>{ + {6, 2, 3}, {6, 2, 3}, {2, 3, 6}, {2, 3, 6}, {2, 3, 2, 3}, {2, 3, 2, 3}}; + + auto element_permutations = std::vector>{ + {0, 1}, {1, 0}, {0, 1}, {1, 0}, {0, 1, 2}, {2, 1, 0}}; + auto tensor_strides_32 = + std::vector>{{24, 12, 4}, {24, 4, 8}, {72, 24, 4}, + {72, 4, 12}, {72, 24, 12, 4}, {72, 4, 12, 24}}; + auto tensor_strides_64 = + std::vector>{{48, 24, 8}, {48, 8, 16}, {144, 48, 8}, + {144, 8, 24}, {144, 48, 24, 8}, {144, 8, 24, 48}}; + + auto element_dim_names = std::vector>{ + {"y", "z"}, {"z", "y"}, {"y", "z"}, {"z", "y"}, {"H", "W", "C"}, {"H", "W", "C"}}; + auto tensor_dim_names = std::vector>{ + {"", "y", "z"}, {"", "y", "z"}, {"", "y", "z"}, + {"", "y", "z"}, {"", "H", "W", "C"}, {"", "C", "W", "H"}}; + + for (size_t i = 0; i < element_shapes.size(); i++) { + CheckToTensor(float_values, float32(), element_sizes[i], element_shapes[i], + element_permutations[i], element_dim_names[i], tensor_shapes[i], + tensor_dim_names[i], tensor_strides_32[i]); + CheckToTensor(values_, int64(), element_sizes[i], element_shapes[i], + element_permutations[i], element_dim_names[i], tensor_shapes[i], + tensor_dim_names[i], tensor_strides_64[i]); + } +} + void CheckTensorRoundtrip(const std::shared_ptr& tensor) { ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); ASSERT_OK_AND_ASSIGN(auto tensor_from_array, ext_arr->ToTensor()); @@ -364,7 +450,7 @@ TEST_F(TestExtensionType, SliceTensor) { Tensor::Make(value_type_, Buffer::Wrap(values_partial_), shape_partial_)); ASSERT_EQ(tensor->strides(), tensor_strides_); ASSERT_EQ(tensor_partial->strides(), tensor_strides_); - auto 
ext_type = fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_); + auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_); auto exact_ext_type = internal::checked_pointer_cast(ext_type_); ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); @@ -404,11 +490,11 @@ TEST_F(TestExtensionType, ComputeStrides) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); auto ext_type_1 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_)); + fixed_shape_tensor(int64(), element_shape_, {}, dim_names_)); auto ext_type_2 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_)); + fixed_shape_tensor(int64(), element_shape_, {}, dim_names_)); auto ext_type_3 = internal::checked_pointer_cast( - fixed_shape_tensor(int32(), cell_shape_, {}, dim_names_)); + fixed_shape_tensor(int32(), element_shape_, {}, dim_names_)); ASSERT_TRUE(ext_type_1->Equals(*ext_type_2)); ASSERT_FALSE(ext_type_1->Equals(*ext_type_3)); @@ -462,4 +548,96 @@ TEST_F(TestExtensionType, ToString) { ASSERT_EQ(expected_3, result_3); } +TEST_F(TestExtensionType, GetTensor) { + auto arr = ArrayFromJSON(element_type_, + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]," + "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]"); + auto element_values = + std::vector>{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}}; + + auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_); + auto permuted_ext_type = fixed_shape_tensor(value_type_, {3, 4}, {1, 0}, {"x", "y"}); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); + auto exact_permuted_ext_type = + internal::checked_pointer_cast(permuted_ext_type); + + auto array = std::static_pointer_cast( + ExtensionType::WrapArray(ext_type, arr)); + auto permuted_array = std::static_pointer_cast( + ExtensionType::WrapArray(permuted_ext_type, arr)); + + for (size_t i = 0; i < element_values.size(); i++) { + // Get tensor from extension array with trivial permutation + ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(i)); + auto actual_ext_scalar = internal::checked_pointer_cast(scalar); + ASSERT_OK_AND_ASSIGN(auto actual_tensor, + exact_ext_type->MakeTensor(actual_ext_scalar)); + ASSERT_OK(actual_tensor->Validate()); + ASSERT_OK_AND_ASSIGN(auto expected_tensor, + Tensor::Make(value_type_, Buffer::Wrap(element_values[i]), + {3, 4}, {}, {"x", "y"})); + ASSERT_EQ(expected_tensor->shape(), actual_tensor->shape()); + ASSERT_EQ(expected_tensor->dim_names(), actual_tensor->dim_names()); + ASSERT_EQ(expected_tensor->strides(), actual_tensor->strides()); + ASSERT_EQ(actual_tensor->strides(), std::vector({32, 8})); + ASSERT_EQ(expected_tensor->type(), actual_tensor->type()); + ASSERT_TRUE(expected_tensor->Equals(*actual_tensor)); + + // Get tensor from extension array with non-trivial permutation + ASSERT_OK_AND_ASSIGN(auto expected_permuted_tensor, + Tensor::Make(value_type_, Buffer::Wrap(element_values[i]), + {4, 3}, {8, 24}, {"y", "x"})); + ASSERT_OK_AND_ASSIGN(scalar, permuted_array->GetScalar(i)); + ASSERT_OK_AND_ASSIGN(auto actual_permuted_tensor, + exact_permuted_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + ASSERT_OK(actual_permuted_tensor->Validate()); + ASSERT_EQ(expected_permuted_tensor->strides(), actual_permuted_tensor->strides()); + ASSERT_EQ(expected_permuted_tensor->shape(), actual_permuted_tensor->shape()); + ASSERT_EQ(expected_permuted_tensor->dim_names(), 
actual_permuted_tensor->dim_names()); + ASSERT_EQ(expected_permuted_tensor->type(), actual_permuted_tensor->type()); + ASSERT_EQ(expected_permuted_tensor->is_contiguous(), + actual_permuted_tensor->is_contiguous()); + ASSERT_EQ(expected_permuted_tensor->is_column_major(), + actual_permuted_tensor->is_column_major()); + ASSERT_TRUE(expected_permuted_tensor->Equals(*actual_permuted_tensor)); + } + + // Test null values fail + auto element_type = fixed_size_list(int64(), 1); + auto fsla_arr = ArrayFromJSON(element_type, "[[1], [null], null]"); + ext_type = fixed_shape_tensor(int64(), {1}); + exact_ext_type = internal::checked_pointer_cast(ext_type); + auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr); + auto tensor_array = internal::checked_pointer_cast(ext_arr); + + ASSERT_OK_AND_ASSIGN(auto scalar, tensor_array->GetScalar(0)); + ASSERT_OK_AND_ASSIGN(auto tensor, + exact_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + + ASSERT_OK_AND_ASSIGN(scalar, tensor_array->GetScalar(1)); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid: Cannot convert data with nulls to Tensor."), + exact_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + + ASSERT_OK_AND_ASSIGN(scalar, tensor_array->GetScalar(2)); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid: Cannot convert data with nulls to Tensor."), + exact_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + + element_type = list(utf8()); + ext_type = fixed_shape_tensor(utf8(), {1}); + exact_ext_type = internal::checked_pointer_cast(ext_type); + scalar = std::make_shared(ArrayFromJSON(element_type, R"([["a", "b"]])")); + auto ext_scalar = std::make_shared(scalar, ext_type); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + testing::HasSubstr("Type error: Cannot convert non-fixed-width values to Tensor."), + exact_ext_type->MakeTensor(ext_scalar)); +} + } // namespace arrow diff --git a/cpp/src/arrow/extension/tensor_internal.h b/cpp/src/arrow/extension/tensor_internal.h new file mode 100644 index 0000000000000..069880cb17c85 --- /dev/null +++ b/cpp/src/arrow/extension/tensor_internal.h @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/print.h" + +namespace arrow::internal { + +ARROW_EXPORT +Status IsPermutationValid(const std::vector& permutation) { + const auto size = static_cast(permutation.size()); + std::vector dim_seen(size, 0); + + for (const auto p : permutation) { + if (p < 0 || p >= size || dim_seen[p] != 0) { + return Status::Invalid( + "Permutation indices for ", size, + " dimensional tensors must be unique and within [0, ", size - 1, + "] range. 
Got: ", ::arrow::internal::PrintVector{permutation, ","}); + } + dim_seen[p] = 1; + } + return Status::OK(); +} + +} // namespace arrow::internal diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index a5179c22190e1..11750591932e9 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. +#include +#include +#include +#include + #include "arrow/filesystem/azurefs.h" #include "arrow/filesystem/azurefs_internal.h" @@ -283,18 +288,26 @@ struct AzureLocation { template Status ExceptionToStatus(const Storage::StorageException& exception, PrefixArgs&&... prefix_args) { - return Status::IOError(std::forward(prefix_args)..., - " Azure Error: ", exception.what()); + return Status::IOError(std::forward(prefix_args)..., " Azure Error: [", + exception.ErrorCode, "] ", exception.what()); } Status PathNotFound(const AzureLocation& location) { return ::arrow::fs::internal::PathNotFound(location.all); } +Status NotADir(const AzureLocation& location) { + return ::arrow::fs::internal::NotADir(location.all); +} + Status NotAFile(const AzureLocation& location) { return ::arrow::fs::internal::NotAFile(location.all); } +Status NotEmpty(const AzureLocation& location) { + return ::arrow::fs::internal::NotEmpty(location.all); +} + Status ValidateFileLocation(const AzureLocation& location) { if (location.container.empty()) { return PathNotFound(location); @@ -305,6 +318,23 @@ Status ValidateFileLocation(const AzureLocation& location) { return internal::AssertNoTrailingSlash(location.path); } +Status InvalidDirMoveToSubdir(const AzureLocation& src, const AzureLocation& dest) { + return Status::Invalid("Cannot Move to '", dest.all, "' and make '", src.all, + "' a sub-directory of itself."); +} + +Status DestinationParentPathNotFound(const AzureLocation& dest) { + return Status::IOError("The parent directory of the destination path '", dest.all, + "' does not exist."); +} + +Status CrossContainerMoveNotImplemented(const AzureLocation& src, + const AzureLocation& dest) { + return Status::NotImplemented( + "Move of '", src.all, "' to '", dest.all, + "' requires moving data between containers, which is not implemented."); +} + bool IsContainerNotFound(const Storage::StorageException& e) { // In some situations, only the ReasonPhrase is set and the // ErrorCode is empty, so we check both. @@ -942,6 +972,185 @@ FileInfo FileInfoFromBlob(std::string_view container, return info; } +/// \brief RAII-style guard for releasing a lease on a blob or container. +/// +/// The guard should be constructed right after a successful BlobLeaseClient::Acquire() +/// call. Use std::optional to declare a guard in outer scope and construct it +/// later with std::optional::emplace(...). +/// +/// Leases expire automatically, but explicit release means concurrent clients or +/// ourselves when trying new operations on the same blob or container don't have +/// to wait for the lease to expire by itself. +/// +/// Learn more about leases at +/// https://learn.microsoft.com/en-us/rest/api/storageservices/lease-blob +class LeaseGuard { + public: + using SteadyClock = std::chrono::steady_clock; + + private: + /// \brief The time when the lease expires or is broken. + /// + /// The lease is not guaranteed to be valid until this time, but it is guaranteed to + /// be expired after this time. In other words, this is an overestimation of + /// the true time_point. 
+ SteadyClock::time_point break_or_expires_at_; + const std::unique_ptr lease_client_; + bool release_attempt_pending_ = true; + + /// \brief The latest known expiry time of a lease guarded by this class + /// that failed to be released or was forgotten by calling Forget(). + static std::atomic latest_known_expiry_time_; + + /// \brief The maximum lease duration supported by Azure Storage. + static constexpr std::chrono::seconds kMaxLeaseDuration{60}; + + public: + LeaseGuard(std::unique_ptr lease_client, + std::chrono::seconds lease_duration) + : break_or_expires_at_(SteadyClock::now() + + std::min(kMaxLeaseDuration, lease_duration)), + lease_client_(std::move(lease_client)) { + DCHECK(lease_duration <= kMaxLeaseDuration); + DCHECK(this->lease_client_); + } + + ARROW_DISALLOW_COPY_AND_ASSIGN(LeaseGuard); + + ~LeaseGuard() { + // No point in trying any error handling here other than the debug checking. The lease + // will eventually expire on the backend without any intervention from us (just much + // later than if we released it). + [[maybe_unused]] auto status = Release(); + ARROW_LOG(DEBUG) << status; + } + + bool PendingRelease() const { + return release_attempt_pending_ && SteadyClock::now() <= break_or_expires_at_; + } + + private: + Status DoRelease() { + DCHECK(release_attempt_pending_); + try { + lease_client_->Release(); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus(exception, "Failed to release the ", + lease_client_->GetLeaseId(), " lease"); + } + return Status::OK(); + } + + public: + std::string LeaseId() const { return lease_client_->GetLeaseId(); } + + bool StillValidFor(SteadyClock::duration expected_time_left) const { + return SteadyClock::now() + expected_time_left < break_or_expires_at_; + } + + /// \brief Break the lease. + /// + /// The lease will stay in the "Breaking" state for break_period seconds or + /// less if the lease is expiring before that. + /// + /// https://learn.microsoft.com/en-us/rest/api/storageservices/lease-container#outcomes-of-use-attempts-on-containers-by-lease-state + /// https://learn.microsoft.com/en-us/rest/api/storageservices/lease-blob#outcomes-of-use-attempts-on-blobs-by-lease-state + Status Break(Azure::Nullable break_period = {}) { + auto remaining_time_ms = [this]() { + const auto remaining_time = break_or_expires_at_ - SteadyClock::now(); + return std::chrono::duration_cast(remaining_time); + }; +#ifndef NDEBUG + if (break_period.HasValue() && !StillValidFor(*break_period)) { + ARROW_LOG(WARNING) + << "Azure Storage: requested break_period (" + << break_period.ValueOr(std::chrono::seconds{0}).count() + << "s) is too long or lease duration is too short for all the operations " + "performed so far (lease expires in " + << remaining_time_ms().count() << "ms)"; + } +#endif + Blobs::BreakLeaseOptions options; + options.BreakPeriod = break_period; + try { + lease_client_->Break(options); + break_or_expires_at_ = + std::min(break_or_expires_at_, + SteadyClock::now() + break_period.ValueOr(std::chrono::seconds{0})); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus(exception, "Failed to break the ", + lease_client_->GetLeaseId(), " lease expiring in ", + remaining_time_ms().count(), "ms"); + } + return Status::OK(); + } + + /// \brief Break the lease before deleting or renaming the resource. 
+ /// + /// Calling this is recommended when the resource for which the lease was acquired is + /// about to be deleted as there is no way of releasing the lease after that, we can + /// only forget about it. The break_period should be a conservative estimate of the time + /// it takes to delete/rename the resource. + /// + /// If break_period is too small, the delete/rename will fail with a lease conflict, + /// and if it's too large the only consequence is that a lease on a non-existent + /// resource will remain in the "Breaking" state for a while blocking others + /// from recreating the resource. + void BreakBeforeDeletion(std::chrono::seconds break_period) { + ARROW_CHECK_OK(Break(break_period)); + } + + // These functions are marked ARROW_NOINLINE because they are called from + // multiple locations, but are not performance-critical. + + ARROW_NOINLINE Status Release() { + if (!PendingRelease()) { + return Status::OK(); + } + auto status = DoRelease(); + if (!status.ok()) { + Forget(); + return status; + } + release_attempt_pending_ = false; + return Status::OK(); + } + + /// \brief Prevent any release attempts in the destructor. + /// + /// When it's known they would certainly fail. + /// \see LeaseGuard::BreakBeforeDeletion() + ARROW_NOINLINE void Forget() { + if (!PendingRelease()) { + release_attempt_pending_ = false; + return; + } + release_attempt_pending_ = false; + // Remember the latest known expiry time so we can gracefully handle lease + // acquisition failures by waiting until the latest forgotten lease. + auto latest = latest_known_expiry_time_.load(std::memory_order_relaxed); + while ( + latest < break_or_expires_at_ && + !latest_known_expiry_time_.compare_exchange_weak(latest, break_or_expires_at_)) { + } + DCHECK_GE(latest_known_expiry_time_.load(), break_or_expires_at_); + } + + ARROW_NOINLINE static void WaitUntilLatestKnownExpiryTime() { + auto remaining_time = latest_known_expiry_time_.load() - SteadyClock::now(); +#ifndef NDEBUG + int64_t remaining_time_ms = + std::chrono::duration_cast(remaining_time).count(); + ARROW_LOG(WARNING) << "LeaseGuard::WaitUntilLatestKnownExpiryTime for " + << remaining_time_ms << "ms..."; +#endif + DCHECK(remaining_time <= kMaxLeaseDuration); + if (remaining_time > SteadyClock::duration::zero()) { + std::this_thread::sleep_for(remaining_time); + } + } +}; + } // namespace class AzureFileSystem::Impl { @@ -975,6 +1184,11 @@ class AzureFileSystem::Impl { return blob_service_client_->GetBlobContainerClient(container_name); } + Blobs::BlobClient GetBlobClient(const std::string& container_name, + const std::string& blob_name) { + return GetBlobContainerClient(container_name).GetBlobClient(blob_name); + } + /// \param container_name Also known as "filesystem" in the ADLS Gen2 API. DataLake::DataLakeFileSystemClient GetFileSystemClient( const std::string& container_name) { @@ -1030,11 +1244,14 @@ class AzureFileSystem::Impl { /// \pre location.path is not empty. 
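The LeaseGuard introduced above acquires a lease for a bounded duration, tracks a conservative expiry deadline, and releases the lease in its destructor so other clients never have to wait for the lease to expire on the service side. The sketch below shows only that core RAII idea under simplified assumptions; SimpleLeaseGuard and FakeLeaseClient are invented stand-ins, not the class from this diff or the Azure SDK's BlobLeaseClient.

    #include <algorithm>
    #include <chrono>
    #include <iostream>
    #include <memory>
    #include <string>

    // Made-up stand-in for the Azure SDK lease client; only Release() matters here.
    struct FakeLeaseClient {
      std::string lease_id;
      void Release() { std::cout << "released lease " << lease_id << std::endl; }
    };

    class SimpleLeaseGuard {
     public:
      using Clock = std::chrono::steady_clock;
      static constexpr std::chrono::seconds kMaxLeaseDuration{60};

      SimpleLeaseGuard(std::unique_ptr<FakeLeaseClient> client,
                       std::chrono::seconds lease_duration)
          : expires_at_(Clock::now() + std::min(kMaxLeaseDuration, lease_duration)),
            client_(std::move(client)) {}

      // Explicit release lets other clients acquire a new lease immediately instead
      // of waiting for this one to expire on its own.
      void Release() {
        if (PendingRelease()) {
          client_->Release();
          pending_ = false;
        }
      }

      // Releasing in the destructor is the RAII part: dropping the guard still releases.
      ~SimpleLeaseGuard() { Release(); }

      // True while a release is still useful: not done yet and not expired yet.
      bool PendingRelease() const { return pending_ && Clock::now() <= expires_at_; }

      // Conservative check that the lease will still be held for `time_left` more time.
      bool StillValidFor(Clock::duration time_left) const {
        return Clock::now() + time_left < expires_at_;
      }

     private:
      Clock::time_point expires_at_;
      std::unique_ptr<FakeLeaseClient> client_;
      bool pending_ = true;
    };

    int main() {
      auto client = std::make_unique<FakeLeaseClient>(FakeLeaseClient{"lease-123"});
      SimpleLeaseGuard guard{std::move(client), std::chrono::seconds{15}};
      std::cout << "still valid for 3s? " << guard.StillValidFor(std::chrono::seconds{3})
                << std::endl;
    }  // guard goes out of scope here and releases the lease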
Result GetFileInfo(const DataLake::DataLakeFileSystemClient& adlfs_client, - const AzureLocation& location) { + const AzureLocation& location, + Azure::Nullable lease_id = {}) { auto file_client = adlfs_client.GetFileClient(location.path); + DataLake::GetPathPropertiesOptions options; + options.AccessConditions.LeaseId = std::move(lease_id); try { FileInfo info{location.all}; - auto properties = file_client.GetProperties(); + auto properties = file_client.GetProperties(options); if (properties.Value.IsDirectory) { info.set_type(FileType::Directory); } else if (internal::HasTrailingSlash(location.path)) { @@ -1115,6 +1332,22 @@ class AzureFileSystem::Impl { } } + Result GetFileInfoOfPathWithinContainer(const AzureLocation& location) { + DCHECK(!location.container.empty() && !location.path.empty()); + // There is a path to search within the container. Check HNS support to proceed. + auto adlfs_client = GetFileSystemClient(location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + return FileInfo{location.all, FileType::NotFound}; + } + if (hns_support == HNSSupport::kEnabled) { + return GetFileInfo(adlfs_client, location); + } + DCHECK_EQ(hns_support, HNSSupport::kDisabled); + auto container_client = GetBlobContainerClient(location.container); + return GetFileInfo(container_client, location); + } + private: /// \pref location.container is not empty. template @@ -1320,8 +1553,7 @@ class AzureFileSystem::Impl { AzureFileSystem* fs) { RETURN_NOT_OK(ValidateFileLocation(location)); auto blob_client = std::make_shared( - blob_service_client_->GetBlobContainerClient(location.container) - .GetBlobClient(location.path)); + GetBlobClient(location.container, location.path)); auto ptr = std::make_shared(blob_client, fs->io_context(), std::move(location)); @@ -1340,8 +1572,7 @@ class AzureFileSystem::Impl { ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(info.path())); RETURN_NOT_OK(ValidateFileLocation(location)); auto blob_client = std::make_shared( - blob_service_client_->GetBlobContainerClient(location.container) - .GetBlobClient(location.path)); + GetBlobClient(location.container, location.path)); auto ptr = std::make_shared(blob_client, fs->io_context(), std::move(location), info.size()); @@ -1404,7 +1635,8 @@ class AzureFileSystem::Impl { return CreateDirTemplate( adlfs_client, [](const auto& adlfs_client, const auto& location) { - auto directory_client = adlfs_client.GetDirectoryClient(location.path); + auto directory_client = adlfs_client.GetDirectoryClient( + std::string(internal::RemoveTrailingSlash(location.path))); directory_client.CreateIfNotExists(); }, location, recursive); @@ -1625,13 +1857,17 @@ class AzureFileSystem::Impl { /// \pre location.path is not empty. Status DeleteDirOnFileSystem(const DataLake::DataLakeFileSystemClient& adlfs_client, const AzureLocation& location, bool recursive, - bool require_dir_to_exist) { + bool require_dir_to_exist, + Azure::Nullable lease_id = {}) { DCHECK(!location.container.empty()); DCHECK(!location.path.empty()); - auto directory_client = adlfs_client.GetDirectoryClient(location.path); + auto directory_client = adlfs_client.GetDirectoryClient( + std::string(internal::RemoveTrailingSlash(location.path))); + DataLake::DeleteDirectoryOptions options; + options.AccessConditions.LeaseId = std::move(lease_id); try { - auto response = - recursive ? 
directory_client.DeleteRecursive() : directory_client.DeleteEmpty(); + auto response = recursive ? directory_client.DeleteRecursive(options) + : directory_client.DeleteEmpty(options); // Only the "*IfExists" functions ever set Deleted to false. // All the others either succeed or throw an exception. DCHECK(response.Value.Deleted); @@ -1690,17 +1926,460 @@ class AzureFileSystem::Impl { } } + Status DeleteFile(const AzureLocation& location) { + RETURN_NOT_OK(ValidateFileLocation(location)); + auto file_client = datalake_service_client_->GetFileSystemClient(location.container) + .GetFileClient(location.path); + try { + auto response = file_client.Delete(); + // Only the "*IfExists" functions ever set Deleted to false. + // All the others either succeed or throw an exception. + DCHECK(response.Value.Deleted); + } catch (const Storage::StorageException& exception) { + if (exception.ErrorCode == "FilesystemNotFound" || + exception.ErrorCode == "PathNotFound") { + return PathNotFound(location); + } + return ExceptionToStatus(exception, "Failed to delete a file: ", location.path, + ": ", file_client.GetUrl()); + } + return Status::OK(); + } + + private: + /// \brief Create a BlobLeaseClient and acquire a lease on the container. + /// + /// \param allow_missing_container if true, a nullptr may be returned when the container + /// doesn't exist, otherwise a PathNotFound(location) error is produced right away + /// \return A BlobLeaseClient is wrapped as a unique_ptr so it's moveable and + /// optional (nullptr denotes container not found) + Result> AcquireContainerLease( + const AzureLocation& location, std::chrono::seconds lease_duration, + bool allow_missing_container = false, bool retry_allowed = true) { + DCHECK(!location.container.empty()); + auto container_client = GetBlobContainerClient(location.container); + auto lease_id = Blobs::BlobLeaseClient::CreateUniqueLeaseId(); + auto container_url = container_client.GetUrl(); + auto lease_client = std::make_unique( + std::move(container_client), std::move(lease_id)); + try { + [[maybe_unused]] auto result = lease_client->Acquire(lease_duration); + DCHECK_EQ(result.Value.LeaseId, lease_client->GetLeaseId()); + } catch (const Storage::StorageException& exception) { + if (IsContainerNotFound(exception)) { + if (allow_missing_container) { + return nullptr; + } + return PathNotFound(location); + } else if (exception.StatusCode == Http::HttpStatusCode::Conflict && + exception.ErrorCode == "LeaseAlreadyPresent") { + if (retry_allowed) { + LeaseGuard::WaitUntilLatestKnownExpiryTime(); + return AcquireContainerLease(location, lease_duration, allow_missing_container, + /*retry_allowed=*/false); + } + } + return ExceptionToStatus(exception, "Failed to acquire a lease on container '", + location.container, "': ", container_url); + } + return lease_client; + } + + /// \brief Create a BlobLeaseClient and acquire a lease on a blob/file (or + /// directory if Hierarchical Namespace is supported). 
+ /// + /// \param allow_missing if true, a nullptr may be returned when the blob + /// doesn't exist, otherwise a PathNotFound(location) error is produced right away + /// \return A BlobLeaseClient is wrapped as a unique_ptr so it's moveable and + /// optional (nullptr denotes blob not found) + Result> AcquireBlobLease( + const AzureLocation& location, std::chrono::seconds lease_duration, + bool allow_missing = false, bool retry_allowed = true) { + DCHECK(!location.container.empty() && !location.path.empty()); + auto path = std::string{internal::RemoveTrailingSlash(location.path)}; + auto blob_client = GetBlobClient(location.container, std::move(path)); + auto lease_id = Blobs::BlobLeaseClient::CreateUniqueLeaseId(); + auto blob_url = blob_client.GetUrl(); + auto lease_client = std::make_unique(std::move(blob_client), + std::move(lease_id)); + try { + [[maybe_unused]] auto result = lease_client->Acquire(lease_duration); + DCHECK_EQ(result.Value.LeaseId, lease_client->GetLeaseId()); + } catch (const Storage::StorageException& exception) { + if (exception.StatusCode == Http::HttpStatusCode::NotFound) { + if (allow_missing) { + return nullptr; + } + return PathNotFound(location); + } else if (exception.StatusCode == Http::HttpStatusCode::Conflict && + exception.ErrorCode == "LeaseAlreadyPresent") { + if (retry_allowed) { + LeaseGuard::WaitUntilLatestKnownExpiryTime(); + return AcquireBlobLease(location, lease_duration, allow_missing, + /*retry_allowed=*/false); + } + } + return ExceptionToStatus(exception, "Failed to acquire a lease on file '", + location.all, "': ", blob_url); + } + return lease_client; + } + + /// \brief The default lease duration used for acquiring a lease on a container or blob. + /// + /// Azure Storage leases can be acquired for a duration of 15 to 60 seconds. + /// + /// Operations consisting of an unpredictable number of sub-operations should + /// renew the lease periodically (heartbeat pattern) instead of acquiring an + /// infinite lease (very bad idea for a library like Arrow). + static constexpr auto kLeaseDuration = std::chrono::seconds{15}; + + // These are conservative estimates of how long it takes for the client + // request to reach the server counting from the moment the Azure SDK function + // issuing the request is called. See their usage for more context. + // + // If the client connection to the server is unpredictably slow, operations + // may fail, but due to the use of leases, the entire arrow::FileSystem + // operation can be retried without risk of data loss. Thus, unexpected network + // slow downs can be fixed with retries (either by some system using Arrow or + // an user interactively retrying a failed operation). + // + // If a network is permanently slow, the lease time and these numbers should be + // increased but not so much so that the client gives up an operation because the + // values say it takes more time to reach the server than the remaining lease + // time on the resources. + // + // NOTE: The initial constant values were chosen conservatively. If we learn, + // from experience, that they are causing issues, we can increase them. And if + // broadly applicable values aren't possible, we can make them configurable. 
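Forget() and WaitUntilLatestKnownExpiryTime() above cooperate through a lock-free "atomic maximum": every forgotten lease raises a shared latest-expiry time with a compare_exchange_weak loop, and a later acquisition that hits LeaseAlreadyPresent sleeps until that time before retrying once. A minimal standalone version of the atomic-maximum idiom, with made-up names and not taken from the Arrow code, looks like this:

    #include <atomic>
    #include <chrono>
    #include <iostream>

    using Clock = std::chrono::steady_clock;

    // Shared "latest known expiry" raised by every forgotten lease (illustrative only).
    std::atomic<Clock::time_point> latest_known_expiry{Clock::time_point{}};

    // Raise the shared value to `candidate` if it is later; compare_exchange_weak
    // reloads `latest` on failure, so concurrent callers converge on the maximum.
    void RememberExpiry(Clock::time_point candidate) {
      auto latest = latest_known_expiry.load(std::memory_order_relaxed);
      while (latest < candidate &&
             !latest_known_expiry.compare_exchange_weak(latest, candidate)) {
      }
    }

    int main() {
      const auto now = Clock::now();
      RememberExpiry(now + std::chrono::seconds{15});
      RememberExpiry(now + std::chrono::seconds{5});  // earlier: leaves the maximum alone
      const auto remaining = latest_known_expiry.load() - now;
      std::cout << std::chrono::duration_cast<std::chrono::seconds>(remaining).count()
                << "s until the latest remembered lease expiry" << std::endl;
    }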
+ static constexpr auto kTimeNeededForContainerDeletion = std::chrono::seconds{3}; + static constexpr auto kTimeNeededForContainerRename = std::chrono::seconds{3}; + static constexpr auto kTimeNeededForFileOrDirectoryRename = std::chrono::seconds{3}; + + public: + /// The conditions for a successful container rename are derived from the + /// conditions for a successful `Move("/$src.container", "/$dest.container")`. + /// The numbers here match the list in `Move`. + /// + /// 1. `src.container` must exist. + /// 2. If `src.container` and `dest.container` are the same, do nothing and + /// return OK. + /// 3. N/A. + /// 4. N/A. + /// 5. `dest.container` doesn't exist or is empty. + /// NOTE: one exception related to container Move is that when the + /// destination is empty we also require the source container to be empty, + /// because the only way to perform the "Move" is by deleting the source + /// instead of deleting the destination and renaming the source. + Status RenameContainer(const AzureLocation& src, const AzureLocation& dest) { + DCHECK(!src.container.empty() && src.path.empty()); + DCHECK(!dest.container.empty() && dest.path.empty()); + auto src_container_client = GetBlobContainerClient(src.container); + + // If src and dest are the same, we only have to check src exists. + if (src.container == dest.container) { + ARROW_ASSIGN_OR_RAISE(auto info, + GetContainerPropsAsFileInfo(src, src_container_client)); + if (info.type() == FileType::NotFound) { + return PathNotFound(src); + } + DCHECK(info.type() == FileType::Directory); + return Status::OK(); + } + + // Acquire a lease on the source container because (1) we need the lease + // before rename and (2) it works as a way of checking the container exists. + ARROW_ASSIGN_OR_RAISE(auto src_lease_client, + AcquireContainerLease(src, kLeaseDuration)); + LeaseGuard src_lease_guard{std::move(src_lease_client), kLeaseDuration}; + // Check dest.container doesn't exist or is empty. + auto dest_container_client = GetBlobContainerClient(dest.container); + std::optional dest_lease_guard; + bool dest_exists = false; + bool dest_is_empty = false; + ARROW_ASSIGN_OR_RAISE( + auto dest_lease_client, + AcquireContainerLease(dest, kLeaseDuration, /*allow_missing_container*/ true)); + if (dest_lease_client) { + dest_lease_guard.emplace(std::move(dest_lease_client), kLeaseDuration); + dest_exists = true; + // Emptiness check after successful acquisition of the lease. + Blobs::ListBlobsOptions list_blobs_options; + list_blobs_options.PageSizeHint = 1; + try { + auto dest_list_response = dest_container_client.ListBlobs(list_blobs_options); + dest_is_empty = dest_list_response.Blobs.empty(); + if (!dest_is_empty) { + return NotEmpty(dest); + } + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus(exception, "Failed to check that '", dest.container, + "' is empty: ", dest_container_client.GetUrl()); + } + } + DCHECK(!dest_exists || dest_is_empty); + + if (!dest_exists) { + // Rename the source container. 
+ Blobs::RenameBlobContainerOptions options; + options.SourceAccessConditions.LeaseId = src_lease_guard.LeaseId(); + try { + src_lease_guard.BreakBeforeDeletion(kTimeNeededForContainerRename); + blob_service_client_->RenameBlobContainer(src.container, dest.container, options); + src_lease_guard.Forget(); + } catch (const Storage::StorageException& exception) { + if (exception.StatusCode == Http::HttpStatusCode::BadRequest && + exception.ErrorCode == "InvalidQueryParameterValue") { + auto param_name = exception.AdditionalInformation.find("QueryParameterName"); + if (param_name != exception.AdditionalInformation.end() && + param_name->second == "comp") { + return ExceptionToStatus( + exception, "The 'rename' operation is not supported on containers. ", + "Attempting a rename of '", src.container, "' to '", dest.container, + "': ", blob_service_client_->GetUrl()); + } + } + return ExceptionToStatus(exception, "Failed to rename container '", src.container, + "' to '", dest.container, + "': ", blob_service_client_->GetUrl()); + } + } else if (dest_is_empty) { + // Even if we deleted the empty dest.container, RenameBlobContainer() would still + // fail because containers are not immediately deleted by the service -- they are + // deleted asynchronously based on retention policies and non-deterministic behavior + // of the garbage collection process. + // + // One way to still match POSIX rename semantics is to delete the src.container if + // and only if it's empty because the final state would be equivalent to replacing + // the dest.container with the src.container and its contents (which happens + // to also be empty). + Blobs::ListBlobsOptions list_blobs_options; + list_blobs_options.PageSizeHint = 1; + try { + auto src_list_response = src_container_client.ListBlobs(list_blobs_options); + if (!src_list_response.Blobs.empty()) { + // Reminder: dest is used here because we're semantically replacing dest + // with src. By deleting src if it's empty just like dest. + return Status::IOError("Unable to replace empty container: '", dest.all, "'"); + } + // Delete the source container now that we know it's empty. + Blobs::DeleteBlobContainerOptions options; + options.AccessConditions.LeaseId = src_lease_guard.LeaseId(); + DCHECK(dest_lease_guard.has_value()); + // Make sure lease on dest is still valid before deleting src. This is to ensure + // the destination container is not deleted by another process/client before + // Move() returns. + if (!dest_lease_guard->StillValidFor(kTimeNeededForContainerDeletion)) { + return Status::IOError("Unable to replace empty container: '", dest.all, "'. ", + "Preparation for the operation took too long and a " + "container lease expired."); + } + try { + src_lease_guard.BreakBeforeDeletion(kTimeNeededForContainerDeletion); + src_container_client.Delete(options); + src_lease_guard.Forget(); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus(exception, "Failed to delete empty container: '", + src.container, "': ", src_container_client.GetUrl()); + } + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus(exception, "Unable to replace empty container: '", + dest.all, "': ", dest_container_client.GetUrl()); + } + } + return Status::OK(); + } + + Status MoveContainerToPath(const AzureLocation& src, const AzureLocation& dest) { + DCHECK(!src.container.empty() && src.path.empty()); + DCHECK(!dest.container.empty() && !dest.path.empty()); + // Check Move pre-condition 1 -- `src` must exist. 
+ auto container_client = GetBlobContainerClient(src.container); + ARROW_ASSIGN_OR_RAISE(auto src_info, + GetContainerPropsAsFileInfo(src, container_client)); + if (src_info.type() == FileType::NotFound) { + return PathNotFound(src); + } + // Check Move pre-condition 2. + if (src.container == dest.container) { + return InvalidDirMoveToSubdir(src, dest); + } + // Instead of checking more pre-conditions, we just abort with a + // NotImplemented status. + return CrossContainerMoveNotImplemented(src, dest); + } + + Status CreateContainerFromPath(const AzureLocation& src, const AzureLocation& dest) { + DCHECK(!src.container.empty() && !src.path.empty()); + DCHECK(!dest.empty() && dest.path.empty()); + ARROW_ASSIGN_OR_RAISE(auto src_file_info, GetFileInfoOfPathWithinContainer(src)); + switch (src_file_info.type()) { + case FileType::Unknown: + case FileType::NotFound: + return PathNotFound(src); + case FileType::File: + return Status::Invalid( + "Creating files at '/' is not possible, only directories."); + case FileType::Directory: + break; + } + if (src.container == dest.container) { + return InvalidDirMoveToSubdir(src, dest); + } + return CrossContainerMoveNotImplemented(src, dest); + } + + Status MovePathWithDataLakeAPI( + const DataLake::DataLakeFileSystemClient& src_adlfs_client, + const AzureLocation& src, const AzureLocation& dest) { + DCHECK(!src.container.empty() && !src.path.empty()); + DCHECK(!dest.container.empty() && !dest.path.empty()); + const auto src_path = std::string{internal::RemoveTrailingSlash(src.path)}; + const auto dest_path = std::string{internal::RemoveTrailingSlash(dest.path)}; + + // Ensure that src exists and, if path has a trailing slash, that it's a directory. + ARROW_ASSIGN_OR_RAISE(auto src_lease_client, AcquireBlobLease(src, kLeaseDuration)); + LeaseGuard src_lease_guard{std::move(src_lease_client), kLeaseDuration}; + // It might be necessary to check src is a directory 0-3 times in this function, + // so we use a lazy evaluation function to avoid redundant calls to GetFileInfo(). + std::optional src_is_dir_opt{}; + auto src_is_dir_lazy = [&]() -> Result { + if (src_is_dir_opt.has_value()) { + return *src_is_dir_opt; + } + ARROW_ASSIGN_OR_RAISE( + auto src_info, GetFileInfo(src_adlfs_client, src, src_lease_guard.LeaseId())); + src_is_dir_opt = src_info.type() == FileType::Directory; + return *src_is_dir_opt; + }; + // src must be a directory if it has a trailing slash. + if (internal::HasTrailingSlash(src.path)) { + ARROW_ASSIGN_OR_RAISE(auto src_is_dir, src_is_dir_lazy()); + if (!src_is_dir) { + return NotADir(src); + } + } + // The Azure SDK and the backend don't perform many important checks, so we have to + // do them ourselves. Additionally, based on many tests on a default-configuration + // storage account, if the destination is an empty directory, the rename operation + // will most likely fail due to a timeout. Providing both leases -- to source and + // destination -- seems to have made things work. + ARROW_ASSIGN_OR_RAISE(auto dest_lease_client, + AcquireBlobLease(dest, kLeaseDuration, /*allow_missing=*/true)); + std::optional dest_lease_guard; + if (dest_lease_client) { + dest_lease_guard.emplace(std::move(dest_lease_client), kLeaseDuration); + // Perform all the checks on dest (and src) before proceeding with the rename. 
+ auto dest_adlfs_client = GetFileSystemClient(dest.container); + ARROW_ASSIGN_OR_RAISE(auto dest_info, GetFileInfo(dest_adlfs_client, dest, + dest_lease_guard->LeaseId())); + if (dest_info.type() == FileType::Directory) { + ARROW_ASSIGN_OR_RAISE(auto src_is_dir, src_is_dir_lazy()); + if (!src_is_dir) { + // If src is a regular file, complain that dest is a directory + // like POSIX rename() does. + return internal::IsADir(dest.all); + } + } else { + if (internal::HasTrailingSlash(dest.path)) { + return NotADir(dest); + } + } + } else { + // If the destination has trailing slash, we would have to check that it's a + // directory, but since it doesn't exist we must return PathNotFound... + if (internal::HasTrailingSlash(dest.path)) { + // ...unless the src is a directory, in which case we can proceed with the rename. + ARROW_ASSIGN_OR_RAISE(auto src_is_dir, src_is_dir_lazy()); + if (!src_is_dir) { + return PathNotFound(dest); + } + } + } + + try { + // NOTE: The Azure SDK provides a RenameDirectory() function, but the + // implementation is the same as RenameFile() with the only difference being + // the type of the returned object (DataLakeDirectoryClient vs DataLakeFileClient). + // + // If we call RenameDirectory() and the source is a file, no error would + // be returned and the file would be renamed just fine (!). + // + // Since we don't use the returned object, we can just use RenameFile() for both + // files and directories. Ideally, the SDK would simply expose the PathClient + // that it uses internally for both files and directories. + DataLake::RenameFileOptions options; + options.DestinationFileSystem = dest.container; + options.SourceAccessConditions.LeaseId = src_lease_guard.LeaseId(); + if (dest_lease_guard.has_value()) { + options.AccessConditions.LeaseId = dest_lease_guard->LeaseId(); + } + src_lease_guard.BreakBeforeDeletion(kTimeNeededForFileOrDirectoryRename); + src_adlfs_client.RenameFile(src_path, dest_path, options); + src_lease_guard.Forget(); + } catch (const Storage::StorageException& exception) { + // https://learn.microsoft.com/en-gb/rest/api/storageservices/datalakestoragegen2/path/create + if (exception.StatusCode == Http::HttpStatusCode::NotFound) { + if (exception.ErrorCode == "PathNotFound") { + return PathNotFound(src); + } + // "FilesystemNotFound" could be triggered by the source or destination filesystem + // not existing, but since we already checked the source filesystem exists (and + // hold a lease to it), we can assume the destination filesystem is the one the + // doesn't exist. + if (exception.ErrorCode == "FilesystemNotFound" || + exception.ErrorCode == "RenameDestinationParentPathNotFound") { + return DestinationParentPathNotFound(dest); + } + } else if (exception.StatusCode == Http::HttpStatusCode::Conflict && + exception.ErrorCode == "PathAlreadyExists") { + // "PathAlreadyExists" is only produced when the destination exists and is a + // non-empty directory, so we produce the appropriate error. 
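MovePathWithDataLakeAPI() above memoizes the "is src a directory?" answer in a std::optional so the GetFileInfo() call runs at most once even though up to three branches may need it. The self-contained sketch below shows that memoization pattern only; ExpensiveIsDirectoryLookup() is a hypothetical stand-in for the real lookup.

    #include <iostream>
    #include <optional>

    #include "arrow/result.h"
    #include "arrow/status.h"

    // Made-up stand-in for a lookup we want to run at most once.
    arrow::Result<bool> ExpensiveIsDirectoryLookup() {
      std::cout << "performing lookup..." << std::endl;  // printed at most once
      return true;
    }

    arrow::Status Demo(bool need_check_twice) {
      std::optional<bool> is_dir_opt{};
      auto is_dir_lazy = [&]() -> arrow::Result<bool> {
        if (is_dir_opt.has_value()) {
          return *is_dir_opt;  // served from the cache
        }
        ARROW_ASSIGN_OR_RAISE(auto is_dir, ExpensiveIsDirectoryLookup());
        is_dir_opt = is_dir;
        return *is_dir_opt;
      };
      ARROW_ASSIGN_OR_RAISE(auto first, is_dir_lazy());
      std::cout << "is directory: " << first << std::endl;
      if (need_check_twice) {
        ARROW_ASSIGN_OR_RAISE(auto second, is_dir_lazy());
        std::cout << "is directory (cached): " << second << std::endl;
      }
      return arrow::Status::OK();
    }

    int main() { return Demo(/*need_check_twice=*/true).ok() ? 0 : 1; }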
+ return NotEmpty(dest); + } + return ExceptionToStatus(exception, "Failed to rename '", src.all, "' to '", + dest.all, "': ", src_adlfs_client.GetUrl()); + } + return Status::OK(); + } + + Status MovePathUsingBlobsAPI(const AzureLocation& src, const AzureLocation& dest) { + DCHECK(!src.container.empty() && !src.path.empty()); + DCHECK(!dest.container.empty() && !dest.path.empty()); + if (src.container != dest.container) { + ARROW_ASSIGN_OR_RAISE(auto src_file_info, GetFileInfoOfPathWithinContainer(src)); + if (src_file_info.type() == FileType::NotFound) { + return PathNotFound(src); + } + return CrossContainerMoveNotImplemented(src, dest); + } + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + } + + Status MovePath(const AzureLocation& src, const AzureLocation& dest) { + DCHECK(!src.container.empty() && !src.path.empty()); + DCHECK(!dest.container.empty() && !dest.path.empty()); + auto src_adlfs_client = GetFileSystemClient(src.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, + HierarchicalNamespaceSupport(src_adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + return PathNotFound(src); + } + if (hns_support == HNSSupport::kEnabled) { + return MovePathWithDataLakeAPI(src_adlfs_client, src, dest); + } + DCHECK_EQ(hns_support, HNSSupport::kDisabled); + return MovePathUsingBlobsAPI(src, dest); + } + Status CopyFile(const AzureLocation& src, const AzureLocation& dest) { RETURN_NOT_OK(ValidateFileLocation(src)); RETURN_NOT_OK(ValidateFileLocation(dest)); if (src == dest) { return Status::OK(); } - auto dest_blob_client = blob_service_client_->GetBlobContainerClient(dest.container) - .GetBlobClient(dest.path); - auto src_url = blob_service_client_->GetBlobContainerClient(src.container) - .GetBlobClient(src.path) - .GetUrl(); + auto dest_blob_client = GetBlobClient(dest.container, dest.path); + auto src_url = GetBlobClient(src.container, src.path).GetUrl(); try { dest_blob_client.CopyFromUri(src_url); } catch (const Storage::StorageException& exception) { @@ -1711,6 +2390,9 @@ class AzureFileSystem::Impl { } }; +std::atomic LeaseGuard::latest_known_expiry_time_ = + SteadyClock::time_point{SteadyClock::duration::zero()}; + AzureFileSystem::AzureFileSystem(std::unique_ptr&& impl) : FileSystem(impl->io_context()), impl_(std::move(impl)) { default_async_is_sync_ = false; @@ -1752,19 +2434,7 @@ Result AzureFileSystem::GetFileInfo(const std::string& path) { auto container_client = impl_->GetBlobContainerClient(location.container); return GetContainerPropsAsFileInfo(location, container_client); } - // There is a path to search within the container. Check HNS support to proceed. 
- auto adlfs_client = impl_->GetFileSystemClient(location.container); - ARROW_ASSIGN_OR_RAISE(auto hns_support, - impl_->HierarchicalNamespaceSupport(adlfs_client)); - if (hns_support == HNSSupport::kContainerNotFound) { - return FileInfo{location.all, FileType::NotFound}; - } - if (hns_support == HNSSupport::kEnabled) { - return impl_->GetFileInfo(adlfs_client, location); - } - DCHECK_EQ(hns_support, HNSSupport::kDisabled); - auto container_client = impl_->GetBlobContainerClient(location.container); - return impl_->GetFileInfo(container_client, location); + return impl_->GetFileInfoOfPathWithinContainer(location); } Result AzureFileSystem::GetFileInfo(const FileSelector& select) { @@ -1875,11 +2545,29 @@ Status AzureFileSystem::DeleteRootDirContents() { } Status AzureFileSystem::DeleteFile(const std::string& path) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); + return impl_->DeleteFile(location); } Status AzureFileSystem::Move(const std::string& src, const std::string& dest) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + ARROW_ASSIGN_OR_RAISE(auto src_location, AzureLocation::FromString(src)); + ARROW_ASSIGN_OR_RAISE(auto dest_location, AzureLocation::FromString(dest)); + if (src_location.container.empty()) { + return Status::Invalid("Move requires a non-empty source path."); + } + if (dest_location.container.empty()) { + return Status::Invalid("Move requires a non-empty destination path."); + } + if (src_location.path.empty()) { + if (dest_location.path.empty()) { + return impl_->RenameContainer(src_location, dest_location); + } + return impl_->MoveContainerToPath(src_location, dest_location); + } + if (dest_location.path.empty()) { + return impl_->CreateContainerFromPath(src_location, dest_location); + } + return impl_->MovePath(src_location, dest_location); } Status AzureFileSystem::CopyFile(const std::string& src, const std::string& dest) { diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 55f89ba4776e2..2a131e40c05bf 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -210,6 +210,25 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { Status DeleteFile(const std::string& path) override; + /// \brief Move / rename a file or directory. + /// + /// There are no files immediately at the root directory, so paths like + /// "/segment" always refer to a container of the storage account and are + /// treated as directories. + /// + /// If `dest` exists but the operation fails for some reason, `Move` + /// guarantees `dest` is not lost. + /// + /// Conditions for a successful move: + /// 1. `src` must exist. + /// 2. `dest` can't contain a strict path prefix of `src`. More generally, + /// a directory can't be made a subdirectory of itself. + /// 3. If `dest` already exists and it's a file, `src` must also be a file. + /// `dest` is then replaced by `src`. + /// 4. All components of `dest` must exist, except for the last. + /// 5. If `dest` already exists and it's a directory, `src` must also be a + /// directory and `dest` must be empty. `dest` is then replaced by `src` + /// and its contents. 
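For reference, here is a hedged usage sketch of the Move() conditions documented in this header change. The container and blob names are invented, constructing an AzureFileSystem requires real credentials (elided here), and the exact error returned for unsupported cases depends on the storage account configuration.

    #include <iostream>
    #include <memory>

    #include "arrow/filesystem/azurefs.h"
    #include "arrow/status.h"

    arrow::Status MoveExamples(std::shared_ptr<arrow::fs::AzureFileSystem> fs) {
      // Rename a file within one container (src and dest share the container).
      ARROW_RETURN_NOT_OK(fs->Move("my-container/data/old.parquet",
                                   "my-container/data/new.parquet"));
      // Move a directory onto an existing empty directory: dest is replaced (condition 5).
      ARROW_RETURN_NOT_OK(fs->Move("my-container/staging", "my-container/empty-dir"));
      // Moving a path into a different container currently returns NotImplemented.
      auto status = fs->Move("my-container/data", "other-container/data");
      std::cout << status.ToString() << std::endl;
      return arrow::Status::OK();
    }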
Status Move(const std::string& src, const std::string& dest) override; Status CopyFile(const std::string& src, const std::string& dest) override; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 6104b04411b32..42f38f1ed6ac7 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -50,12 +50,15 @@ #include "arrow/filesystem/path_util.h" #include "arrow/filesystem/test_util.h" #include "arrow/result.h" +#include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/util/io_util.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" +#include "arrow/util/pcg_random.h" #include "arrow/util/string.h" +#include "arrow/util/unreachable.h" #include "arrow/util/value_parsing.h" namespace arrow { @@ -335,7 +338,7 @@ TEST(AzureFileSystem, OptionsCompare) { struct PreexistingData { public: - using RNG = std::mt19937_64; + using RNG = random::pcg32_fast; public: const std::string container_name; @@ -354,7 +357,10 @@ culpa qui officia deserunt mollit anim id est laborum. explicit PreexistingData(RNG& rng) : container_name{RandomContainerName(rng)} {} // Creates a path by concatenating the container name and the stem. - std::string ContainerPath(std::string_view stem) const { + std::string ContainerPath(std::string_view stem) const { return Path(stem); } + + // Short alias to ContainerPath() + std::string Path(std::string_view stem) const { return ConcatAbstractPath(container_name, stem); } @@ -391,7 +397,7 @@ culpa qui officia deserunt mollit anim id est laborum. class TestAzureFileSystem : public ::testing::Test { protected: // Set in constructor - std::mt19937_64 rng_; + random::pcg32_fast rng_; // Set in SetUp() int64_t debug_log_start_ = 0; @@ -477,18 +483,41 @@ class TestAzureFileSystem : public ::testing::Test { Blobs::BlobContainerClient CreateContainer(const std::string& name) { auto container_client = blob_service_client_->GetBlobContainerClient(name); - (void)container_client.CreateIfNotExists(); + ARROW_UNUSED(container_client.CreateIfNotExists()); return container_client; } + DataLake::DataLakeFileSystemClient CreateFilesystem(const std::string& name) { + auto adlfs_client = datalake_service_client_->GetFileSystemClient(name); + ARROW_UNUSED(adlfs_client.CreateIfNotExists()); + return adlfs_client; + } + Blobs::BlobClient CreateBlob(Blobs::BlobContainerClient& container_client, const std::string& name, const std::string& data = "") { auto blob_client = container_client.GetBlockBlobClient(name); - (void)blob_client.UploadFrom(reinterpret_cast(data.data()), - data.size()); + ARROW_UNUSED(blob_client.UploadFrom(reinterpret_cast(data.data()), + data.size())); return blob_client; } + DataLake::DataLakeFileClient CreateFile( + DataLake::DataLakeFileSystemClient& filesystem_client, const std::string& name, + const std::string& data = "") { + auto file_client = filesystem_client.GetFileClient(name); + ARROW_UNUSED(file_client.UploadFrom(reinterpret_cast(data.data()), + data.size())); + return file_client; + } + + DataLake::DataLakeDirectoryClient CreateDirectory( + DataLake::DataLakeFileSystemClient& adlfs_client, const std::string& name) { + EXPECT_TRUE(WithHierarchicalNamespace()); + auto dir_client = adlfs_client.GetDirectoryClient(name); + dir_client.Create(); + return dir_client; + } + Blobs::Models::BlobProperties GetBlobProperties(const std::string& container_name, const std::string& blob_name) { return 
blob_service_client_->GetBlobContainerClient(container_name) @@ -507,8 +536,13 @@ class TestAzureFileSystem : public ::testing::Test { PreexistingData SetUpPreexistingData() { PreexistingData data(rng_); - auto container_client = CreateContainer(data.container_name); - CreateBlob(container_client, data.kObjectName, PreexistingData::kLoremIpsum); + if (WithHierarchicalNamespace()) { + auto filesystem_client = CreateFilesystem(data.container_name); + CreateFile(filesystem_client, data.kObjectName, PreexistingData::kLoremIpsum); + } else { + auto container_client = CreateContainer(data.container_name); + CreateBlob(container_client, data.kObjectName, PreexistingData::kLoremIpsum); + } return data; } @@ -664,6 +698,14 @@ class TestAzureFileSystem : public ::testing::Test { AssertFileInfo(fs(), dir1, FileType::Directory); } + void TestCreateDirOnRootWithTrailingSlash() { + auto dir1 = PreexistingData::RandomContainerName(rng_) + "/"; + + AssertFileInfo(fs(), dir1, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(dir1, false)); + AssertFileInfo(fs(), dir1, FileType::Directory); + } + void TestCreateDirOnExistingContainer() { auto data = SetUpPreexistingData(); auto dir1 = data.RandomDirectoryPath(rng_); @@ -724,6 +766,15 @@ class TestAzureFileSystem : public ::testing::Test { AssertFileInfo(fs(), subdir5, FileType::Directory); } + void TestCreateDirOnExistingContainerWithTrailingSlash() { + auto data = SetUpPreexistingData(); + auto dir1 = data.RandomDirectoryPath(rng_) + "/"; + + AssertFileInfo(fs(), dir1, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(dir1, /*recursive=*/false)); + AssertFileInfo(fs(), dir1, FileType::Directory); + } + void TestCreateDirOnMissingContainer() { auto container1 = PreexistingData::RandomContainerName(rng_); auto container2 = PreexistingData::RandomContainerName(rng_); @@ -810,6 +861,21 @@ class TestAzureFileSystem : public ::testing::Test { AssertFileInfo(fs(), blob_path, FileType::NotFound); } + void TestNonEmptyDirWithTrailingSlash() { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } + auto data = SetUpPreexistingData(); + const auto directory_path = data.RandomDirectoryPath(rng_); + const auto blob_path = ConcatAbstractPath(directory_path, "hello.txt"); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(blob_path)); + ASSERT_OK(output->Write("hello")); + ASSERT_OK(output->Close()); + AssertFileInfo(fs(), blob_path, FileType::File); + ASSERT_OK(fs()->DeleteDir(directory_path + "/")); + AssertFileInfo(fs(), blob_path, FileType::NotFound); + } + void TestDeleteDirSuccessHaveDirectory() { if (HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; @@ -839,6 +905,20 @@ class TestAzureFileSystem : public ::testing::Test { } } + void TestDeleteDirContentsSuccessExistWithTrailingSlash() { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } + auto preexisting_data = SetUpPreexistingData(); + HierarchicalPaths paths; + CreateHierarchicalData(&paths); + ASSERT_OK(fs()->DeleteDirContents(paths.directory + "/")); + AssertFileInfo(fs(), paths.directory, FileType::Directory); + for (const auto& sub_path : paths.sub_paths) { + AssertFileInfo(fs(), sub_path, FileType::NotFound); + } + } + void TestDeleteDirContentsSuccessNonexistent() { if (HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; @@ -854,6 +934,393 @@ class TestAzureFileSystem : public ::testing::Test { const auto directory_path = data.RandomDirectoryPath(rng_); ASSERT_RAISES(IOError, fs()->DeleteDirContents(directory_path, false)); } + + 
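The test fixture above now seeds its random container names from random::pcg32_fast (arrow/util/pcg_random.h) instead of std::mt19937_64. The short example below only shows driving a standard distribution with that engine; RandomSuffix and the "container-" naming are invented and are not the format produced by PreexistingData::RandomContainerName().

    #include <iostream>
    #include <random>
    #include <string>

    #include "arrow/util/pcg_random.h"

    // Generate an 8-character lowercase suffix using the pcg32_fast engine.
    std::string RandomSuffix(arrow::random::pcg32_fast& rng) {
      std::uniform_int_distribution<int> dist(0, 25);
      std::string out;
      for (int i = 0; i < 8; ++i) {
        out.push_back(static_cast<char>('a' + dist(rng)));
      }
      return out;
    }

    int main() {
      arrow::random::pcg32_fast rng(std::random_device{}());
      std::cout << "container-" << RandomSuffix(rng) << std::endl;
      return 0;
    }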
private: + using StringMatcher = + ::testing::PolymorphicMatcher<::testing::internal::HasSubstrMatcher>; + + StringMatcher HasDirMoveToSubdirMessage(const std::string& src, + const std::string& dest) { + return ::testing::HasSubstr("Cannot Move to '" + dest + "' and make '" + src + + "' a sub-directory of itself."); + } + + StringMatcher HasCrossContainerNotImplementedMessage(const std::string& container_name, + const std::string& dest) { + return ::testing::HasSubstr("Move of '" + container_name + "' to '" + dest + + "' requires moving data between " + "containers, which is not implemented."); + } + + StringMatcher HasMissingParentDirMessage(const std::string& dest) { + return ::testing::HasSubstr("The parent directory of the destination path '" + dest + + "' does not exist."); + } + + /// \brief Expected POSIX semantics for the rename operation on multiple + /// scenarios. + /// + /// If the src doesn't exist, the error is always ENOENT, otherwise we are + /// left with the following combinations: + /// + /// 1. src's type + /// a. File + /// b. Directory + /// 2. dest's existence + /// a. NotFound + /// b. File + /// c. Directory + /// - empty + /// - non-empty + /// 3. src path has a trailing slash (or not) + /// 4. dest path has a trailing slash (or not) + /// + /// Limitations: this function doesn't consider paths so it assumes that the + /// paths don't lead requests for moves that would make the source a subdir of + /// the destination. + /// + /// \param paths_are_equal src and dest paths without trailing slashes are equal + /// \return std::nullopt if success is expected in the scenario or the errno + /// if failure is expected. + static std::optional RenameSemantics(FileType src_type, bool src_trailing_slash, + FileType dest_type, bool dest_trailing_slash, + bool dest_is_empty_dir = false, + bool paths_are_equal = false) { + DCHECK(src_type != FileType::Unknown && dest_type != FileType::Unknown); + DCHECK(!dest_is_empty_dir || dest_type == FileType::Directory) + << "dest_is_empty_dir must imply dest_type == FileType::Directory"; + switch (src_type) { + case FileType::Unknown: + break; + case FileType::NotFound: + return {ENOENT}; + case FileType::File: + switch (dest_type) { + case FileType::Unknown: + break; + case FileType::NotFound: + if (src_trailing_slash) { + return {ENOTDIR}; + } + if (dest_trailing_slash) { + // A slash on the destination path requires that it exists, + // so a confirmation that it's a directory can be performed. + return {ENOENT}; + } + return {}; + case FileType::File: + if (src_trailing_slash || dest_trailing_slash) { + return {ENOTDIR}; + } + // The existing file is replaced successfuly. + return {}; + case FileType::Directory: + if (src_trailing_slash) { + return {ENOTDIR}; + } + return EISDIR; + } + break; + case FileType::Directory: + switch (dest_type) { + case FileType::Unknown: + break; + case FileType::NotFound: + // We don't have to care about the slashes when the source is a directory. 
+ return {}; + case FileType::File: + return {ENOTDIR}; + case FileType::Directory: + if (!paths_are_equal && !dest_is_empty_dir) { + return {ENOTEMPTY}; + } + return {}; + } + break; + } + Unreachable("Invalid parameters passed to RenameSemantics"); + } + + Status CheckExpectedErrno(const std::string& src, const std::string& dest, + std::optional expected_errno, + const char* expected_errno_name, FileInfo* out_src_info) { + auto the_fs = fs(); + const bool src_trailing_slash = internal::HasTrailingSlash(src); + const bool dest_trailing_slash = internal::HasTrailingSlash(dest); + const auto src_path = std::string{internal::RemoveTrailingSlash(src)}; + const auto dest_path = std::string{internal::RemoveTrailingSlash(dest)}; + ARROW_ASSIGN_OR_RAISE(*out_src_info, the_fs->GetFileInfo(src_path)); + ARROW_ASSIGN_OR_RAISE(auto dest_info, the_fs->GetFileInfo(dest_path)); + bool dest_is_empty_dir = false; + if (dest_info.type() == FileType::Directory) { + FileSelector select; + select.base_dir = dest_path; + select.recursive = false; + // TODO(ARROW-40014): investigate why this can't be false here + select.allow_not_found = true; + ARROW_ASSIGN_OR_RAISE(auto dest_contents, the_fs->GetFileInfo(select)); + if (dest_contents.empty()) { + dest_is_empty_dir = true; + } + } + auto paths_are_equal = src_path == dest_path; + auto truly_expected_errno = + RenameSemantics(out_src_info->type(), src_trailing_slash, dest_info.type(), + dest_trailing_slash, dest_is_empty_dir, paths_are_equal); + if (truly_expected_errno != expected_errno) { + if (expected_errno.has_value()) { + return Status::Invalid("expected_errno=", expected_errno_name, "=", + *expected_errno, + " used in ASSERT_MOVE is incorrect. " + "POSIX semantics for this scenario require errno=", + strerror(truly_expected_errno.value_or(0))); + } else { + DCHECK(truly_expected_errno.has_value()); + return Status::Invalid( + "ASSERT_MOVE used to assert success in a scenario for which " + "POSIX semantics requires errno=", + strerror(*truly_expected_errno)); + } + } + return Status::OK(); + } + + void AssertAfterMove(const std::string& src, const std::string& dest, FileType type) { + if (internal::RemoveTrailingSlash(src) != internal::RemoveTrailingSlash(dest)) { + AssertFileInfo(fs(), src, FileType::NotFound); + } + AssertFileInfo(fs(), dest, type); + } + + static bool WithErrno(const Status& status, int expected_errno) { + auto* detail = status.detail().get(); + return detail && + arrow::internal::ErrnoFromStatusDetail(*detail).value_or(-1) == expected_errno; + } + + std::optional MoveErrorMessageMatcher(const FileInfo& src_info, + const std::string& src, + const std::string& dest, + int for_errno) { + switch (for_errno) { + case ENOENT: { + auto& path = src_info.type() == FileType::NotFound ? src : dest; + return ::testing::HasSubstr("Path does not exist '" + path + "'"); + } + case ENOTEMPTY: + return ::testing::HasSubstr("Directory not empty: '" + dest + "'"); + } + return std::nullopt; + } + +#define ASSERT_MOVE(src, dest, expected_errno) \ + do { \ + auto _src = (src); \ + auto _dest = (dest); \ + std::optional _expected_errno = (expected_errno); \ + FileInfo _src_info; \ + ASSERT_OK( \ + CheckExpectedErrno(_src, _dest, _expected_errno, #expected_errno, &_src_info)); \ + auto _move_st = ::arrow::internal::GenericToStatus(fs()->Move(_src, _dest)); \ + if (_expected_errno.has_value()) { \ + if (WithErrno(_move_st, *_expected_errno)) { \ + /* If the Move failed, the source should remain unchanged. 
*/ \ + AssertFileInfo(fs(), std::string{internal::RemoveTrailingSlash(_src)}, \ + _src_info.type()); \ + auto _message_matcher = \ + MoveErrorMessageMatcher(_src_info, _src, _dest, *_expected_errno); \ + if (_message_matcher.has_value()) { \ + EXPECT_RAISES_WITH_MESSAGE_THAT(IOError, *_message_matcher, _move_st); \ + } else { \ + SUCCEED(); \ + } \ + } else { \ + FAIL() << "Move '" ARROW_STRINGIFY(src) "' to '" ARROW_STRINGIFY(dest) \ + << "' did not fail with errno=" << #expected_errno; \ + } \ + } else { \ + if (!_move_st.ok()) { \ + FAIL() << "Move '" ARROW_STRINGIFY(src) "' to '" ARROW_STRINGIFY(dest) \ + << "' failed with " << _move_st.ToString(); \ + } else { \ + AssertAfterMove(_src, _dest, _src_info.type()); \ + } \ + } \ + } while (false) + +#define ASSERT_MOVE_OK(src, dest) ASSERT_MOVE((src), (dest), std::nullopt) + + // Tests for Move() + + public: + void TestRenameContainer() { + EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + auto data = SetUpPreexistingData(); + // Container exists, so renaming to the same name succeeds because it's a no-op. + ASSERT_MOVE_OK(data.container_name, data.container_name); + // Renaming a container that doesn't exist fails. + ASSERT_MOVE("missing-container", "missing-container", ENOENT); + ASSERT_MOVE("missing-container", data.container_name, ENOENT); + // Renaming a container to an existing non-empty container fails. + auto non_empty_container = PreexistingData::RandomContainerName(rng_); + auto non_empty_container_client = CreateContainer(non_empty_container); + CreateBlob(non_empty_container_client, "object1", PreexistingData::kLoremIpsum); + ASSERT_MOVE(data.container_name, non_empty_container, ENOTEMPTY); + // Renaming to an empty container fails to replace it + auto empty_container = PreexistingData::RandomContainerName(rng_); + auto empty_container_client = CreateContainer(empty_container); + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, + ::testing::HasSubstr("Unable to replace empty container: '" + empty_container + + "'"), + fs()->Move(data.container_name, empty_container)); + // Renaming to a non-existing container creates it + auto missing_container = PreexistingData::RandomContainerName(rng_); + AssertFileInfo(fs(), missing_container, FileType::NotFound); + if (env->backend() == AzureBackend::kAzurite) { + // Azurite returns a 201 Created for RenameBlobContainer, but the created + // container doesn't contain the blobs from the source container and + // the source container remains undeleted after the "rename". 
+ } else { + // See Azure SDK issue/question: + // https://github.com/Azure/azure-sdk-for-cpp/issues/5262 + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, + ::testing::HasSubstr("The 'rename' operation is not supported on containers."), + fs()->Move(data.container_name, missing_container)); + // ASSERT_MOVE_OK(data.container_name, missing_container); + // AssertFileInfo(fs(), + // ConcatAbstractPath(missing_container, + // PreexistingData::kObjectName), FileType::File); + } + // Renaming to an empty container can work if the source is also empty + auto new_empty_container = PreexistingData::RandomContainerName(rng_); + auto new_empty_container_client = CreateContainer(new_empty_container); + ASSERT_MOVE_OK(empty_container, new_empty_container); + } + + void TestMoveContainerToPath() { + auto data = SetUpPreexistingData(); + ASSERT_MOVE("missing-container", data.ContainerPath("new-subdir"), ENOENT); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + HasDirMoveToSubdirMessage(data.container_name, data.ContainerPath("new-subdir")), + fs()->Move(data.container_name, data.ContainerPath("new-subdir"))); + EXPECT_RAISES_WITH_MESSAGE_THAT( + NotImplemented, + HasCrossContainerNotImplementedMessage(data.container_name, + "missing-container/new-subdir"), + fs()->Move(data.container_name, "missing-container/new-subdir")); + } + + void TestCreateContainerFromPath() { + auto data = SetUpPreexistingData(); + auto missing_path = data.RandomDirectoryPath(rng_); + ASSERT_MOVE(missing_path, "new-container", ENOENT); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + ::testing::HasSubstr("Creating files at '/' is not possible, only directories."), + fs()->Move(data.ObjectPath(), "new-file")); + auto src_dir_path = data.RandomDirectoryPath(rng_); + ASSERT_OK(fs()->CreateDir(src_dir_path, false)); + EXPECT_OK_AND_ASSIGN(auto src_dir_info, fs()->GetFileInfo(src_dir_path)); + EXPECT_EQ(src_dir_info.type(), FileType::Directory); + EXPECT_RAISES_WITH_MESSAGE_THAT( + NotImplemented, + HasCrossContainerNotImplementedMessage(src_dir_path, "new-container"), + fs()->Move(src_dir_path, "new-container")); + } + + void TestMovePath() { + Status st; + auto data = SetUpPreexistingData(); + // When source doesn't exist. + ASSERT_MOVE("missing-container/src-path", data.ContainerPath("dest-path"), ENOENT); + auto missing_path1 = data.RandomDirectoryPath(rng_); + ASSERT_MOVE(missing_path1, "missing-container/path", ENOENT); + + // But when source exists... + if (!WithHierarchicalNamespace()) { + // ...and containers are different, we get an error message telling cross-container + // moves are not implemented. + EXPECT_RAISES_WITH_MESSAGE_THAT( + NotImplemented, + HasCrossContainerNotImplementedMessage(data.ObjectPath(), + "missing-container/path"), + fs()->Move(data.ObjectPath(), "missing-container/path")); + GTEST_SKIP() << "The rest of TestMovePath is not implemented for non-HNS scenarios"; + } + auto adlfs_client = + datalake_service_client_->GetFileSystemClient(data.container_name); + // ...and dest.container doesn't exist. 
+ EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, HasMissingParentDirMessage("missing-container/path"), + fs()->Move(data.ObjectPath(), "missing-container/path")); + AssertFileInfo(fs(), data.ObjectPath(), FileType::File); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, HasMissingParentDirMessage(data.Path("missing-subdir/file")), + fs()->Move(data.ObjectPath(), data.Path("missing-subdir/file"))); + AssertFileInfo(fs(), data.ObjectPath(), FileType::File); + + // src is a file and dest does not exists + ASSERT_MOVE_OK(data.ObjectPath(), data.Path("file0")); + ASSERT_MOVE(data.Path("file0/"), data.Path("file1"), ENOTDIR); + ASSERT_MOVE(data.Path("file0"), data.Path("file1/"), ENOENT); + ASSERT_MOVE(data.Path("file0/"), data.Path("file1/"), ENOTDIR); + // "file0" exists + + // src is a file and dest exists (as a file) + CreateFile(adlfs_client, PreexistingData::kObjectName, PreexistingData::kLoremIpsum); + CreateFile(adlfs_client, "file1", PreexistingData::kLoremIpsum); + ASSERT_MOVE_OK(data.ObjectPath(), data.Path("file0")); + ASSERT_MOVE(data.Path("file1/"), data.Path("file0"), ENOTDIR); + ASSERT_MOVE(data.Path("file1"), data.Path("file0/"), ENOTDIR); + ASSERT_MOVE(data.Path("file1/"), data.Path("file0/"), ENOTDIR); + // "file0" and "file1" exist + + // src is a file and dest exists (as an empty dir) + CreateDirectory(adlfs_client, "subdir0"); + ASSERT_MOVE(data.Path("file0"), data.Path("subdir0"), EISDIR); + ASSERT_MOVE(data.Path("file0/"), data.Path("subdir0"), ENOTDIR); + ASSERT_MOVE(data.Path("file0"), data.Path("subdir0/"), EISDIR); + ASSERT_MOVE(data.Path("file0/"), data.Path("subdir0/"), ENOTDIR); + + // src is a file and dest exists (as a non-empty dir) + CreateFile(adlfs_client, "subdir0/file-at-subdir"); + ASSERT_MOVE(data.Path("file0"), data.Path("subdir0"), EISDIR); + ASSERT_MOVE(data.Path("file0/"), data.Path("subdir0"), ENOTDIR); + ASSERT_MOVE(data.Path("file0"), data.Path("subdir0/"), EISDIR); + ASSERT_MOVE(data.Path("file0/"), data.Path("subdir0/"), ENOTDIR); + // "subdir0/file-at-subdir" exists + + // src is a directory and dest does not exists + ASSERT_MOVE_OK(data.Path("subdir0"), data.Path("subdir1")); + ASSERT_MOVE_OK(data.Path("subdir1/"), data.Path("subdir2")); + ASSERT_MOVE_OK(data.Path("subdir2"), data.Path("subdir3/")); + ASSERT_MOVE_OK(data.Path("subdir3/"), data.Path("subdir4/")); + AssertFileInfo(fs(), data.Path("subdir4/file-at-subdir"), FileType::File); + // "subdir4/file-at-subdir" exists + + // src is a directory and dest exists as an empty directory + CreateDirectory(adlfs_client, "subdir0"); + CreateDirectory(adlfs_client, "subdir1"); + CreateDirectory(adlfs_client, "subdir2"); + CreateDirectory(adlfs_client, "subdir3"); + ASSERT_MOVE_OK(data.Path("subdir4"), data.Path("subdir0")); + ASSERT_MOVE_OK(data.Path("subdir0/"), data.Path("subdir1")); + ASSERT_MOVE_OK(data.Path("subdir1"), data.Path("subdir2/")); + ASSERT_MOVE_OK(data.Path("subdir2/"), data.Path("subdir3/")); + AssertFileInfo(fs(), data.Path("subdir3/file-at-subdir"), FileType::File); + // "subdir3/file-at-subdir" exists + + // src is directory and dest exists as a non-empty directory + CreateDirectory(adlfs_client, "subdir0"); + ASSERT_MOVE(data.Path("subdir0"), data.Path("subdir3"), ENOTEMPTY); + ASSERT_MOVE(data.Path("subdir0/"), data.Path("subdir3"), ENOTEMPTY); + ASSERT_MOVE(data.Path("subdir0"), data.Path("subdir3/"), ENOTEMPTY); + ASSERT_MOVE(data.Path("subdir0/"), data.Path("subdir3/"), ENOTEMPTY); + } }; void TestAzureFileSystem::TestDetectHierarchicalNamespace(bool trip_up_azurite) { @@ 
-940,10 +1407,9 @@ void TestAzureFileSystem::TestGetFileInfoObjectWithNestedStructure() { FileType::NotFound); if (WithHierarchicalNamespace()) { - datalake_service_client_->GetFileSystemClient(data.container_name) - .GetDirectoryClient("test-empty-object-dir") - .Create(); - + auto adlfs_client = + datalake_service_client_->GetFileSystemClient(data.container_name); + CreateDirectory(adlfs_client, "test-empty-object-dir"); AssertFileInfo(fs(), data.ContainerPath("test-empty-object-dir"), FileType::Directory); } @@ -1046,6 +1512,10 @@ TYPED_TEST(TestAzureFileSystemOnAllEnvs, CreateDirWithEmptyPath) { TYPED_TEST(TestAzureFileSystemOnAllEnvs, CreateDirOnRoot) { this->TestCreateDirOnRoot(); } +TYPED_TEST(TestAzureFileSystemOnAllEnvs, CreateDirOnRootWithTrailingSlash) { + this->TestCreateDirOnRootWithTrailingSlash(); +} + // Tests using all the 3 environments (Azurite, Azure w/o HNS (flat), Azure w/ HNS) // combined with the two scenarios for AzureFileSystem::cached_hns_support_ -- unknown and // known according to the environment. @@ -1076,6 +1546,11 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirOnExistingContainer) { this->TestCreateDirOnExistingContainer(); } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, + CreateDirOnExistingContainerWithTrailingSlash) { + this->TestCreateDirOnExistingContainerWithTrailingSlash(); +} + TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirOnMissingContainer) { this->TestCreateDirOnMissingContainer(); } @@ -1092,6 +1567,10 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessHaveBlob) { this->TestDeleteDirSuccessHaveBlob(); } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, NonEmptyDirWithTrailingSlash) { + this->TestNonEmptyDirWithTrailingSlash(); +} + TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessHaveDirectory) { this->TestDeleteDirSuccessHaveDirectory(); } @@ -1100,6 +1579,11 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirContentsSuccessExist) { this->TestDeleteDirContentsSuccessExist(); } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, + DeleteDirContentsSuccessExistWithTrailingSlash) { + this->TestDeleteDirContentsSuccessExistWithTrailingSlash(); +} + TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirContentsSuccessNonexistent) { this->TestDeleteDirContentsSuccessNonexistent(); } @@ -1108,6 +1592,20 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirContentsFailureNonexisten this->TestDeleteDirContentsFailureNonexistent(); } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, RenameContainer) { + this->TestRenameContainer(); +} + +TYPED_TEST(TestAzureFileSystemOnAllScenarios, MoveContainerToPath) { + this->TestMoveContainerToPath(); +} + +TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateContainerFromPath) { + this->TestCreateContainerFromPath(); +} + +TYPED_TEST(TestAzureFileSystemOnAllScenarios, MovePath) { this->TestMovePath(); } + // Tests using Azurite (the local Azure emulator) TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { @@ -1382,6 +1880,57 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsFailureNonexistent) { this->TestDeleteDirContentsFailureNonexistent(); } +TEST_F(TestAzuriteFileSystem, DeleteFileSuccess) { + const auto container_name = PreexistingData::RandomContainerName(rng_); + const auto file_name = ConcatAbstractPath(container_name, "filename"); + if (WithHierarchicalNamespace()) { + auto adlfs_client = CreateFilesystem(container_name); + CreateFile(adlfs_client, "filename", "data"); + } else { + auto container = CreateContainer(container_name); + CreateBlob(container, 
"filename", "data"); + } + arrow::fs::AssertFileInfo(fs(), file_name, FileType::File); + ASSERT_OK(fs()->DeleteFile(file_name)); + arrow::fs::AssertFileInfo(fs(), file_name, FileType::NotFound); +} + +TEST_F(TestAzuriteFileSystem, DeleteFileFailureNonexistent) { + const auto container_name = PreexistingData::RandomContainerName(rng_); + const auto nonexistent_file_name = ConcatAbstractPath(container_name, "nonexistent"); + if (WithHierarchicalNamespace()) { + ARROW_UNUSED(CreateFilesystem(container_name)); + } else { + ARROW_UNUSED(CreateContainer(container_name)); + } + ASSERT_RAISES(IOError, fs()->DeleteFile(nonexistent_file_name)); +} + +TEST_F(TestAzuriteFileSystem, DeleteFileFailureContainer) { + const auto container_name = PreexistingData::RandomContainerName(rng_); + if (WithHierarchicalNamespace()) { + ARROW_UNUSED(CreateFilesystem(container_name)); + } else { + ARROW_UNUSED(CreateContainer(container_name)); + } + arrow::fs::AssertFileInfo(fs(), container_name, FileType::Directory); + ASSERT_RAISES(IOError, fs()->DeleteFile(container_name)); +} + +TEST_F(TestAzuriteFileSystem, DeleteFileFailureDirectory) { + auto container_name = PreexistingData::RandomContainerName(rng_); + if (WithHierarchicalNamespace()) { + auto adlfs_client = CreateFilesystem(container_name); + CreateDirectory(adlfs_client, "directory"); + } else { + auto container = CreateContainer(container_name); + CreateBlob(container, "directory/"); + } + auto directory_path = ConcatAbstractPath(container_name, "directory"); + arrow::fs::AssertFileInfo(fs(), directory_path, FileType::Directory); + ASSERT_RAISES(IOError, fs()->DeleteFile(directory_path)); +} + TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationNonexistent) { auto data = SetUpPreexistingData(); const auto destination_path = data.ContainerPath("copy-destionation"); @@ -1510,7 +2059,7 @@ std::shared_ptr NormalizerKeyValueMetadata( auto value = metadata->value(i); if (key == "Content-Hash") { std::vector output; - output.reserve(value.size() / 2); + output.resize(value.size() / 2); if (ParseHexValues(value, output.data()).ok()) { // Valid value value = std::string(value.size(), 'F'); @@ -1868,6 +2417,5 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileClosed) { ASSERT_RAISES(Invalid, stream->ReadAt(1, 1)); ASSERT_RAISES(Invalid, stream->Seek(2)); } - } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/filesystem/util_internal.cc b/cpp/src/arrow/filesystem/util_internal.cc index 1ca5af27fc895..8747f9683b90f 100644 --- a/cpp/src/arrow/filesystem/util_internal.cc +++ b/cpp/src/arrow/filesystem/util_internal.cc @@ -17,6 +17,7 @@ #include "arrow/filesystem/util_internal.h" +#include #include #include "arrow/buffer.h" @@ -63,11 +64,21 @@ Status PathNotFound(std::string_view path) { .WithDetail(StatusDetailFromErrno(ENOENT)); } +Status IsADir(std::string_view path) { + return Status::IOError("Is a directory: '", path, "'") + .WithDetail(StatusDetailFromErrno(EISDIR)); +} + Status NotADir(std::string_view path) { return Status::IOError("Not a directory: '", path, "'") .WithDetail(StatusDetailFromErrno(ENOTDIR)); } +Status NotEmpty(std::string_view path) { + return Status::IOError("Directory not empty: '", path, "'") + .WithDetail(StatusDetailFromErrno(ENOTEMPTY)); +} + Status NotAFile(std::string_view path) { return Status::IOError("Not a regular file: '", path, "'"); } diff --git a/cpp/src/arrow/filesystem/util_internal.h b/cpp/src/arrow/filesystem/util_internal.h index 29a51512d0aa2..96cc5178a9f31 100644 --- a/cpp/src/arrow/filesystem/util_internal.h 
+++ b/cpp/src/arrow/filesystem/util_internal.h @@ -43,9 +43,15 @@ Status CopyStream(const std::shared_ptr& src, ARROW_EXPORT Status PathNotFound(std::string_view path); +ARROW_EXPORT +Status IsADir(std::string_view path); + ARROW_EXPORT Status NotADir(std::string_view path); +ARROW_EXPORT +Status NotEmpty(std::string_view path); + ARROW_EXPORT Status NotAFile(std::string_view path); diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index 843329c17bc28..d58c203d2ae27 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -195,7 +195,7 @@ bool IsDebugEnabled() { return false; } auto env_value = *std::move(maybe_env_value); - if (env_value.empty()) { + if (env_value.empty() || env_value == "none") { return false; } auto debug_state = DebugState::Instance(); @@ -212,7 +212,7 @@ bool IsDebugEnabled() { return true; } ARROW_LOG(WARNING) << "Invalid value for " << kDebugMemoryEnvVar << ": '" << env_value - << "'. Valid values are 'abort', 'trap', 'warn'."; + << "'. Valid values are 'abort', 'trap', 'warn', 'none'."; return false; }(); diff --git a/cpp/src/arrow/util/io_util.cc b/cpp/src/arrow/util/io_util.cc index 751ef28d415e0..b693336e09921 100644 --- a/cpp/src/arrow/util/io_util.cc +++ b/cpp/src/arrow/util/io_util.cc @@ -449,6 +449,13 @@ std::shared_ptr StatusDetailFromErrno(int errnum) { return std::make_shared(errnum); } +std::optional ErrnoFromStatusDetail(const StatusDetail& detail) { + if (detail.type_id() == kErrnoDetailTypeId) { + return checked_cast(detail).errnum(); + } + return std::nullopt; +} + #if _WIN32 std::shared_ptr StatusDetailFromWinError(int errnum) { if (!errnum) { diff --git a/cpp/src/arrow/util/io_util.h b/cpp/src/arrow/util/io_util.h index 113b1bdd93103..bba71c0d80ad4 100644 --- a/cpp/src/arrow/util/io_util.h +++ b/cpp/src/arrow/util/io_util.h @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -264,6 +265,8 @@ std::string WinErrorMessage(int errnum); ARROW_EXPORT std::shared_ptr StatusDetailFromErrno(int errnum); +ARROW_EXPORT +std::optional ErrnoFromStatusDetail(const StatusDetail& detail); #if _WIN32 ARROW_EXPORT std::shared_ptr StatusDetailFromWinError(int errnum); diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index d773fb5ff5895..9352ac5c4a938 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -243,6 +243,7 @@ endfunction() add_gandiva_test(internals-test SOURCES bitmap_accumulator_test.cc + cache_test.cc engine_llvm_test.cc function_registry_test.cc function_signature_test.cc diff --git a/cpp/src/gandiva/cache.cc b/cpp/src/gandiva/cache.cc index f7e3e5e9f8f1f..a1333ccdc5d43 100644 --- a/cpp/src/gandiva/cache.cc +++ b/cpp/src/gandiva/cache.cc @@ -23,11 +23,7 @@ namespace gandiva { -#ifdef GANDIVA_ENABLE_OBJECT_CODE_CACHE -static const size_t DEFAULT_CACHE_SIZE = 500000; -#else -static const size_t DEFAULT_CACHE_SIZE = 500; -#endif +static const size_t DEFAULT_CACHE_SIZE = 5000; int GetCapacity() { size_t capacity = DEFAULT_CACHE_SIZE; diff --git a/cpp/src/gandiva/cache_test.cc b/cpp/src/gandiva/cache_test.cc new file mode 100644 index 0000000000000..a146707079fa6 --- /dev/null +++ b/cpp/src/gandiva/cache_test.cc @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
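The helpers added in the hunks above give filesystem code a uniform way to attach POSIX-style error codes to an IOError (IsADir/EISDIR, NotADir/ENOTDIR, NotEmpty/ENOTEMPTY), and the new ErrnoFromStatusDetail() lets callers recover that code from a Status. Below is a minimal round-trip sketch, assuming the arrow/util/io_util.h header and the arrow::internal namespace; the include targets were stripped from this diff, so treat those names as assumptions rather than quotes from the patch.

    #include <cerrno>
    #include <iostream>

    #include "arrow/status.h"
    #include "arrow/util/io_util.h"

    int main() {
      using arrow::internal::ErrnoFromStatusDetail;
      using arrow::internal::StatusDetailFromErrno;

      // Build an IOError carrying ENOTEMPTY, the way the new NotEmpty() helper does.
      arrow::Status st = arrow::Status::IOError("Directory not empty: 'container/dir'")
                             .WithDetail(StatusDetailFromErrno(ENOTEMPTY));

      // Recover the errno from the status detail (returns std::optional<int>).
      if (st.detail()) {
        if (auto errnum = ErrnoFromStatusDetail(*st.detail())) {
          std::cout << "errno attached to status: " << *errnum << std::endl;
        }
      }
      return 0;
    }

This is what the ASSERT_MOVE expectations in the Azure Move() tests earlier in this diff check: the IOError raised by Move() carries ENOENT, ENOTDIR, EISDIR, or ENOTEMPTY depending on the shape of src and dest.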
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/cache.h" + +#include + +namespace gandiva { +class TestCacheKey { + public: + explicit TestCacheKey(int value) : value_(value) {} + std::size_t Hash() const { return value_; } + bool operator==(const TestCacheKey& other) const { return value_ == other.value_; } + + private: + int value_; +}; + +TEST(TestCache, TestGetPut) { + Cache cache(2); + cache.PutObjectCode(TestCacheKey(1), "hello"); + cache.PutObjectCode(TestCacheKey(2), "world"); + ASSERT_EQ(cache.GetObjectCode(TestCacheKey(1)), "hello"); + ASSERT_EQ(cache.GetObjectCode(TestCacheKey(2)), "world"); +} + +TEST(TestCache, TestGetCacheCapacity) { ASSERT_EQ(GetCapacity(), 5000); } +} // namespace gandiva diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index dd3f5da84f777..18bb6c9b6e09c 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -66,8 +66,8 @@ ARROW_CARES_BUILD_VERSION=1.17.2 ARROW_CARES_BUILD_SHA256_CHECKSUM=4803c844ce20ce510ef0eb83f8ea41fa24ecaae9d280c468c582d2bb25b3913d ARROW_CRC32C_BUILD_VERSION=1.1.2 ARROW_CRC32C_BUILD_SHA256_CHECKSUM=ac07840513072b7fcebda6e821068aa04889018f24e10e46181068fb214d7e56 -ARROW_GBENCHMARK_BUILD_VERSION=v1.7.1 -ARROW_GBENCHMARK_BUILD_SHA256_CHECKSUM=6430e4092653380d9dc4ccb45a1e2dc9259d581f4866dc0759713126056bc1d7 +ARROW_GBENCHMARK_BUILD_VERSION=v1.8.3 +ARROW_GBENCHMARK_BUILD_SHA256_CHECKSUM=6bc180a57d23d4d9515519f92b0c83d61b05b5bab188961f36ac7b06b0d9e9ce ARROW_GFLAGS_BUILD_VERSION=v2.2.2 ARROW_GFLAGS_BUILD_SHA256_CHECKSUM=34af2f15cf7367513b352bdcd2493ab14ce43692d2dcd9dfc499492966c64dcf ARROW_GLOG_BUILD_VERSION=v0.5.0 diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj index fded62911262c..6988567193db4 100644 --- a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -1,10 +1,16 @@ - netstandard2.0 Provides decompression support for the Arrow IPC format + + netstandard2.0;net462 + + + netstandard2.0 + + diff --git a/csharp/src/Apache.Arrow/Apache.Arrow.csproj b/csharp/src/Apache.Arrow/Apache.Arrow.csproj index 3a229f4ffcaf8..c4bb64b73a9ed 100644 --- a/csharp/src/Apache.Arrow/Apache.Arrow.csproj +++ b/csharp/src/Apache.Arrow/Apache.Arrow.csproj @@ -1,14 +1,20 @@ - netstandard2.0;net6.0 true $(DefineConstants);UNSAFE_BYTEBUFFER;BYTEBUFFER_NO_BOUNDS_CHECK;ENABLE_SPAN_T Apache Arrow is a cross-language development platform for in-memory data. It specifies a standardized language-independent columnar memory format for flat and hierarchical data, organized for efficient analytic operations on modern hardware. 
- + + netstandard2.0;net6.0;net462 + + + netstandard2.0;net6.0 + + + @@ -34,7 +40,7 @@ - + diff --git a/csharp/src/Apache.Arrow/Extensions/TupleExtensions.netstandard.cs b/csharp/src/Apache.Arrow/Extensions/TupleExtensions.netstandard.cs index fe42075f14f73..e0e0f5707086b 100644 --- a/csharp/src/Apache.Arrow/Extensions/TupleExtensions.netstandard.cs +++ b/csharp/src/Apache.Arrow/Extensions/TupleExtensions.netstandard.cs @@ -25,5 +25,12 @@ public static void Deconstruct(this Tuple value, out T1 item1, o item1 = value.Item1; item2 = value.Item2; } + + public static void Deconstruct(this Tuple value, out T1 item1, out T2 item2, out T3 item3) + { + item1 = value.Item1; + item2 = value.Item2; + item3 = value.Item3; + } } } diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index f5e2a0ef8e16e..dd27e790d9d4f 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index b5e7170a8c31d..3d7b415599907 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -6,7 +6,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index a7c52846fd9a4..3038376327e70 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -6,7 +6,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index d8a92ff756751..48f5747b790b3 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -7,14 +7,14 @@ - net7.0;net472 + net7.0;net472;net462 net7.0 - + all diff --git a/csharp/test/Apache.Arrow.Tests/BinaryArrayBuilderTests.cs b/csharp/test/Apache.Arrow.Tests/BinaryArrayBuilderTests.cs index 4c2b050d0c8ba..447572dda0eea 100644 --- a/csharp/test/Apache.Arrow.Tests/BinaryArrayBuilderTests.cs +++ b/csharp/test/Apache.Arrow.Tests/BinaryArrayBuilderTests.cs @@ -83,7 +83,7 @@ public void AppendSingleByte(byte[][] initialContents, byte singleByte) builder.AppendRange(initialContents); int initialLength = builder.Length; int expectedLength = initialLength + 1; - var expectedArrayContents = initialContents.Append(new[] { singleByte }); + var expectedArrayContents = initialContents.Concat(new[] { new[] { singleByte } }); // Act var actualReturnValue = builder.Append(singleByte); @@ -130,7 +130,7 @@ public void AppendNull(byte[][] initialContents) builder.AppendRange(initialContents); int initialLength = builder.Length; int expectedLength = initialLength + 1; - var expectedArrayContents = initialContents.Append(null); + var expectedArrayContents = initialContents.Concat(new byte[][] { null }); // Act var actualReturnValue = builder.AppendNull(); @@ -180,7 +180,7 @@ public void AppendReadOnlySpan(byte[][] initialContents, byte[] bytes) int initialLength = builder.Length; var span = (ReadOnlySpan)bytes; int 
expectedLength = initialLength + 1; - var expectedArrayContents = initialContents.Append(bytes); + var expectedArrayContents = initialContents.Concat(new[] { bytes }); // Act var actualReturnValue = builder.Append(span); @@ -230,7 +230,7 @@ public void AppendEnumerable(byte[][] initialContents, byte[] bytes) int initialLength = builder.Length; int expectedLength = initialLength + 1; var enumerable = (IEnumerable)bytes; - var expectedArrayContents = initialContents.Append(bytes); + var expectedArrayContents = initialContents.Concat(new[] { bytes }); // Act var actualReturnValue = builder.Append(enumerable); diff --git a/dev/archery/archery/bot.py b/dev/archery/archery/bot.py index 4e5104362254c..2b87b9386877f 100644 --- a/dev/archery/archery/bot.py +++ b/dev/archery/archery/bot.py @@ -324,7 +324,7 @@ def crossbow(obj, crossbow): obj['crossbow_repo'] = crossbow -def _clone_arrow_and_crossbow(dest, crossbow_repo, pull_request): +def _clone_arrow_and_crossbow(dest, crossbow_repo, arrow_repo_url, pr_number): """ Clone the repositories and initialize crossbow objects. @@ -334,26 +334,31 @@ def _clone_arrow_and_crossbow(dest, crossbow_repo, pull_request): Filesystem path to clone the repositories to. crossbow_repo : str GitHub repository name, like kszucs/crossbow. - pull_request : pygithub.PullRequest - Object containing information about the pull request the comment bot - was triggered from. + arrow_repo_url : str + Target Apache Arrow repository's clone URL, such as + "https://github.com/apache/arrow.git". + pr_number : int + Target PR number. """ arrow_path = dest / 'arrow' queue_path = dest / 'crossbow' - # clone arrow and checkout the pull request's branch - pull_request_ref = 'pull/{}/head:{}'.format( - pull_request.number, pull_request.head.ref - ) - git.clone(pull_request.base.repo.clone_url, str(arrow_path)) - git.fetch('origin', pull_request_ref, git_dir=arrow_path) - git.checkout(pull_request.head.ref, git_dir=arrow_path) - - # clone crossbow repository + # we use unique branch name instead of fork's branch name to avoid + # branch name conflict such as 'main' (GH-39996) + local_branch = f'archery/pr-{pr_number}' + # 1. clone arrow and checkout the PR's branch + pr_ref = f'pull/{pr_number}/head:{local_branch}' + git.clone('--no-checkout', arrow_repo_url, str(arrow_path)) + # fetch the PR's branch into the clone + git.fetch('origin', pr_ref, git_dir=arrow_path) + # checkout the PR's branch into the clone + git.checkout(local_branch, git_dir=arrow_path) + + # 2. clone crossbow repository crossbow_url = 'https://github.com/{}'.format(crossbow_repo) git.clone(crossbow_url, str(queue_path)) - # initialize crossbow objects + # 3. 
initialize crossbow objects github_token = os.environ['CROSSBOW_GITHUB_TOKEN'] arrow = Repo(arrow_path) queue = Queue(queue_path, github_token=github_token, require_https=True) @@ -385,7 +390,8 @@ def submit(obj, tasks, groups, params, arrow_version, wait): arrow, queue = _clone_arrow_and_crossbow( dest=Path(tmpdir), crossbow_repo=crossbow_repo, - pull_request=pull_request, + arrow_repo_url=pull_request.base.repo.clone_url, + pr_number=pull_request.number, ) # load available tasks configuration and groups from yaml config = Config.load_yaml(arrow.path / "dev" / "tasks" / "tasks.yml") diff --git a/dev/release/binary-task.rb b/dev/release/binary-task.rb index 0c1b98ab32c95..8fcdcf1f5f442 100644 --- a/dev/release/binary-task.rb +++ b/dev/release/binary-task.rb @@ -1088,7 +1088,6 @@ def available_apt_targets ["debian", "trixie", "main"], ["ubuntu", "focal", "main"], ["ubuntu", "jammy", "main"], - ["ubuntu", "mantic", "main"], ["ubuntu", "noble", "main"], ] end @@ -2122,8 +2121,6 @@ def apt_test_targets_default # "ubuntu-focal-arm64", "ubuntu-jammy", # "ubuntu-jammy-arm64", - "ubuntu-mantic", - # "ubuntu-mantic-arm64", "ubuntu-noble", # "ubuntu-noble-arm64", ] diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index a61b5ba094c8a..08b3feac13e2f 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -195,8 +195,6 @@ test_apt() { "arm64v8/ubuntu:focal" \ "ubuntu:jammy" \ "arm64v8/ubuntu:jammy" \ - "ubuntu:mantic" \ - "arm64v8/ubuntu:mantic" \ "ubuntu:noble" \ "arm64v8/ubuntu:noble"; do \ case "${target}" in diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-mantic/Dockerfile b/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-mantic/Dockerfile deleted file mode 100644 index b5a61282b30fc..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-mantic/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -FROM ubuntu:mantic - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -ARG DEBUG - -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - debhelper \ - devscripts \ - fakeroot \ - gnupg \ - lsb-release && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-mantic-arm64/from b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-mantic-arm64/from deleted file mode 100644 index 247faef234794..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-mantic-arm64/from +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -arm64v8/ubuntu:mantic diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-mantic/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-mantic/Dockerfile deleted file mode 100644 index 9e90e08d26513..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-mantic/Dockerfile +++ /dev/null @@ -1,85 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -ARG FROM=ubuntu:mantic -FROM ${FROM} - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -ARG DEBUG -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - clang \ - clang-tools \ - cmake \ - debhelper \ - devscripts \ - git \ - gtk-doc-tools \ - libboost-filesystem-dev \ - libboost-system-dev \ - libbrotli-dev \ - libbz2-dev \ - libc-ares-dev \ - libcurl4-openssl-dev \ - libgirepository1.0-dev \ - libglib2.0-doc \ - libgmock-dev \ - libgoogle-glog-dev \ - libgrpc++-dev \ - libgtest-dev \ - liblz4-dev \ - libmlir-15-dev \ - libprotobuf-dev \ - libprotoc-dev \ - libre2-dev \ - libsnappy-dev \ - libssl-dev \ - libthrift-dev \ - libutf8proc-dev \ - libzstd-dev \ - llvm-dev \ - lsb-release \ - meson \ - mlir-15-tools \ - ninja-build \ - nlohmann-json3-dev \ - pkg-config \ - protobuf-compiler-grpc \ - python3-dev \ - python3-pip \ - python3-setuptools \ - rapidjson-dev \ - tzdata \ - valac \ - zlib1g-dev && \ - if apt list | grep -q '^libcuda1'; then \ - apt install -y -V ${quiet} nvidia-cuda-toolkit; \ - else \ - :; \ - fi && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/package-task.rb b/dev/tasks/linux-packages/package-task.rb index 51fe0b9a75b0c..3a9e5e48b4585 100644 --- a/dev/tasks/linux-packages/package-task.rb +++ b/dev/tasks/linux-packages/package-task.rb @@ -277,8 +277,6 @@ def apt_targets_default # "ubuntu-focal-arm64", "ubuntu-jammy", # "ubuntu-jammy-arm64", - "ubuntu-mantic", - # "ubuntu-mantic-arm64", "ubuntu-noble", # "ubuntu-noble-arm64", ] diff --git a/dev/tasks/python-wheels/github.osx.amd64.yml b/dev/tasks/python-wheels/github.osx.amd64.yml index 526412f84214b..e31a681653b37 100644 --- a/dev/tasks/python-wheels/github.osx.amd64.yml +++ b/dev/tasks/python-wheels/github.osx.amd64.yml @@ -85,6 +85,7 @@ jobs: --clean-after-build \ --x-install-root=${VCPKG_ROOT}/installed \ --x-manifest-root=arrow/ci/vcpkg \ + --x-feature=azure \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ diff --git a/dev/tasks/python-wheels/github.osx.arm64.yml b/dev/tasks/python-wheels/github.osx.arm64.yml index 35d74f1462453..380c2e42f1d88 100644 --- a/dev/tasks/python-wheels/github.osx.arm64.yml +++ b/dev/tasks/python-wheels/github.osx.arm64.yml @@ -71,6 +71,7 @@ jobs: --clean-after-build \ --x-install-root=${VCPKG_ROOT}/installed \ --x-manifest-root=arrow/ci/vcpkg \ + --x-feature=azure \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index cf04d29715306..c2321453052dc 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -465,7 +465,6 @@ tasks: "debian-trixie", "ubuntu-focal", "ubuntu-jammy", - "ubuntu-mantic", "ubuntu-noble"] %} {% for architecture in ["amd64", "arm64"] %} {{ target }}-{{ architecture }}: @@ -1104,12 +1103,12 @@ tasks: image: debian-cpp {% endfor %} - test-fedora-38-cpp: + test-fedora-39-cpp: ci: github template: docker-tests/github.linux.yml params: env: - FEDORA: 38 + FEDORA: 39 image: fedora-cpp {% for cpp_standard in [20] %} @@ -1217,12 +1216,12 @@ tasks: UBUNTU: 22.04 image: ubuntu-python - test-fedora-38-python-3: + test-fedora-39-python-3: ci: azure template: docker-tests/azure.linux.yml params: env: - FEDORA: 38 + FEDORA: 39 image: fedora-python test-r-linux-valgrind: diff --git a/docker-compose.yml b/docker-compose.yml index 
8a7223b57632f..aec685775aab1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -320,6 +320,8 @@ services: # Shrink test runtime by enabling minimal optimizations ARROW_C_FLAGS_DEBUG: "-g1 -Og" ARROW_CXX_FLAGS_DEBUG: "-g1 -Og" + # GH-39973: Do not use debug memory pool for valgrind + ARROW_DEBUG_MEMORY_POOL: "none" ARROW_ENABLE_TIMING_TESTS: # inherit ARROW_FLIGHT: "OFF" ARROW_FLIGHT_SQL: "OFF" @@ -598,6 +600,8 @@ services: CXX: clang++-${CLANG_TOOLS} # Avoid creating huge static libraries ARROW_BUILD_STATIC: "OFF" + # GH-39973: Do not use debug memory pool for ASAN + ARROW_DEBUG_MEMORY_POOL: "none" ARROW_ENABLE_TIMING_TESTS: # inherit # GH-33920: Disable Flight SQL to reduce build time. # We'll be able to re-enable this with Ubuntu 24.04 because @@ -654,7 +658,7 @@ services: # docker-compose run --rm fedora-cpp # Parameters: # ARCH: amd64, arm64v8, ... - # FEDORA: 38 + # FEDORA: 39 image: ${REPO}:${ARCH}-fedora-${FEDORA}-cpp build: context: . @@ -955,7 +959,7 @@ services: # docker-compose run --rm fedora-python # Parameters: # ARCH: amd64, arm64v8, ... - # FEDORA: 38 + # FEDORA: 39 image: ${REPO}:${ARCH}-fedora-${FEDORA}-python-3 build: context: . @@ -1740,6 +1744,7 @@ services: args: r: ${R} jdk: ${JDK} + maven: ${MAVEN} node: ${NODE} base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 environment: diff --git a/docs/source/cpp/env_vars.rst b/docs/source/cpp/env_vars.rst index 0fa80aa1106c1..eb7c797df5e27 100644 --- a/docs/source/cpp/env_vars.rst +++ b/docs/source/cpp/env_vars.rst @@ -58,8 +58,10 @@ that changing their value later will have an effect. - ``abort`` exits the processus with a non-zero return value; - ``trap`` issues a platform-specific debugger breakpoint / trap instruction; - ``warn`` prints a warning on stderr and continues execution; + - ``none`` disables memory checks; - If this variable is not set, or has empty an value, memory checks are disabled. + If this variable is not set, or has an empty value, it has the same effect + as the value ``none`` - memory checks are disabled. .. note:: While this functionality can be useful and has little overhead, it diff --git a/docs/source/cpp/tutorials/basic_arrow.rst b/docs/source/cpp/tutorials/basic_arrow.rst index 409dfcc40d28f..0b7afa6130373 100644 --- a/docs/source/cpp/tutorials/basic_arrow.rst +++ b/docs/source/cpp/tutorials/basic_arrow.rst @@ -25,15 +25,15 @@ Basic Arrow Data Structures =========================== Apache Arrow provides fundamental data structures for representing data: -:class:`Array`, :class:`ChunkedArray`, :class:`RecordBatch`, and :class:`Table`. -This article shows how to construct these data structures from primitive -data types; specifically, we will work with integers of varying size +:class:`Array`, :class:`ChunkedArray`, :class:`RecordBatch`, and :class:`Table`. +This article shows how to construct these data structures from primitive +data types; specifically, we will work with integers of varying size representing days, months, and years. We will use them to create the following data structures: #. Arrow :class:`Arrays ` -#. :class:`ChunkedArrays` +#. :class:`ChunkedArrays` #. :class:`RecordBatch`, from :class:`Arrays ` -#. :class:`Table`, from :class:`ChunkedArrays` +#. :class:`Table`, from :class:`ChunkedArrays` Pre-requisites -------------- @@ -50,14 +50,14 @@ Setup Before trying out Arrow, we need to fill in a couple gaps: 1. We need to include necessary headers. - + 2. ``A main()`` is needed to glue things together. Includes ^^^^^^^^ First, as ever, we need some includes. 
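Stepping back to the ``ARROW_DEBUG_MEMORY_POOL`` documentation updated above: the accepted values can be summarized in a short, hedged C++ sketch. This only paraphrases the behavior described in env_vars.rst and memory_pool.cc; the function name is a placeholder, not the real arrow::IsDebugEnabled().

    #include <cstdlib>
    #include <iostream>
    #include <string>

    // Placeholder sketch of the documented ARROW_DEBUG_MEMORY_POOL handling:
    // unset, empty, or "none" disables the checks; "abort", "trap" and "warn"
    // enable them; anything else is reported as invalid and treated as disabled.
    bool DebugMemoryPoolRequested() {
      const char* raw = std::getenv("ARROW_DEBUG_MEMORY_POOL");
      if (raw == nullptr) return false;
      const std::string value(raw);
      if (value.empty() || value == "none") return false;
      if (value == "abort" || value == "trap" || value == "warn") return true;
      std::cerr << "Invalid value for ARROW_DEBUG_MEMORY_POOL: '" << value
                << "'. Valid values are 'abort', 'trap', 'warn', 'none'." << std::endl;
      return false;
    }

The docker-compose changes above use the new "none" value to switch the checks off explicitly for the valgrind and ASAN jobs (GH-39973).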
We'll get ``iostream`` for output, then import Arrow's basic -functionality from ``api.h``, like so: +functionality from ``api.h``, like so: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/arrow_example.cc :language: cpp @@ -75,14 +75,14 @@ following: :start-after: (Doc section: Main) :end-before: (Doc section: Main) -This allows us to easily use Arrow’s error-handling macros, which will +This allows us to easily use Arrow's error-handling macros, which will return back to ``main()`` with a :class:`arrow::Status` object if a failure occurs – and this ``main()`` will report the error. Note that this means Arrow never raises exceptions, instead relying upon returning :class:`Status`. For more on that, read here: :doc:`/cpp/conventions`. To accompany this ``main()``, we have a ``RunMain()`` from which any :class:`Status` -objects can return – this is where we’ll write the rest of the program: +objects can return – this is where we'll write the rest of the program: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/arrow_example.cc :language: cpp @@ -97,14 +97,14 @@ Building int8 Arrays ^^^^^^^^^^^^^^^^^^^^ Given that we have some data in standard C++ arrays, and want to use Arrow, we need to move -the data from said arrays into Arrow arrays. We still guarantee contiguity of memory in an +the data from said arrays into Arrow arrays. We still guarantee contiguity of memory in an :class:`Array`, so no worries about a performance loss when using :class:`Array` vs C++ arrays. -The easiest way to construct an :class:`Array` uses an :class:`ArrayBuilder`. +The easiest way to construct an :class:`Array` uses an :class:`ArrayBuilder`. .. seealso:: :doc:`/cpp/arrays` for more technical details on :class:`Array` The following code initializes an :class:`ArrayBuilder` for an :class:`Array` that will hold 8 bit -integers. Specifically, it uses the ``AppendValues()`` method, present in concrete +integers. Specifically, it uses the ``AppendValues()`` method, present in concrete :class:`arrow::ArrayBuilder` subclasses, to fill the :class:`ArrayBuilder` with the contents of a standard C++ array. Note the use of :c:macro:`ARROW_RETURN_NOT_OK`. If ``AppendValues()`` fails, this macro will return to ``main()``, which will @@ -115,10 +115,10 @@ print out the meaning of the failure. :start-after: (Doc section: int8builder 1 Append) :end-before: (Doc section: int8builder 1 Append) -Given an :class:`ArrayBuilder` has the values we want in our :class:`Array`, we can use -:func:`ArrayBuilder::Finish` to output the final structure to an :class:`Array` – specifically, +Given an :class:`ArrayBuilder` has the values we want in our :class:`Array`, we can use +:func:`ArrayBuilder::Finish` to output the final structure to an :class:`Array` – specifically, we output to a ``std::shared_ptr``. Note the use of :c:macro:`ARROW_ASSIGN_OR_RAISE` -in the following code. :func:`~ArrayBuilder::Finish` outputs a :class:`arrow::Result` object, which :c:macro:`ARROW_ASSIGN_OR_RAISE` +in the following code. :func:`~ArrayBuilder::Finish` outputs a :class:`arrow::Result` object, which :c:macro:`ARROW_ASSIGN_OR_RAISE` can process. If the method fails, it will return to ``main()`` with a :class:`Status` that will explain what went wrong. If it succeeds, then it will assign the final output to the left-hand variable. @@ -141,7 +141,7 @@ Building int16 Arrays An :class:`ArrayBuilder` has its type specified at the time of declaration. Once this is done, it cannot have its type changed. 
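As a condensed sketch of the int8/int16 builder steps this section describes (the variable names are placeholders, not the tutorial's actual literalinclude source in cpp/examples/tutorial_examples/arrow_example.cc):

    #include <iostream>

    #include <arrow/api.h>

    arrow::Status BuildDaysAndYears() {
      // Day values fit in 8 bits, so an Int8Builder is enough.
      arrow::Int8Builder int8builder;
      int8_t days_raw[5] = {1, 12, 17, 23, 28};
      ARROW_RETURN_NOT_OK(int8builder.AppendValues(days_raw, 5));
      std::shared_ptr<arrow::Array> days;
      ARROW_ASSIGN_OR_RAISE(days, int8builder.Finish());

      // A builder's type is fixed, so year data (needing 16 bits) gets a new builder.
      arrow::Int16Builder int16builder;
      int16_t years_raw[5] = {1990, 2000, 1995, 2000, 1995};
      ARROW_RETURN_NOT_OK(int16builder.AppendValues(years_raw, 5));
      std::shared_ptr<arrow::Array> years;
      ARROW_ASSIGN_OR_RAISE(years, int16builder.Finish());

      std::cout << days->ToString() << " / " << years->ToString() << std::endl;
      return arrow::Status::OK();
    }

The same builders can be reused afterwards to produce the extra chunks used in the ChunkedArray section further down.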
We have to make a new one when we switch to year data, which -requires a 16-bit integer at the minimum. Of course, there’s an :class:`ArrayBuilder` for that. +requires a 16-bit integer at the minimum. Of course, there's an :class:`ArrayBuilder` for that. It uses the exact same methods, but with the new data type: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/arrow_example.cc @@ -154,16 +154,16 @@ Now, we have three Arrow :class:`Arrays `, with some variance in t Making a RecordBatch -------------------- -A columnar data format only really comes into play when you have a table. -So, let’s make one. The first kind we’ll make is the :class:`RecordBatch` – this -uses :class:`Arrays ` internally, which means all data will be contiguous within each +A columnar data format only really comes into play when you have a table. +So, let's make one. The first kind we'll make is the :class:`RecordBatch` – this +uses :class:`Arrays ` internally, which means all data will be contiguous within each column, but any appending or concatenating will require copying. Making a :class:`RecordBatch` has two steps, given existing :class:`Arrays `: #. Defining a :class:`Schema` #. Loading the :class:`Schema` and Arrays into the constructor -Defining a Schema +Defining a Schema ^^^^^^^^^^^^^^^^^ To get started making a :class:`RecordBatch`, we first need to define @@ -180,8 +180,8 @@ so: Building a RecordBatch ^^^^^^^^^^^^^^^^^^^^^^ -With data in :class:`Arrays ` from the previous section, and column descriptions in our -:class:`Schema` from the previous step, we can make the :class:`RecordBatch`. Note that the +With data in :class:`Arrays ` from the previous section, and column descriptions in our +:class:`Schema` from the previous step, we can make the :class:`RecordBatch`. Note that the length of the columns is necessary, and the length is shared by all columns. .. literalinclude:: ../../../../cpp/examples/tutorial_examples/arrow_example.cc @@ -190,18 +190,18 @@ length of the columns is necessary, and the length is shared by all columns. :end-before: (Doc section: RBatch) Now, we have our data in a nice tabular form, safely within the :class:`RecordBatch`. -What we can do with this will be discussed in the later tutorials. +What we can do with this will be discussed in the later tutorials. Making a ChunkedArray --------------------- -Let’s say that we want an array made up of sub-arrays, because it +Let's say that we want an array made up of sub-arrays, because it can be useful for avoiding data copies when concatenating, for parallelizing work, for fitting each chunk -cutely into cache, or for exceeding the 2,147,483,647 row limit in a +into cache, or for exceeding the 2,147,483,647 row limit in a standard Arrow :class:`Array`. For this, Arrow offers :class:`ChunkedArray`, which can be made up of individual Arrow :class:`Arrays `. In this example, we can reuse the arrays we made earlier in part of our chunked array, allowing us to extend them without having to copy -data. So, let’s build a few more :class:`Arrays `, +data. So, let's build a few more :class:`Arrays `, using the same builders for ease of use: .. 
literalinclude:: ../../../../cpp/examples/tutorial_examples/arrow_example.cc @@ -209,7 +209,7 @@ using the same builders for ease of use: :start-after: (Doc section: More Arrays) :end-before: (Doc section: More Arrays) -In order to support an arbitrary amount of :class:`Arrays ` in the construction of the +In order to support an arbitrary amount of :class:`Arrays ` in the construction of the :class:`ChunkedArray`, Arrow supplies :class:`ArrayVector`. This provides a vector for :class:`Arrays `, and we'll use it here to prepare to make a :class:`ChunkedArray`: @@ -233,18 +233,18 @@ for the month and year data: :start-after: (Doc section: ChunkedArray Month Year) :end-before: (Doc section: ChunkedArray Month Year) -With that, we are left with three :class:`ChunkedArrays `, varying in type. +With that, we are left with three :class:`ChunkedArrays `, varying in type. Making a Table -------------- -One particularly useful thing we can do with the :class:`ChunkedArrays ` from the previous section is creating -:class:`Tables `. Much like a :class:`RecordBatch`, a :class:`Table` stores tabular data. However, a +One particularly useful thing we can do with the :class:`ChunkedArrays ` from the previous section is creating +:class:`Tables
`. Much like a :class:`RecordBatch`, a :class:`Table` stores tabular data. However, a :class:`Table` does not guarantee contiguity, due to being made up of :class:`ChunkedArrays `. This can be useful for logic, parallelizing work, for fitting chunks into cache, or exceeding the 2,147,483,647 row limit present in :class:`Array` and, thus, :class:`RecordBatch`. -If you read up to :class:`RecordBatch`, you may note that the :class:`Table` constructor in the following code is +If you read up to :class:`RecordBatch`, you may note that the :class:`Table` constructor in the following code is effectively identical, it just happens to put the length of the columns in position 3, and makes a :class:`Table`. We re-use the :class:`Schema` from before, and make our :class:`Table`: @@ -255,23 +255,23 @@ make our :class:`Table`: :end-before: (Doc section: Table) Now, we have our data in a nice tabular form, safely within the :class:`Table`. -What we can do with this will be discussed in the later tutorials. +What we can do with this will be discussed in the later tutorials. -Ending Program +Ending Program -------------- At the end, we just return :func:`Status::OK()`, so the ``main()`` knows that -we’re done, and that everything’s okay. +we're done, and that everything's okay. .. literalinclude:: ../../../../cpp/examples/tutorial_examples/arrow_example.cc :language: cpp :start-after: (Doc section: Ret) :end-before: (Doc section: Ret) -Wrapping Up +Wrapping Up ----------- -With that, you’ve created the fundamental data structures in Arrow, and +With that, you've created the fundamental data structures in Arrow, and can proceed to getting them in and out of a program with file I/O in the next article. Refer to the below for a copy of the complete code: @@ -281,4 +281,4 @@ Refer to the below for a copy of the complete code: :start-after: (Doc section: Basic Example) :end-before: (Doc section: Basic Example) :linenos: - :lineno-match: \ No newline at end of file + :lineno-match: diff --git a/docs/source/format/CDataInterface.rst b/docs/source/format/CDataInterface.rst index 812212f536169..ef4bf1cf3238d 100644 --- a/docs/source/format/CDataInterface.rst +++ b/docs/source/format/CDataInterface.rst @@ -251,7 +251,7 @@ Examples array has format string ``d:12,5``. * A ``list`` array has format string ``+l``, and its single child has format string ``L``. -* A ``large_list_view`` array has format string ``+Lv``, and its single +* A ``large_list_view`` array has format string ``+vL``, and its single child has format string ``L``. * A ``struct`` has format string ``+s``; its two children have names ``ints`` and ``floats``, and format strings ``i`` and diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index b858862dcff01..e6f6c3dbbd3d1 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -77,6 +77,8 @@ may expose data type-specific methods or properties. ListArray FixedSizeListArray LargeListArray + ListViewArray + LargeListViewArray MapArray RunEndEncodedArray StructArray @@ -135,6 +137,8 @@ classes may expose data type-specific methods or properties. 
RunEndEncodedScalar ListScalar LargeListScalar + ListViewScalar + LargeListViewScalar MapScalar StructScalar UnionScalar diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 642c243b21af0..62bf4b7723558 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -60,6 +60,8 @@ These should be used to create Arrow data types and schemas. decimal128 list_ large_list + list_view + large_list_view map_ struct dictionary @@ -149,6 +151,8 @@ represents a given data type (such as ``int32``) or general category is_list is_large_list is_fixed_size_list + is_list_view + is_large_list_view is_struct is_union is_nested diff --git a/go/arrow/flight/flightsql/client.go b/go/arrow/flight/flightsql/client.go index 441f88f39f43a..068bfa84c3144 100644 --- a/go/arrow/flight/flightsql/client.go +++ b/go/arrow/flight/flightsql/client.go @@ -450,6 +450,31 @@ func (c *Client) PrepareSubstrait(ctx context.Context, plan SubstraitPlan, opts return parsePreparedStatementResponse(c, c.Alloc, stream) } +func (c *Client) LoadPreparedStatementFromResult(result *CreatePreparedStatementResult) (*PreparedStatement, error) { + var ( + err error + dsSchema, paramSchema *arrow.Schema + ) + if result.DatasetSchema != nil { + dsSchema, err = flight.DeserializeSchema(result.DatasetSchema, c.Alloc) + if err != nil { + return nil, err + } + } + if result.ParameterSchema != nil { + paramSchema, err = flight.DeserializeSchema(result.ParameterSchema, c.Alloc) + if err != nil { + return nil, err + } + } + return &PreparedStatement{ + client: c, + handle: result.PreparedStatementHandle, + datasetSchema: dsSchema, + paramSchema: paramSchema, + }, nil +} + func parsePreparedStatementResponse(c *Client, mem memory.Allocator, results pb.FlightService_DoActionClient) (*PreparedStatement, error) { if err := results.CloseSend(); err != nil { return nil, err @@ -1027,6 +1052,46 @@ func (p *PreparedStatement) Execute(ctx context.Context, opts ...grpc.CallOption return p.client.getFlightInfo(ctx, desc, opts...) } +// ExecutePut calls DoPut for the prepared statement on the server. If SetParameters +// has been called then the parameter bindings will be sent before execution. +// +// Will error if already closed. +func (p *PreparedStatement) ExecutePut(ctx context.Context, opts ...grpc.CallOption) error { + if p.closed { + return errors.New("arrow/flightsql: prepared statement already closed") + } + + cmd := &pb.CommandPreparedStatementQuery{PreparedStatementHandle: p.handle} + + desc, err := descForCommand(cmd) + if err != nil { + return err + } + + if p.hasBindParameters() { + pstream, err := p.client.Client.DoPut(ctx, opts...) + if err != nil { + return err + } + + wr, err := p.writeBindParameters(pstream, desc) + if err != nil { + return err + } + if err = wr.Close(); err != nil { + return err + } + pstream.CloseSend() + + // wait for the server to ack the result + if _, err = pstream.Recv(); err != nil && err != io.EOF { + return err + } + } + + return nil +} + // ExecutePoll executes the prepared statement on the server and returns a PollInfo // indicating the progress of execution. 
// diff --git a/go/arrow/flight/flightsql/client_test.go b/go/arrow/flight/flightsql/client_test.go index c8b9f7f1246c1..f35aeefcf4628 100644 --- a/go/arrow/flight/flightsql/client_test.go +++ b/go/arrow/flight/flightsql/client_test.go @@ -665,6 +665,36 @@ func (s *FlightSqlClientSuite) TestRenewFlightEndpoint() { s.Equal(&mockedRenewedEndpoint, renewedEndpoint) } +func (s *FlightSqlClientSuite) TestPreparedStatementLoadFromResult() { + const query = "query" + + result := &pb.ActionCreatePreparedStatementResult{ + PreparedStatementHandle: []byte(query), + } + + parameterSchemaResult := arrow.NewSchema([]arrow.Field{{Name: "p_id", Type: arrow.PrimitiveTypes.Int64, Nullable: true}}, nil) + result.ParameterSchema = flight.SerializeSchema(parameterSchemaResult, memory.DefaultAllocator) + datasetSchemaResult := arrow.NewSchema([]arrow.Field{{Name: "ds_id", Type: arrow.PrimitiveTypes.Int64, Nullable: true}}, nil) + result.DatasetSchema = flight.SerializeSchema(datasetSchemaResult, memory.DefaultAllocator) + + prepared, err := s.sqlClient.LoadPreparedStatementFromResult(result) + s.NoError(err) + + s.Equal(string(prepared.Handle()), "query") + + paramSchema := prepared.ParameterSchema() + paramRec, _, err := array.RecordFromJSON(memory.DefaultAllocator, paramSchema, strings.NewReader(`[{"p_id": 1}]`)) + s.NoError(err) + defer paramRec.Release() + + datasetSchema := prepared.DatasetSchema() + datasetRec, _, err := array.RecordFromJSON(memory.DefaultAllocator, datasetSchema, strings.NewReader(`[{"ds_id": 1}]`)) + s.NoError(err) + defer datasetRec.Release() + + s.Equal(string(prepared.Handle()), "query") +} + func TestFlightSqlClient(t *testing.T) { suite.Run(t, new(FlightSqlClientSuite)) } diff --git a/go/arrow/flight/flightsql/types.go b/go/arrow/flight/flightsql/types.go index d89e68f028bb8..c70a8bdc4ec26 100644 --- a/go/arrow/flight/flightsql/types.go +++ b/go/arrow/flight/flightsql/types.go @@ -852,3 +852,5 @@ const ( // cancellation request. CancelResultNotCancellable = pb.ActionCancelQueryResult_CANCEL_RESULT_NOT_CANCELLABLE ) + +type CreatePreparedStatementResult = pb.ActionCreatePreparedStatementResult diff --git a/java/dataset/pom.xml b/java/dataset/pom.xml index 8723fafa8dadd..4c302ea59dbe3 100644 --- a/java/dataset/pom.xml +++ b/java/dataset/pom.xml @@ -26,7 +26,7 @@ ../../../cpp/release-build/ 2.5.0 - 1.11.0 + 1.13.1 1.11.3 diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/ArrowMessage.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/ArrowMessage.java index 46cb282e9f3ce..5b946932f39f2 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/ArrowMessage.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/ArrowMessage.java @@ -429,11 +429,26 @@ private InputStream asInputStream() { ByteBuf initialBuf = Unpooled.buffer(baos.size()); initialBuf.writeBytes(baos.toByteArray()); final CompositeByteBuf bb; - final int maxNumComponents = Math.max(2, bufs.size() + 1); final ImmutableList byteBufs = ImmutableList.builder() .add(initialBuf) .addAll(allBufs) .build(); + // See: https://github.com/apache/arrow/issues/40039 + // CompositeByteBuf requires us to pass maxNumComponents to the constructor. + // This number will be used to decide when to stop adding new components as separate buffers + // and instead merge existing components into a new buffer by performing a memory copy. + // We want to avoid memory copies as much as possible, so we want to set a limit that won't be reached.
+ // At first glance it seems reasonable to set the limit to byteBufs.size() + 1, + // because that would be enough to avoid merges of the byteBufs we pass to the constructor. + // But later this buffer will be written to the socket by Netty, + // and DefaultHttp2ConnectionEncoder uses CoalescingBufferQueue to combine small buffers into one. + // The method CoalescingBufferQueue.compose will check if the current buffer is already a CompositeByteBuf + // and if that is the case it will just add a new component to this buffer. + // But in our case, if we set maxNumComponents=byteBufs.size() + 1, the merge will happen on the first attempt + // to write data to the socket, because the header message is small and Netty will always try to combine it with the + // large CompositeByteBuf we're creating here. + // We never want additional memory copies, so we set the limit to Integer.MAX_VALUE. + final int maxNumComponents = Integer.MAX_VALUE; if (tryZeroCopyWrite) { bb = new ArrowBufRetainingCompositeByteBuf(maxNumComponents, byteBufs, bufs); } else { diff --git a/java/performance/pom.xml b/java/performance/pom.xml index ba5a6616dca77..d572876e724a5 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -81,7 +81,7 @@ UTF-8 - 1.21 + 1.37 1.8 benchmarks true diff --git a/java/pom.xml b/java/pom.xml index 6442987f5a192..accff4c9b9f69 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -484,7 +484,7 @@ maven-failsafe-plugin - 3.0.0-M7 + 3.2.5 ${project.build.directory} @@ -750,7 +750,7 @@ junit junit - 4.13.1 + 4.13.2 test diff --git a/python/examples/minimal_build/Dockerfile.fedora b/python/examples/minimal_build/Dockerfile.fedora index cc3d62bec0ebe..e7b9600b67b0e 100644 --- a/python/examples/minimal_build/Dockerfile.fedora +++ b/python/examples/minimal_build/Dockerfile.fedora @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License.
-FROM fedora:35 +FROM fedora:39 RUN dnf update -y && \ dnf install -y \ diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 4dbd1258d3cea..2ee97ddb662e5 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -166,7 +166,8 @@ def print_entry(label, value): binary, string, utf8, binary_view, string_view, large_binary, large_string, large_utf8, decimal128, decimal256, - list_, large_list, map_, struct, + list_, large_list, list_view, large_list_view, + map_, struct, union, sparse_union, dense_union, dictionary, run_end_encoded, @@ -174,8 +175,9 @@ def print_entry(label, value): field, type_for_alias, DataType, DictionaryType, StructType, - ListType, LargeListType, MapType, FixedSizeListType, - UnionType, SparseUnionType, DenseUnionType, + ListType, LargeListType, FixedSizeListType, + ListViewType, LargeListViewType, + MapType, UnionType, SparseUnionType, DenseUnionType, TimestampType, Time32Type, Time64Type, DurationType, FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, @@ -201,8 +203,9 @@ def print_entry(label, value): Int32Array, UInt32Array, Int64Array, UInt64Array, HalfFloatArray, FloatArray, DoubleArray, - ListArray, LargeListArray, MapArray, - FixedSizeListArray, UnionArray, + ListArray, LargeListArray, FixedSizeListArray, + ListViewArray, LargeListViewArray, + MapArray, UnionArray, BinaryArray, StringArray, LargeBinaryArray, LargeStringArray, BinaryViewArray, StringViewArray, @@ -220,6 +223,7 @@ def print_entry(label, value): HalfFloatScalar, FloatScalar, DoubleScalar, Decimal128Scalar, Decimal256Scalar, ListScalar, LargeListScalar, FixedSizeListScalar, + ListViewScalar, LargeListViewScalar, Date32Scalar, Date64Scalar, Time32Scalar, Time64Scalar, TimestampScalar, DurationScalar, diff --git a/python/pyarrow/_flight.pyx b/python/pyarrow/_flight.pyx index a2ff045f256ac..67ee7590560f0 100644 --- a/python/pyarrow/_flight.pyx +++ b/python/pyarrow/_flight.pyx @@ -2013,8 +2013,9 @@ cdef CStatus _data_stream_next(void* self, CFlightPayload* payload) except *: max_attempts = 128 for _ in range(max_attempts): if stream.current_stream != nullptr: - check_flight_status( - stream.current_stream.get().Next().Value(payload)) + with nogil: + check_flight_status( + stream.current_stream.get().Next().Value(payload)) # If the stream ended, see if there's another stream from the # generator if payload.ipc_message.metadata != nullptr: diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 0b685245655a2..7bc68a288aa78 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -849,6 +849,13 @@ cdef class FileMetaData(_Weakrefable): cdef Buffer buffer = sink.getvalue() return _reconstruct_filemetadata, (buffer,) + def __hash__(self): + return hash((self.schema, + self.num_rows, + self.num_row_groups, + self.format_version, + self.serialized_size)) + def __repr__(self): return """{0} created_by: {1} @@ -1071,6 +1078,9 @@ cdef class ParquetSchema(_Weakrefable): def __getitem__(self, i): return self.column(i) + def __hash__(self): + return hash(self.schema.ToString()) + @property def names(self): """Name of each field (list of str).""" diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 48d02d9661e9b..e1bf494920566 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2524,6 +2524,578 @@ cdef class LargeListArray(BaseListArray): return pyarrow_wrap_array(( self.ap).offsets()) +cdef class ListViewArray(Array): + """ + Concrete class for Arrow arrays of a 
list view data type. + """ + + @staticmethod + def from_arrays(offsets, sizes, values, DataType type=None, MemoryPool pool=None, mask=None): + """ + Construct ListViewArray from arrays of int32 offsets, sizes, and values. + + Parameters + ---------- + offsets : Array (int32 type) + sizes : Array (int32 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_view_array : ListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + cdef: + Array _offsets, _sizes, _values + shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask + CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int32') + _sizes = asarray(sizes, type='int32') + _values = asarray(values) + + c_mask = c_mask_inverted_from_obj(mask, pool) + + if type is not None: + with nogil: + out = GetResultValue( + CListViewArray.FromArraysAndType( + type.sp_type, _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + else: + with nogil: + out = GetResultValue( + CListViewArray.FromArrays( + _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @property + def values(self): + """ + Return the underlying array of values which backs the ListViewArray + ignoring the array's offset and sizes. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + Examples + -------- + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + cdef CListViewArray* arr = self.ap + return pyarrow_wrap_array(arr.values()) + + @property + def offsets(self): + """ + Return the list offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. 
+ + Returns + ------- + offsets : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + return pyarrow_wrap_array(( self.ap).offsets()) + + @property + def sizes(self): + """ + Return the list sizes as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + sizes : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ + return pyarrow_wrap_array(( self.ap).sizes()) + + def flatten(self, memory_pool=None): + """ + Unnest this ListViewArray by one level. + + The returned Array is logically a concatenation of all the sub-lists + in this Array. + + Note that this method is different from ``self.values`` in that + it takes care of the slicing offset as well as null elements backed + by non-empty sub-lists. + + Parameters + ---------- + memory_pool : MemoryPool, optional + + Returns + ------- + result : Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + """ + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) + with nogil: + out = GetResultValue(( self.ap).Flatten(cpool)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + +cdef class LargeListViewArray(Array): + """ + Concrete class for Arrow arrays of a large list view data type. + + Identical to ListViewArray, but with 64-bit offsets. + """ + @staticmethod + def from_arrays(offsets, sizes, values, DataType type=None, MemoryPool pool=None, mask=None): + """ + Construct LargeListViewArray from arrays of int64 offsets and values. + + Parameters + ---------- + offsets : Array (int64 type) + sizes : Array (int64 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). 
+ + Returns + ------- + list_view_array : LargeListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + cdef: + Array _offsets, _sizes, _values + shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask + CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int64') + _sizes = asarray(sizes, type='int64') + _values = asarray(values) + + c_mask = c_mask_inverted_from_obj(mask, pool) + + if type is not None: + with nogil: + out = GetResultValue( + CLargeListViewArray.FromArraysAndType( + type.sp_type, _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + else: + with nogil: + out = GetResultValue( + CLargeListViewArray.FromArrays( + _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @property + def values(self): + """ + Return the underlying array of values which backs the LargeListArray + ignoring the array's offset. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + See Also + -------- + LargeListArray.flatten : ... + + Examples + -------- + + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + cdef CLargeListViewArray* arr = self.ap + return pyarrow_wrap_array(arr.values()) + + @property + def offsets(self): + """ + Return the list view offsets as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + offsets : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + return pyarrow_wrap_array(( self.ap).offsets()) + + @property + def sizes(self): + """ + Return the list view sizes as an int64 array. 
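# Sketch: LargeListViewArray mirrors ListViewArray but keeps 64-bit offsets and
# sizes, so the accessor properties come back as int64 arrays.
import pyarrow as pa

arr = pa.LargeListViewArray.from_arrays([0, 2], [2, 2], [1, 2, 3, 4])
assert arr.to_pylist() == [[1, 2], [3, 4]]
assert arr.offsets.type == pa.int64()
assert arr.sizes.type == pa.int64()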
+ + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + sizes : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ + return pyarrow_wrap_array(( self.ap).sizes()) + + def flatten(self, memory_pool=None): + """ + Unnest this LargeListViewArray by one level. + + The returned Array is logically a concatenation of all the sub-lists + in this Array. + + Note that this method is different from ``self.values`` in that + it takes care of the slicing offset as well as null elements backed + by non-empty sub-lists. + + Parameters + ---------- + memory_pool : MemoryPool, optional + + Returns + ------- + result : Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + """ + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) + with nogil: + out = GetResultValue(( self.ap).Flatten(cpool)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + cdef class MapArray(ListArray): """ Concrete class for Arrow arrays of a map data type. @@ -3605,7 +4177,7 @@ cdef class ExtensionArray(Array): return result -class FixedShapeTensorArray(ExtensionArray): +cdef class FixedShapeTensorArray(ExtensionArray): """ Concrete class for fixed shape tensor extension arrays. @@ -3646,17 +4218,48 @@ class FixedShapeTensorArray(ExtensionArray): def to_numpy_ndarray(self): """ - Convert fixed shape tensor extension array to a numpy array (with dim+1). + Convert fixed shape tensor extension array to a multi-dimensional numpy.ndarray. + + The resulting ndarray will have (ndim + 1) dimensions. + The size of the first dimension will be the length of the fixed shape tensor array + and the rest of the dimensions will match the permuted shape of the fixed + shape tensor. - Note: ``permutation`` should be trivial (``None`` or ``[0, 1, ..., len(shape)-1]``). + The conversion is zero-copy. + + Returns + ------- + numpy.ndarray + Ndarray representing tensors in the fixed shape tensor array concatenated + along the first dimension. + """ + + return self.to_tensor().to_numpy() + + def to_tensor(self): + """ + Convert fixed shape tensor extension array to a pyarrow.Tensor. + + The resulting Tensor will have (ndim + 1) dimensions. + The size of the first dimension will be the length of the fixed shape tensor array + and the rest of the dimensions will match the permuted shape of the fixed + shape tensor. + + The conversion is zero-copy. + + Returns + ------- + pyarrow.Tensor + Tensor representing tensors in the fixed shape tensor array concatenated + along the first dimension. 
""" - if self.type.permutation is None or self.type.permutation == list(range(len(self.type.shape))): - np_flat = np.asarray(self.storage.flatten()) - numpy_tensor = np_flat.reshape((len(self),) + tuple(self.type.shape)) - return numpy_tensor - else: - raise ValueError( - 'Only non-permuted tensors can be converted to numpy tensors.') + + cdef: + CFixedShapeTensorArray* ext_array = (self.ap) + CResult[shared_ptr[CTensor]] ctensor + with nogil: + ctensor = ext_array.ToTensor() + return pyarrow_wrap_tensor(GetResultValue(ctensor)) @staticmethod def from_numpy_ndarray(obj): @@ -3664,9 +4267,7 @@ class FixedShapeTensorArray(ExtensionArray): Convert numpy tensors (ndarrays) to a fixed shape tensor extension array. The first dimension of ndarray will become the length of the fixed shape tensor array. - - Numpy array needs to be C-contiguous in memory - (``obj.flags["C_CONTIGUOUS"]==True``). + If input array data is not contiguous a copy will be made. Parameters ---------- @@ -3700,17 +4301,25 @@ class FixedShapeTensorArray(ExtensionArray): ] ] """ - if not obj.flags["C_CONTIGUOUS"]: - raise ValueError('The data in the numpy array need to be in a single, ' - 'C-style contiguous segment.') + + if len(obj.shape) < 2: + raise ValueError( + "Cannot convert 1D array or scalar to fixed shape tensor array") + if np.prod(obj.shape) == 0: + raise ValueError("Expected a non-empty ndarray") + + permutation = (-np.array(obj.strides)).argsort(kind='stable') + if permutation[0] != 0: + raise ValueError('First stride needs to be largest to ensure that ' + 'individual tensor data is contiguous in memory.') arrow_type = from_numpy_dtype(obj.dtype) - shape = obj.shape[1:] - size = obj.size / obj.shape[0] + shape = np.take(obj.shape, permutation) + values = np.ravel(obj, order="K") return ExtensionArray.from_storage( - fixed_shape_tensor(arrow_type, shape), - FixedSizeListArray.from_arrays(np.ravel(obj, order='C'), size) + fixed_shape_tensor(arrow_type, shape[1:], permutation=permutation[1:] - 1), + FixedSizeListArray.from_arrays(values, shape[1:].prod()) ) @@ -3737,6 +4346,8 @@ cdef dict _array_classes = { _Type_DOUBLE: DoubleArray, _Type_LIST: ListArray, _Type_LARGE_LIST: LargeListArray, + _Type_LIST_VIEW: ListViewArray, + _Type_LARGE_LIST_VIEW: LargeListViewArray, _Type_MAP: MapArray, _Type_FIXED_SIZE_LIST: FixedSizeListArray, _Type_SPARSE_UNION: UnionArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 35a9ff5569d2f..95ee0e95e34c0 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -132,6 +132,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_LIST" arrow::Type::LIST" _Type_LARGE_LIST" arrow::Type::LARGE_LIST" _Type_FIXED_SIZE_LIST" arrow::Type::FIXED_SIZE_LIST" + _Type_LIST_VIEW" arrow::Type::LIST_VIEW" + _Type_LARGE_LIST_VIEW" arrow::Type::LARGE_LIST_VIEW" _Type_STRUCT" arrow::Type::STRUCT" _Type_SPARSE_UNION" arrow::Type::SPARSE_UNION" _Type_DENSE_UNION" arrow::Type::DENSE_UNION" @@ -372,6 +374,18 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CDataType] value_type() shared_ptr[CField] value_field() + cdef cppclass CListViewType" arrow::ListViewType"(CDataType): + CListViewType(const shared_ptr[CDataType]& value_type) + CListViewType(const shared_ptr[CField]& field) + shared_ptr[CDataType] value_type() + shared_ptr[CField] value_field() + + cdef cppclass CLargeListViewType" arrow::LargeListViewType"(CDataType): + CLargeListViewType(const shared_ptr[CDataType]& value_type) + 
CLargeListViewType(const shared_ptr[CField]& field) + shared_ptr[CDataType] value_type() + shared_ptr[CField] value_field() + cdef cppclass CMapType" arrow::MapType"(CDataType): CMapType(const shared_ptr[CField]& key_field, const shared_ptr[CField]& item_field, c_bool keys_sorted) @@ -491,6 +505,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CDataType] run_end_type, shared_ptr[CDataType] value_type) + cdef shared_ptr[CDataType] CMakeListViewType" arrow::list_view"( + shared_ptr[CField] value_type) + + cdef shared_ptr[CDataType] CMakeLargeListViewType" arrow::large_list_view"( + shared_ptr[CField] value_type) + cdef cppclass CSchema" arrow::Schema": CSchema(const vector[shared_ptr[CField]]& fields) CSchema(const vector[shared_ptr[CField]]& fields, @@ -696,6 +716,70 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] values() shared_ptr[CDataType] value_type() + cdef cppclass CListViewArray" arrow::ListViewArray"(CArray): + @staticmethod + CResult[shared_ptr[CArray]] FromArrays( + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + @staticmethod + CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( + shared_ptr[CDataType], + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + CResult[shared_ptr[CArray]] Flatten( + CMemoryPool* pool + ) + + const int32_t* raw_value_offsets() + const int32_t* raw_value_sizes() + int32_t value_offset(int i) + int32_t value_length(int i) + shared_ptr[CArray] values() + shared_ptr[CArray] offsets() + shared_ptr[CArray] sizes() + shared_ptr[CDataType] value_type() + + cdef cppclass CLargeListViewArray" arrow::LargeListViewArray"(CArray): + @staticmethod + CResult[shared_ptr[CArray]] FromArrays( + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + @staticmethod + CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( + shared_ptr[CDataType], + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + CResult[shared_ptr[CArray]] Flatten( + CMemoryPool* pool + ) + + int64_t value_offset(int i) + int64_t value_length(int i) + shared_ptr[CArray] values() + shared_ptr[CArray] offsets() + shared_ptr[CArray] sizes() + shared_ptr[CDataType] value_type() + cdef cppclass CMapArray" arrow::MapArray"(CArray): @staticmethod CResult[shared_ptr[CArray]] FromArrays( @@ -1156,6 +1240,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CListScalar" arrow::ListScalar"(CBaseListScalar): pass + cdef cppclass CListViewScalar" arrow::ListViewScalar"(CBaseListScalar): + pass + + cdef cppclass CLargeListViewScalar" arrow::LargeListViewScalar"(CBaseListScalar): + pass + cdef cppclass CMapScalar" arrow::MapScalar"(CListScalar): pass @@ -2710,26 +2800,26 @@ cdef extern from "arrow/extension_type.h" namespace "arrow": shared_ptr[CArray] storage() -cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension": +cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension" nogil: cdef cppclass CFixedShapeTensorType \ " arrow::extension::FixedShapeTensorType"(CExtensionType): + CResult[shared_ptr[CTensor]] MakeTensor(const shared_ptr[CExtensionScalar]& scalar) const + @staticmethod CResult[shared_ptr[CDataType]] Make(const shared_ptr[CDataType]& 
value_type, const vector[int64_t]& shape, const vector[int64_t]& permutation, const vector[c_string]& dim_names) - CResult[shared_ptr[CDataType]] Deserialize(const shared_ptr[CDataType] storage_type, - const c_string& serialized_data) const - - c_string Serialize() const - const shared_ptr[CDataType] value_type() const vector[int64_t] shape() const vector[int64_t] permutation() const vector[c_string] dim_names() + cdef cppclass CFixedShapeTensorArray \ + " arrow::extension::FixedShapeTensorArray"(CExtensionArray): + const CResult[shared_ptr[CTensor]] ToTensor() const cdef extern from "arrow/util/compression.h" namespace "arrow" nogil: cdef enum CCompressionType" arrow::Compression::type": @@ -2849,6 +2939,9 @@ cdef extern from "arrow/c/bridge.h" namespace "arrow" nogil: CResult[shared_ptr[CRecordBatchReader]] ImportRecordBatchReader( ArrowArrayStream*) + CStatus ExportChunkedArray(shared_ptr[CChunkedArray], ArrowArrayStream*) + CResult[shared_ptr[CChunkedArray]] ImportChunkedArray(ArrowArrayStream*) + CStatus ExportDeviceArray(const CArray&, shared_ptr[CSyncEvent], ArrowDeviceArray* out, ArrowSchema*) CResult[shared_ptr[CArray]] ImportDeviceArray( @@ -2863,6 +2956,8 @@ cdef extern from "arrow/c/bridge.h" namespace "arrow" nogil: CResult[shared_ptr[CRecordBatch]] ImportDeviceRecordBatch( ArrowDeviceArray*, ArrowSchema*) + + cdef extern from "arrow/util/byte_size.h" namespace "arrow::util" nogil: CResult[int64_t] ReferencedBufferSize(const CArray& array_data) CResult[int64_t] ReferencedBufferSize(const CRecordBatch& record_batch) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index c1104864066e9..48350212c2076 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -120,6 +120,16 @@ cdef class LargeListType(DataType): const CLargeListType* list_type +cdef class ListViewType(DataType): + cdef: + const CListViewType* list_view_type + + +cdef class LargeListViewType(DataType): + cdef: + const CLargeListViewType* list_view_type + + cdef class MapType(DataType): cdef: const CMapType* map_type @@ -425,6 +435,14 @@ cdef class LargeListArray(BaseListArray): pass +cdef class ListViewArray(Array): + pass + + +cdef class LargeListViewArray(Array): + pass + + cdef class MapArray(ListArray): pass diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index b0368b67f790e..3245e50f0fe69 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -110,6 +110,8 @@ Type_BINARY_VIEW = _Type_BINARY_VIEW Type_STRING_VIEW = _Type_STRING_VIEW Type_LIST = _Type_LIST Type_LARGE_LIST = _Type_LARGE_LIST +Type_LIST_VIEW = _Type_LIST_VIEW +Type_LARGE_LIST_VIEW = _Type_LARGE_LIST_VIEW Type_MAP = _Type_MAP Type_FIXED_SIZE_LIST = _Type_FIXED_SIZE_LIST Type_STRUCT = _Type_STRUCT diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 72e16f2cec387..966273b4bea84 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -87,6 +87,10 @@ cdef api object pyarrow_wrap_data_type( out = ListType.__new__(ListType) elif type.get().id() == _Type_LARGE_LIST: out = LargeListType.__new__(LargeListType) + elif type.get().id() == _Type_LIST_VIEW: + out = ListViewType.__new__(ListViewType) + elif type.get().id() == _Type_LARGE_LIST_VIEW: + out = LargeListViewType.__new__(LargeListViewType) elif type.get().id() == _Type_MAP: out = MapType.__new__(MapType) elif type.get().id() == _Type_FIXED_SIZE_LIST: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 2772acf81861c..41bfde39adb6f 100644 --- a/python/pyarrow/scalar.pxi +++ 
b/python/pyarrow/scalar.pxi @@ -720,6 +720,14 @@ cdef class LargeListScalar(ListScalar): pass +cdef class ListViewScalar(ListScalar): + pass + + +cdef class LargeListViewScalar(ListScalar): + pass + + cdef class StructScalar(Scalar, collections.abc.Mapping): """ Concrete class for struct scalars. @@ -1035,6 +1043,48 @@ cdef class ExtensionScalar(Scalar): return pyarrow_wrap_scalar( sp_scalar) +cdef class FixedShapeTensorScalar(ExtensionScalar): + """ + Concrete class for fixed shape tensor extension scalar. + """ + + def to_numpy(self): + """ + Convert fixed shape tensor scalar to a numpy.ndarray. + + The resulting ndarray's shape matches the permuted shape of the + fixed shape tensor scalar. + The conversion is zero-copy. + + Returns + ------- + numpy.ndarray + """ + return self.to_tensor().to_numpy() + + def to_tensor(self): + """ + Convert fixed shape tensor extension scalar to a pyarrow.Tensor, using shape + and strides derived from corresponding FixedShapeTensorType. + + The conversion is zero-copy. + + Returns + ------- + pyarrow.Tensor + Tensor represented stored in FixedShapeTensorScalar. + """ + cdef: + CFixedShapeTensorType* c_type = static_pointer_cast[CFixedShapeTensorType, CDataType]( + self.wrapped.get().type).get() + shared_ptr[CExtensionScalar] scalar = static_pointer_cast[CExtensionScalar, CScalar](self.wrapped) + shared_ptr[CTensor] ctensor + + with nogil: + ctensor = GetResultValue(c_type.MakeTensor(scalar)) + return pyarrow_wrap_tensor(ctensor) + + cdef dict _scalar_classes = { _Type_BOOL: BooleanScalar, _Type_UINT8: UInt8Scalar, @@ -1066,6 +1116,8 @@ cdef dict _scalar_classes = { _Type_LIST: ListScalar, _Type_LARGE_LIST: LargeListScalar, _Type_FIXED_SIZE_LIST: FixedSizeListScalar, + _Type_LIST_VIEW: ListViewScalar, + _Type_LARGE_LIST_VIEW: LargeListViewScalar, _Type_STRUCT: StructScalar, _Type_MAP: MapScalar, _Type_DICTIONARY: DictionaryScalar, diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index b92b92147e4dc..feaa9a960f928 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1327,6 +1327,67 @@ cdef class ChunkedArray(_PandasConvertible): result += self.chunk(i).to_pylist() return result + def __arrow_c_stream__(self, requested_schema=None): + """ + Export to a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + A capsule containing a C ArrowArrayStream struct. + """ + cdef: + ArrowArrayStream* c_stream = NULL + + if requested_schema is not None: + out_type = DataType._import_from_c_capsule(requested_schema) + if self.type != out_type: + raise NotImplementedError("Casting to requested_schema") + + stream_capsule = alloc_c_stream(&c_stream) + + with nogil: + check_status(ExportChunkedArray(self.sp_chunked_array, c_stream)) + + return stream_capsule + + @staticmethod + def _import_from_c_capsule(stream): + """ + Import ChunkedArray from a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + stream: PyCapsule + A capsule containing a C ArrowArrayStream PyCapsule. 
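# Sketch: with __arrow_ext_scalar_class__ returning FixedShapeTensorScalar (added
# later in this diff), indexing a fixed shape tensor array yields a scalar that
# can be converted to a Tensor or ndarray on its own.
import numpy as np
import pyarrow as pa

ndarray = np.arange(12, dtype=np.float32).reshape(2, 2, 3)
arr = pa.FixedShapeTensorArray.from_numpy_ndarray(ndarray)
scalar = arr[0]  # FixedShapeTensorScalar
np.testing.assert_array_equal(scalar.to_numpy(), ndarray[0])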
+ + Returns + ------- + ChunkedArray + """ + cdef: + ArrowArrayStream* c_stream + shared_ptr[CChunkedArray] c_chunked_array + ChunkedArray self + + c_stream = PyCapsule_GetPointer( + stream, 'arrow_array_stream' + ) + + with nogil: + c_chunked_array = GetResultValue(ImportChunkedArray(c_stream)) + + self = ChunkedArray.__new__(ChunkedArray) + self.init(c_chunked_array) + return self + def chunked_array(arrays, type=None): """ diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 73284d2e53b9e..bf186bd923c4f 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -499,6 +499,32 @@ def test_multi_dataset_metadata(tempdir): assert md['serialized_size'] > 0 +def test_metadata_hashing(tempdir): + path1 = str(tempdir / "metadata1") + schema1 = pa.schema([("a", "int64"), ("b", "float64")]) + pq.write_metadata(schema1, path1) + parquet_meta1 = pq.read_metadata(path1) + + # Same as 1, just different path + path2 = str(tempdir / "metadata2") + schema2 = pa.schema([("a", "int64"), ("b", "float64")]) + pq.write_metadata(schema2, path2) + parquet_meta2 = pq.read_metadata(path2) + + # different schema + path3 = str(tempdir / "metadata3") + schema3 = pa.schema([("a", "int64"), ("b", "float32")]) + pq.write_metadata(schema3, path3) + parquet_meta3 = pq.read_metadata(path3) + + # Deterministic + assert hash(parquet_meta1) == hash(parquet_meta1) # equal w/ same instance + assert hash(parquet_meta1) == hash(parquet_meta2) # equal w/ different instance + + # Not the same as other metadata with different schema + assert hash(parquet_meta1) != hash(parquet_meta3) + + @pytest.mark.filterwarnings("ignore:Parquet format:FutureWarning") def test_write_metadata(tempdir): path = str(tempdir / "metadata") diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index f851d4e0b6c29..bd9ae214b041e 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3573,3 +3573,74 @@ def test_run_end_encoded_from_buffers(): with pytest.raises(ValueError): pa.RunEndEncodedArray.from_buffers(ree_type, length, buffers, 1, offset, children) + + +@pytest.mark.parametrize(('list_array_type'), + [pa.ListViewArray, pa.LargeListViewArray]) +def test_list_view_from_arrays(list_array_type): + # test in order offsets, similar to ListArray representation + values = [1, 2, 3, 4, 5, 6, None, 7] + offsets = [0, 2, 4, 6] + sizes = [2, 2, 2, 2] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [[1, 2], [3, 4], [5, 6], [None, 7]] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == sizes + + # test out of order offsets with overlapping values + values = [1, 2, 3, 4] + offsets = [2, 1, 0] + sizes = [2, 2, 2] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [[3, 4], [2, 3], [1, 2]] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == sizes + + # test null offsets and empty list values + values = [] + offsets = [0, None] + sizes = [0, 0] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [[], None] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == [0, 0] + assert array.sizes.to_pylist() == sizes + + # test null sizes and empty list values + values = [] + offsets = [0, 0] 
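# Sketch matching the from_arrays docstring and the tests around it: nulls can
# come from a boolean mask or from nulls in the offsets/sizes arrays (the latter
# forces a copy of those arrays).
import pyarrow as pa

arr = pa.ListViewArray.from_arrays([0, None, 2], [2, 2, 2], [1, 2, 3, 4])
assert arr.to_pylist() == [[1, 2], None, [3, 4]]

mask = pa.array([False, True, False])
arr2 = pa.ListViewArray.from_arrays([0, 1, 2], [2, 2, 2], [1, 2, 3, 4], mask=mask)
assert arr2.to_pylist() == [[1, 2], None, [3, 4]]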
+ sizes = [None, 0] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [None, []] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == [0, 0] + + # test null bitmask + values = [1, 2] + offsets = [0, 0, 1] + sizes = [1, 0, 1] + mask = pa.array([False, True, False]) + array = list_array_type.from_arrays(offsets, sizes, values, mask=mask) + + assert array.to_pylist() == [[1], None, [2]] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == sizes + + +@pytest.mark.parametrize(('list_array_type'), + [pa.ListViewArray, pa.LargeListViewArray]) +def test_list_view_flatten(list_array_type): + values = [1, 2, 3, 4] + offsets = [3, 2, 1, 0] + sizes = [1, 1, 1, 1] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.flatten().to_pylist() == [4, 3, 2, 1] diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py index 1bb4d5e2e5074..c1cd6c5a7c339 100644 --- a/python/pyarrow/tests/test_cffi.py +++ b/python/pyarrow/tests/test_cffi.py @@ -603,6 +603,32 @@ def test_roundtrip_batch_reader_capsule(): imported_reader.read_next_batch() +def test_roundtrip_chunked_array_capsule(): + chunked = pa.chunked_array([pa.array(["a", "b", "c"])]) + + capsule = chunked.__arrow_c_stream__() + assert PyCapsule_IsValid(capsule, b"arrow_array_stream") == 1 + imported_chunked = pa.ChunkedArray._import_from_c_capsule(capsule) + assert imported_chunked.type == chunked.type + assert imported_chunked == chunked + + +def test_roundtrip_chunked_array_capsule_requested_schema(): + chunked = pa.chunked_array([pa.array(["a", "b", "c"])]) + + # Requesting the same type should work + requested_capsule = chunked.type.__arrow_c_schema__() + capsule = chunked.__arrow_c_stream__(requested_capsule) + imported_chunked = pa.ChunkedArray._import_from_c_capsule(capsule) + assert imported_chunked == chunked + + # Casting to something else should error + requested_type = pa.binary() + requested_capsule = requested_type.__arrow_c_schema__() + with pytest.raises(NotImplementedError): + chunked.__arrow_c_stream__(requested_capsule) + + @needs_cffi def test_export_import_device_array(): c_schema = ffi.new("struct ArrowSchema*") diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index d8c792ef00c6b..fe38bf651baae 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1318,39 +1318,120 @@ def test_tensor_type(): assert tensor_type.permutation is None -def test_tensor_class_methods(): - tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3]) - storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]], - pa.list_(pa.float32(), 6)) +@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32())) +def test_tensor_class_methods(value_type): + from numpy.lib.stride_tricks import as_strided + arrow_type = pa.from_numpy_dtype(value_type) + + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 3]) + storage = pa.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + pa.list_(arrow_type, 6)) arr = pa.ExtensionArray.from_storage(tensor_type, storage) expected = np.array( - [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32) - result = arr.to_numpy_ndarray() + [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=value_type) + np.testing.assert_array_equal(arr.to_tensor(), expected) + 
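# Sketch of the new PyCapsule stream protocol on ChunkedArray exercised by the
# tests above: a chunked array can be exported and re-imported without going
# through a Table.
import pyarrow as pa

chunked = pa.chunked_array([["a", "b"], ["c"]])
capsule = chunked.__arrow_c_stream__()
roundtripped = pa.ChunkedArray._import_from_c_capsule(capsule)
assert roundtripped == chunked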
np.testing.assert_array_equal(arr.to_numpy_ndarray(), expected) + + expected = np.array([[[7, 8, 9], [10, 11, 12]]], dtype=value_type) + result = arr[1:].to_numpy_ndarray() np.testing.assert_array_equal(result, expected) - expected = np.array([[[1, 2, 3], [4, 5, 6]]], dtype=np.float32) - result = arr[:1].to_numpy_ndarray() + values = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]] + flat_arr = np.array(values[0], dtype=value_type) + bw = value_type.itemsize + storage = pa.array(values, pa.list_(arrow_type, 12)) + + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 1, 2]) + result = pa.ExtensionArray.from_storage(tensor_type, storage) + expected = np.array( + [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]], dtype=value_type) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + result = flat_arr.reshape(1, 2, 3, 2) + expected = np.array( + [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=value_type) np.testing.assert_array_equal(result, expected) - arr = np.array( - [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], - dtype=np.float32, order="C") + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 2, 1]) + result = pa.ExtensionArray.from_storage(tensor_type, storage) + expected = as_strided(flat_arr, shape=(1, 2, 3, 2), + strides=(bw * 12, bw * 6, bw, bw * 3)) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[2, 0, 1]) + result = pa.ExtensionArray.from_storage(tensor_type, storage) + expected = as_strided(flat_arr, shape=(1, 3, 2, 2), + strides=(bw * 12, bw, bw * 6, bw * 2)) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + assert result.type.permutation == [2, 0, 1] + assert result.type.shape == [2, 2, 3] + assert result.to_tensor().shape == (1, 3, 2, 2) + assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, 2 * bw) + + +@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32())) +def test_tensor_array_from_numpy(value_type): + from numpy.lib.stride_tricks import as_strided + arrow_type = pa.from_numpy_dtype(value_type) + + arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], + dtype=value_type, order="C") tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) assert isinstance(tensor_array_from_numpy.type, pa.FixedShapeTensorType) - assert tensor_array_from_numpy.type.value_type == pa.float32() + assert tensor_array_from_numpy.type.value_type == arrow_type assert tensor_array_from_numpy.type.shape == [2, 3] - arr = np.array( - [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], - dtype=np.float32, order="F") - with pytest.raises(ValueError, match="C-style contiguous segment"): + arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], + dtype=value_type, order="F") + with pytest.raises(ValueError, match="First stride needs to be largest"): pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 2, 1]) - storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], pa.list_(pa.int8(), 12)) - arr = pa.ExtensionArray.from_storage(tensor_type, storage) - with pytest.raises(ValueError, match="non-permuted tensors"): - arr.to_numpy_ndarray() + flat_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=value_type) + bw = value_type.itemsize + + arr = flat_arr.reshape(1, 3, 4) + tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + 
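# Sketch reproducing the float32 strided case of test_tensor_array_from_numpy:
# from_numpy_ndarray now infers shape and permutation from the ndarray strides,
# so non-C-contiguous input is accepted as long as the first axis has the
# largest stride.
import numpy as np
from numpy.lib.stride_tricks import as_strided
import pyarrow as pa

flat = np.arange(1, 13, dtype=np.float32)
strided = as_strided(flat, shape=(1, 2, 3, 2), strides=(48, 24, 4, 12))
fsta = pa.FixedShapeTensorArray.from_numpy_ndarray(strided)
assert fsta.type.shape == [2, 2, 3]
assert fsta.type.permutation == [0, 2, 1]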
assert tensor_array_from_numpy.type.shape == [3, 4] + assert tensor_array_from_numpy.type.permutation == [0, 1] + assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr) + + arr = as_strided(flat_arr, shape=(1, 2, 3, 2), + strides=(bw * 12, bw * 6, bw, bw * 3)) + tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + assert tensor_array_from_numpy.type.shape == [2, 2, 3] + assert tensor_array_from_numpy.type.permutation == [0, 2, 1] + assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr) + + arr = flat_arr.reshape(1, 2, 3, 2) + result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + expected = np.array( + [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=value_type) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + arr = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], dtype=value_type) + expected = arr[1:] + result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)[1:].to_numpy_ndarray() + np.testing.assert_array_equal(result, expected) + + arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=value_type) + with pytest.raises(ValueError, match="Cannot convert 1D array or scalar to fixed"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + + arr = np.array(1, dtype=value_type) + with pytest.raises(ValueError, match="Cannot convert 1D array or scalar to fixed"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + + arr = np.array([], dtype=value_type) + + with pytest.raises(ValueError, match="Cannot convert 1D array or scalar to fixed"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr.reshape((0))) + + with pytest.raises(ValueError, match="Expected a non-empty ndarray"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr.reshape((0, 3, 2))) + + with pytest.raises(ValueError, match="Expected a non-empty ndarray"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr.reshape((3, 0, 2))) @pytest.mark.parametrize("tensor_type", ( diff --git a/python/pyarrow/tests/test_memory.py b/python/pyarrow/tests/test_memory.py index d9fdeb152c35e..4f199952344f2 100644 --- a/python/pyarrow/tests/test_memory.py +++ b/python/pyarrow/tests/test_memory.py @@ -243,13 +243,35 @@ def test_debug_memory_pool_warn(pool_factory): assert "Wrong size on deallocation" in res.stderr -@pytest.mark.parametrize('pool_factory', supported_factories()) -def test_debug_memory_pool_disabled(pool_factory): - res = run_debug_memory_pool(pool_factory.__name__, "") +def check_debug_memory_pool_disabled(pool_factory, env_value, msg): + res = run_debug_memory_pool(pool_factory.__name__, env_value) # The subprocess either returned successfully or was killed by a signal # (due to writing out of bounds), depending on the underlying allocator. if os.name == "posix": assert res.returncode <= 0 else: res.check_returncode() - assert res.stderr == "" + if msg == "": + assert res.stderr == "" + else: + assert msg in res.stderr + + +@pytest.mark.parametrize('pool_factory', supported_factories()) +def test_debug_memory_pool_none(pool_factory): + check_debug_memory_pool_disabled(pool_factory, "none", "") + + +@pytest.mark.parametrize('pool_factory', supported_factories()) +def test_debug_memory_pool_empty(pool_factory): + check_debug_memory_pool_disabled(pool_factory, "", "") + + +@pytest.mark.parametrize('pool_factory', supported_factories()) +def test_debug_memory_pool_unknown(pool_factory): + env_value = "some_arbitrary_value" + msg = ( + f"Invalid value for ARROW_DEBUG_MEMORY_POOL: '{env_value}'. 
" + "Valid values are 'abort', 'trap', 'warn', 'none'." + ) + check_debug_memory_pool_disabled(pool_factory, env_value, msg) diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 8cec8783280dd..39dac4eb81dfb 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -154,6 +154,8 @@ def test_set_timezone_db_path_non_windows(): pa.ListType, pa.LargeListType, pa.FixedSizeListType, + pa.ListViewType, + pa.LargeListViewType, pa.UnionType, pa.SparseUnionType, pa.DenseUnionType, @@ -227,6 +229,8 @@ def test_set_timezone_db_path_non_windows(): pa.StringViewScalar, pa.ListScalar, pa.LargeListScalar, + pa.ListViewScalar, + pa.LargeListViewScalar, pa.MapScalar, pa.FixedSizeListScalar, pa.UnionScalar, diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 8106219057efe..676cc96151161 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3650,7 +3650,8 @@ def test_singleton_blocks_zero_copy(): prior_allocation = pa.total_allocated_bytes() result = t.to_pandas() - assert result['f0'].values.flags.writeable + # access private `_values` because the public `values` is made read-only by pandas + assert result['f0']._values.flags.writeable assert pa.total_allocated_bytes() > prior_allocation diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index eed5f045be945..074fb757e265a 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -57,6 +57,9 @@ ([1, 2, 3], None, pa.ListScalar), ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar), ([1, 2, 3, 4, 5], pa.list_(pa.int8(), 5), pa.FixedSizeListScalar), + # TODO GH-39855 + # ([1, 2, 3], pa.list_view(pa.int8()), pa.ListViewScalar), + # ([1, 2, 3, 4], pa.large_list_view(pa.int8()), pa.LargeListViewScalar), (datetime.date.today(), None, pa.Date32Scalar), (datetime.date.today(), pa.date64(), pa.Date64Scalar), (datetime.datetime.now(), None, pa.TimestampScalar), @@ -537,7 +540,10 @@ def test_fixed_size_binary(): @pytest.mark.parametrize(('ty', 'klass'), [ (pa.list_(pa.string()), pa.ListScalar), - (pa.large_list(pa.string()), pa.LargeListScalar) + (pa.large_list(pa.string()), pa.LargeListScalar), + # TODO GH-39855 + # (pa.list_view(pa.string()), pa.ListViewScalar), + # (pa.large_list_view(pa.string()), pa.LargeListViewScalar) ]) def test_list(ty, klass): v = ['foo', None] diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index a5ab3128dc874..0add5786088d3 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -66,6 +66,8 @@ def get_many_types(): pa.list_(pa.int32()), pa.list_(pa.int32(), 2), pa.large_list(pa.uint16()), + pa.list_view(pa.int32()), + pa.large_list_view(pa.uint16()), pa.map_(pa.string(), pa.int32()), pa.map_(pa.field('key', pa.int32(), nullable=False), pa.field('value', pa.int32())), @@ -169,6 +171,18 @@ def test_is_list(): assert not types.is_list(pa.int32()) +def test_is_list_view(): + a = pa.list_view(pa.int32()) + b = pa.large_list_view(pa.int32()) + + assert types.is_list_view(a) + assert not types.is_large_list_view(a) + assert not types.is_list(a) + assert types.is_large_list_view(b) + assert not types.is_list_view(b) + assert not types.is_large_list(b) + + def test_is_map(): m = pa.map_(pa.utf8(), pa.int32()) @@ -573,6 +587,41 @@ def test_large_list_type(): pa.large_list(None) +def test_list_view_type(): + ty = pa.list_view(pa.int64()) + assert 
isinstance(ty, pa.ListViewType) + assert ty.value_type == pa.int64() + assert ty.value_field == pa.field("item", pa.int64(), nullable=True) + + # nullability matters in comparison + ty_non_nullable = pa.list_view(pa.field("item", pa.int64(), nullable=False)) + assert ty != ty_non_nullable + + # field names don't matter by default + ty_named = pa.list_view(pa.field("element", pa.int64())) + assert ty == ty_named + assert not ty.equals(ty_named, check_metadata=True) + + # metadata doesn't matter by default + ty_metadata = pa.list_view( + pa.field("item", pa.int64(), metadata={"hello": "world"})) + assert ty == ty_metadata + assert not ty.equals(ty_metadata, check_metadata=True) + + with pytest.raises(TypeError): + pa.list_view(None) + + +def test_large_list_view_type(): + ty = pa.large_list_view(pa.utf8()) + assert isinstance(ty, pa.LargeListViewType) + assert ty.value_type == pa.utf8() + assert ty.value_field == pa.field("item", pa.utf8(), nullable=True) + + with pytest.raises(TypeError): + pa.large_list_view(None) + + def test_map_type(): ty = pa.map_(pa.utf8(), pa.int32()) assert isinstance(ty, pa.MapType) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index ce3736b5af847..e9bf56c6213f6 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -15,7 +15,13 @@ # specific language governing permissions and limitations # under the License. -from cpython.pycapsule cimport PyCapsule_CheckExact, PyCapsule_GetPointer, PyCapsule_New, PyCapsule_IsValid +from cpython.pycapsule cimport ( + PyCapsule_CheckExact, + PyCapsule_GetPointer, + PyCapsule_GetName, + PyCapsule_New, + PyCapsule_IsValid +) import atexit from collections.abc import Mapping @@ -105,6 +111,7 @@ cdef void* _as_c_pointer(v, allow_null=False) except *: (the latter for compatibility with raw pointers exported by reticulate) """ cdef void* c_ptr + cdef const char* capsule_name if isinstance(v, int): c_ptr = v elif isinstance(v, float): @@ -114,7 +121,20 @@ cdef void* _as_c_pointer(v, allow_null=False) except *: "Arrow library", UserWarning, stacklevel=2) c_ptr = v elif PyCapsule_CheckExact(v): - c_ptr = PyCapsule_GetPointer(v, NULL) + # An R external pointer was how the R bindings passed pointer values to + # Python from versions 7 to 15 (inclusive); however, the reticulate 1.35.0 + # update changed the name of the capsule from NULL to "r_extptr". + # Newer versions of the R package pass a Python integer; however, this + # workaround ensures that old versions of the R package continue to work + # with newer versions of pyarrow. + capsule_name = PyCapsule_GetName(v) + if capsule_name == NULL or capsule_name == b"r_extptr": + c_ptr = PyCapsule_GetPointer(v, capsule_name) + else: + capsule_name_str = capsule_name.decode() + raise ValueError( + f"Can't convert PyCapsule with name '{capsule_name_str}' to pointer address" + ) else: raise TypeError(f"Expected a pointer value, got {type(v)!r}") if not allow_null and c_ptr == NULL: @@ -557,6 +577,101 @@ cdef class LargeListType(DataType): return pyarrow_wrap_data_type(self.list_type.value_type()) +cdef class ListViewType(DataType): + """ + Concrete class for list view data types. 
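# Sketch based on test_list_view_type above: list_view() accepts a DataType or a
# Field; differing field names are ignored by plain equality but matter once
# check_metadata is requested.
import pyarrow as pa

ty = pa.list_view(pa.int64())
named = pa.list_view(pa.field("element", pa.int64()))
assert ty.value_field == pa.field("item", pa.int64())
assert ty == named
assert not ty.equals(named, check_metadata=True)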
+ + Examples + -------- + Create an instance of ListViewType: + + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list_view) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + DataType.init(self, type) + self.list_view_type = type.get() + + def __reduce__(self): + return list_view, (self.value_field,) + + @property + def value_field(self): + """ + The field for list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_field + pyarrow.Field + """ + return pyarrow_wrap_field(self.list_view_type.value_field()) + + @property + def value_type(self): + """ + The data type of list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_type + DataType(string) + """ + return pyarrow_wrap_data_type(self.list_view_type.value_type()) + + +cdef class LargeListViewType(DataType): + """ + Concrete class for large list view data types + (like ListViewType, but with 64-bit offsets). + + Examples + -------- + Create an instance of LargeListViewType: + + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()) + LargeListViewType(large_list_view) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + DataType.init(self, type) + self.list_view_type = type.get() + + def __reduce__(self): + return large_list_view, (self.value_field,) + + @property + def value_field(self): + """ + The field for large list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_field + pyarrow.Field + """ + return pyarrow_wrap_field(self.list_view_type.value_field()) + + @property + def value_type(self): + """ + The data type of large list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_type + DataType(string) + """ + return pyarrow_wrap_data_type(self.list_view_type.value_type()) + + cdef class MapType(DataType): """ Concrete class for map data types. @@ -1658,20 +1773,6 @@ cdef class FixedShapeTensorType(BaseExtensionType): else: return None - def __arrow_ext_serialize__(self): - """ - Serialized representation of metadata to reconstruct the type object. - """ - return self.tensor_ext_type.Serialize() - - @classmethod - def __arrow_ext_deserialize__(self, storage_type, serialized): - """ - Return an FixedShapeTensor type instance from the storage type and serialized - metadata. - """ - return self.tensor_ext_type.Deserialize(storage_type, serialized) - def __arrow_ext_class__(self): return FixedShapeTensorArray @@ -1679,6 +1780,9 @@ cdef class FixedShapeTensorType(BaseExtensionType): return fixed_shape_tensor, (self.value_type, self.shape, self.dim_names, self.permutation) + def __arrow_ext_scalar_class__(self): + return FixedShapeTensorScalar + _py_extension_type_auto_load = False @@ -4528,6 +4632,82 @@ cpdef LargeListType large_list(value_type): return out +cpdef ListViewType list_view(value_type): + """ + Create ListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. 
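# Sketch: both view types define __reduce__ in terms of their value field (shown
# above), so instances should survive pickling like the other nested types.
import pickle
import pyarrow as pa

for ty in (pa.list_view(pa.string()), pa.large_list_view(pa.string())):
    assert pickle.loads(pickle.dumps(ty)) == ty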
+ + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of ListViewType: + + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list_view) + """ + cdef: + Field _field + shared_ptr[CDataType] list_view_type + + if isinstance(value_type, DataType): + _field = field('item', value_type) + elif isinstance(value_type, Field): + _field = value_type + else: + raise TypeError('ListView requires DataType or Field') + + list_view_type = CMakeListViewType(_field.sp_field) + return pyarrow_wrap_data_type(list_view_type) + + +cpdef LargeListViewType large_list_view(value_type): + """ + Create LargeListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. + + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of LargeListViewType: + + >>> import pyarrow as pa + >>> pa.large_list_view(pa.int8()) + LargeListViewType(large_list_view) + """ + cdef: + Field _field + shared_ptr[CDataType] list_view_type + + if isinstance(value_type, DataType): + _field = field('item', value_type) + elif isinstance(value_type, Field): + _field = value_type + else: + raise TypeError('LargeListView requires DataType or Field') + + list_view_type = CMakeLargeListViewType(_field.sp_field) + return pyarrow_wrap_data_type(list_view_type) + + cpdef MapType map_(key_type, item_type, keys_sorted=False): """ Create MapType instance from key and item data types or fields. @@ -4976,8 +5156,9 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N cdef FixedShapeTensorType out = FixedShapeTensorType.__new__(FixedShapeTensorType) - c_tensor_ext_type = GetResultValue(CFixedShapeTensorType.Make( - value_type.sp_type, c_shape, c_permutation, c_dim_names)) + with nogil: + c_tensor_ext_type = GetResultValue(CFixedShapeTensorType.Make( + value_type.sp_type, c_shape, c_permutation, c_dim_names)) out.init(c_tensor_ext_type) diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 32398dac9c5f5..0f68ca9fe574b 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -151,6 +151,16 @@ def is_fixed_size_list(t): return t.id == lib.Type_FIXED_SIZE_LIST +@doc(is_null, datatype="list view") +def is_list_view(t): + return t.id == lib.Type_LIST_VIEW + + +@doc(is_null, datatype="large list view") +def is_large_list_view(t): + return t.id == lib.Type_LARGE_LIST_VIEW + + @doc(is_null, datatype="struct") def is_struct(t): return t.id == lib.Type_STRUCT diff --git a/r/PACKAGING.md b/r/PACKAGING.md index 7f42ecf562e59..4edeb4f2130cc 100644 --- a/r/PACKAGING.md +++ b/r/PACKAGING.md @@ -26,6 +26,7 @@ For a high-level overview of the release process see the ## Before the release candidate is cut - [ ] [Create a GitHub issue](https://github.com/apache/arrow/issues/new/) entitled `[R] CRAN packaging checklist for version X.X.X` and copy this checklist to the issue. +- [ ] Review deprecated functions to advance their deprecation status, including removing preprocessor directives that no longer apply (search for `ARROW_VERSION_MAJOR` in r/src). - [ ] Evaluate the status of any failing [nightly tests and nightly packaging builds](http://crossbow.voltrondata.com). 
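# Sketch: the new predicates added to pyarrow.types distinguish the view layouts
# from the classic offset-based list types.
import pyarrow as pa
import pyarrow.types as types

assert types.is_list_view(pa.list_view(pa.int32()))
assert types.is_large_list_view(pa.large_list_view(pa.uint16()))
assert not types.is_list(pa.list_view(pa.int32()))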
These checks replicate most of the checks that CRAN runs, so we need them all to be passing or to understand that the failures may (though won't necessarily) result in a rejection from CRAN. - [ ] Check [current CRAN check results](https://cran.rstudio.org/web/checks/check_results_arrow.html) - [ ] Ensure the contents of the README are accurate and up to date. diff --git a/r/R/python.R b/r/R/python.R index 023d914f16a9e..1159806bf7c25 100644 --- a/r/R/python.R +++ b/r/R/python.R @@ -339,15 +339,9 @@ install_pyarrow <- function(envname = NULL, nightly = FALSE, ...) { } pyarrow_compatible_pointer <- function(ptr) { - pa <- reticulate::import("pyarrow") - version_string <- pa$`__version__` - # remove trailing .devXXX because it won't work with package_version() - pyarrow_version <- package_version(gsub("\\.dev.*?$", "", version_string)) - - # pyarrow pointers changed in version 7.0.0 - if (pyarrow_version >= "7.0.0") { - return(ptr) - } else { - return(external_pointer_addr_double(ptr)) - } + # GH-39933: Workaround because there is no built-in way to send a + # 64-bit integer to Python from an R object + py <- reticulate::import_builtins(convert = FALSE) + addr <- external_pointer_addr_character(ptr) + py$int(addr) } diff --git a/r/configure.win b/r/configure.win index 2d9e5cdf54e44..b6ac19faea2d4 100755 --- a/r/configure.win +++ b/r/configure.win @@ -17,33 +17,58 @@ # specific language governing permissions and limitations # under the License. +: ${PKG_CONFIG:="pkg-config"} +# Library settings +PKG_CONFIG_NAME="arrow" +PKG_TEST_HEADER="" + +VERSION=`grep '^Version' DESCRIPTION | sed s/Version:\ //` + +# Development mode, also increases verbosity in the bundled build +ARROW_R_DEV=`echo $ARROW_R_DEV | tr '[:upper:]' '[:lower:]'` +# If present, `pkg-config` will be used to find libarrow on the system, +# unless this is set to false +ARROW_USE_PKG_CONFIG=`echo $ARROW_USE_PKG_CONFIG | tr '[:upper:]' '[:lower:]'` # generate code -if [ "$ARROW_R_DEV" == "TRUE" ]; then +if [ "$ARROW_R_DEV" == "true" ]; then echo "*** Generating code with data-raw/codegen.R" "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" data-raw/codegen.R fi -OPENSSL_LIBS="-lcrypto -lcrypt32" -MIMALLOC_LIBS="-lbcrypt -lpsapi" -BROTLI_LIBS="-lbrotlienc -lbrotlidec -lbrotlicommon" # Common goes last since dec and enc depend on it -AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-management \ - -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 \ - -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common \ - -lUserenv -lversion -lws2_32 -lBcrypt -lWininet -lwinhttp" -# pkg-config --libs libcurl -GCS_LIBS="-lcurl -lnormaliz -lssh2 -lgdi32 -lssl -lcrypto -lcrypt32 -lwldap32 \ - -lz -lws2_32 -lnghttp2 -ldbghelp" +# Test if pkg-config is available to use +if ${PKG_CONFIG} --version >/dev/null 2>&1; then + PKG_CONFIG_AVAILABLE="true" + echo "*** pkg-config found." +else + echo "*** pkg-config not found." + PKG_CONFIG_AVAILABLE="false" + ARROW_USE_PKG_CONFIG="false" +fi -function configure_release() { - VERSION=$(grep ^Version DESCRIPTION | sed s/Version:\ //) + +function configure_binaries() { # Try to find/download a C++ Arrow binary, "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" "tools/nixlibs.R" $VERSION # If binary not found, script exits nonzero if [ $? 
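# Hypothetical companion sketch for the R-side change above: pyarrow's C Data
# Interface entry points accept plain integer addresses, which is what
# pyarrow_compatible_pointer() now sends via py$int(). The address computation
# below follows the pattern used in pyarrow's own cffi tests.
import pyarrow as pa
from pyarrow.cffi import ffi

c_array = ffi.new("struct ArrowArray*")
address = int(ffi.cast("uintptr_t", c_array))  # a plain Python int
arr = pa.array([1, 2, 3])
arr._export_to_c(address)
assert pa.Array._import_from_c(address, arr.type) == arr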
-ne 0 ]; then + _LIBARROW_FOUND="false" echo "Arrow C++ library was not found" + # return 0 so set -e doesn't exit the script + return 0 fi + OPENSSL_LIBS="-lcrypto -lcrypt32" + MIMALLOC_LIBS="-lbcrypt -lpsapi" + BROTLI_LIBS="-lbrotlienc -lbrotlidec -lbrotlicommon" # Common goes last since dec and enc depend on it + AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-management \ + -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 \ + -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common \ + -luserenv -lversion -lws2_32 -lbcrypt -lwininet -lwinhttp" + # pkg-config --libs libcurl + GCS_LIBS="-lcurl -lnormaliz -lssh2 -lgdi32 -lssl -lcrypto -lcrypt32 -lwldap32 \ + -lz -lws2_32 -lnghttp2 -ldbghelp" + # Set the right flags to point to and enable arrow/parquet if [ -d "windows/arrow-$VERSION" ]; then RWINLIB="../windows/arrow-$VERSION" @@ -75,12 +100,160 @@ function configure_release() { # It seems that order matters PKG_LIBS="${PKG_LIBS} -lws2_32" fi + +} + +# Once libarrow is obtained, this function sets `PKG_LIBS`, `PKG_DIRS`, and `PKG_CFLAGS` +# either from pkg-config or by inferring things about the directory in $1 +set_pkg_vars () { + set_lib_dir_with_pc + + # Check cmake options for enabled features. This uses LIB_DIR that + # is set by the above set_lib_dir_* call. + add_feature_flags + set_pkg_vars_with_pc + + # Set any user-defined CXXFLAGS + if [ "$ARROW_R_CXXFLAGS" ]; then + PKG_CFLAGS="$PKG_CFLAGS $ARROW_R_CXXFLAGS" + fi + + # We use expr because the product version returns more than just 10.13 and we want to + # match the substring. However, expr always outputs the number of matched characters + # to stdout, to avoid noise in the log we redirect the output to /dev/null + if [ "$UNAME" = "Darwin" ] && expr $(sw_vers -productVersion) : '10\.13' >/dev/null 2>&1; then + # avoid C++17 availability warnings on macOS < 11 + PKG_CFLAGS="$PKG_CFLAGS -D_LIBCPP_DISABLE_AVAILABILITY" + fi +} + +# If we have pkg-config, it will tell us what libarrow needs +set_lib_dir_with_pc () { + LIB_DIR="`${PKG_CONFIG} --variable=libdir ${PKG_CONFIG_NAME}`" +} +set_pkg_vars_with_pc () { + pkg_config_names="${PKG_CONFIG_NAME} ${PKG_CONFIG_NAMES_FEATURES}" + PKG_CFLAGS="`${PKG_CONFIG} --cflags ${pkg_config_names}` $PKG_CFLAGS" + PKG_CFLAGS="$PKG_CFLAGS $PKG_CFLAGS_FEATURES" + PKG_LIBS=`${PKG_CONFIG} --libs-only-l --libs-only-other ${pkg_config_names}` + PKG_LIBS="$PKG_LIBS $PKG_LIBS_FEATURES" + PKG_DIRS=`${PKG_CONFIG} --libs-only-L ${pkg_config_names}` +} + +add_feature_flags () { + PKG_CFLAGS_FEATURES="" + PKG_CONFIG_NAMES_FEATURES="" + PKG_LIBS_FEATURES="" + PKG_LIBS_FEATURES_WITHOUT_PC="" + + # Now we need to check what features it was built with and enable + # the corresponding feature flags in the R bindings (-DARROW_R_WITH_stuff). + # We do this by inspecting ArrowOptions.cmake, which the libarrow build + # generates. + ARROW_OPTS_CMAKE="$LIB_DIR/cmake/Arrow/ArrowOptions.cmake" + if [ ! 
-f "${ARROW_OPTS_CMAKE}" ]; then + echo "*** $ARROW_OPTS_CMAKE not found; some features will not be enabled" + else + if arrow_built_with ARROW_PARQUET; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_PARQUET" + PKG_CONFIG_NAMES_FEATURES="$PKG_CONFIG_NAMES_FEATURES parquet" + PKG_LIBS_FEATURES_WITHOUT_PC="-lparquet $PKG_LIBS_FEATURES_WITHOUT_PC" + # NOTE: parquet is assumed to have the same -L flag as arrow + # so there is no need to add its location to PKG_DIRS + fi + if arrow_built_with ARROW_DATASET; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_DATASET" + PKG_CONFIG_NAMES_FEATURES="$PKG_CONFIG_NAMES_FEATURES arrow-dataset" + PKG_LIBS_FEATURES_WITHOUT_PC="-larrow_dataset $PKG_LIBS_FEATURES_WITHOUT_PC" + # NOTE: arrow_dataset is assumed to have the same -L flag as arrow + # so there is no need to add its location to PKG_DIRS + fi + if arrow_built_with ARROW_ACERO; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_ACERO" + PKG_CONFIG_NAMES_FEATURES="$PKG_CONFIG_NAMES_FEATURES arrow-acero" + PKG_LIBS_FEATURES_WITHOUT_PC="-larrow_acero $PKG_LIBS_FEATURES_WITHOUT_PC" + # NOTE: arrow_acero is assumed to have the same -L flag as arrow + # so there is no need to add its location to PKG_DIRS + fi + if arrow_built_with ARROW_SUBSTRAIT; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_SUBSTRAIT" + PKG_CONFIG_NAMES_FEATURES="$PKG_CONFIG_NAMES_FEATURES arrow-substrait" + PKG_LIBS_FEATURES_WITHOUT_PC="-larrow_substrait $PKG_LIBS_FEATURES_WITHOUT_PC" + # NOTE: arrow_substrait is assumed to have the same -L flag as arrow + # so there is no need to add its location to PKG_DIRS + fi + if arrow_built_with ARROW_JSON; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_JSON" + fi + if arrow_built_with ARROW_S3; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_S3" + fi + if arrow_built_with ARROW_GCS; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_GCS" + fi + if arrow_built_with ARROW_GCS || arrow_built_with ARROW_S3; then + # If pkg-config is available it will handle this for us automatically + SSL_LIBS_WITHOUT_PC="-lcurl -lssl -lcrypto" + fi + fi +} + + +arrow_built_with() { + # Function to check cmake options for features + grep -i 'set('"$1"' "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 +} + +function configure_rtools() { + # Use pkg-config to find arrow from rtools + _LIBARROW_PREFIX="`${PKG_CONFIG} --variable=prefix ${PKG_CONFIG_NAME}`" + _LIBARROW_FOUND="true" + echo "*** Trying Arrow C++ found by pkg-config: $_LIBARROW_PREFIX" + + PC_LIB_VERSION=`${PKG_CONFIG} --modversion ${PKG_CONFIG_NAME}` + # This is in an R script for convenience and testability. + # Success means the found C++ library is ok to use. + # Error means the versions don't line up and we shouldn't use it. + # More specific messaging to the user is in the R script + if ! ${R_HOME}/bin/Rscript tools/check-versions.R $VERSION $PC_LIB_VERSION 2> /dev/null; then + _LIBARROW_FOUND="false" + fi + + # We should have a valid libarrow build in $_LIBARROW_FOUND +# Now set `PKG_LIBS`, `PKG_DIRS`, and `PKG_CFLAGS` based on that. 
+if [ "$_LIBARROW_FOUND" == "true" ]; then + set_pkg_vars ${_LIBARROW_PREFIX} + # add mingw specific windows flags + PKG_LIBS="$PKG_LIBS -lws2_32 -lole32 -lwldap32 -lsecur32 -lncrypt -lcrypt32 -lshlwapi" + # override -fno-exceptions from aws-cpp-sdk pc file + PKG_CFLAGS="$PKG_CFLAGS -fexceptions" +else + # To make it easier to debug which code path was taken add a specific + # message to the log in addition to the 'NOTE' + echo "*** Failed to find Arrow C++ libraries in rtools" +fi +} + +function configure_release() { + if [ "$ARROW_USE_PKG_CONFIG" != "false" ] && $PKG_CONFIG --exists $PKG_CONFIG_NAME; then + configure_rtools + else + configure_binaries + fi + + if [ "$_LIBARROW_FOUND" == "false" ]; then + echo "------------------------- NOTE ---------------------------" + echo "There was an issue preparing the Arrow C++ libraries." + echo "See https://arrow.apache.org/docs/r/articles/install.html" + echo "----------------------------------------------------------" + exit 1 + fi } # Returns 1 if CMAKE options is set "ON", otherwise 0 function cmake_option() { ARROW_OPTS_CMAKE="$ARROW_HOME/lib/cmake/Arrow/ArrowOptions.cmake" - grep -cm1 "set($1 \"ON\")" $ARROW_OPTS_CMAKE + arrow_built_with $1 } function configure_dev() { diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index d2db11e14a787..a81210f0ad914 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -1050,6 +1050,7 @@ class RDictionaryConverter> template struct RConverterTrait; +#if ARROW_VERSION_MAJOR >= 15 template struct RConverterTrait< T, enable_if_t::value && !is_interval_type::value && @@ -1061,6 +1062,14 @@ template struct RConverterTrait> { // not implemented }; +#else +template +struct RConverterTrait< + T, enable_if_t::value && !is_interval_type::value && + !is_extension_type::value>> { + using type = RPrimitiveConverter; +}; +#endif template struct RConverterTrait> { diff --git a/r/tools/check-versions.R b/r/tools/check-versions.R index 3d8cbf02a14c9..34b2ef680c547 100644 --- a/r/tools/check-versions.R +++ b/r/tools/check-versions.R @@ -20,6 +20,20 @@ args <- commandArgs(TRUE) # TESTING is set in test-check-version.R; it won't be set when called from configure test_mode <- exists("TESTING") +release_version_supported <- function(r_version, cpp_version) { + r_version <- package_version(r_version) + cpp_version <- package_version(cpp_version) + major <- function(x) as.numeric(x[1, 1]) + minimum_cpp_version <- package_version("13.0.0") + + allow_mismatch <- identical(tolower(Sys.getenv("ARROW_R_ALLOW_CPP_VERSION_MISMATCH", "false")), "true") + # If we allow a version mismatch we still need to cover the minimum version (13.0.0 for now) + # we don't allow newer C++ versions as new features without additional feature gates are likely to + # break the R package + version_valid <- cpp_version >= minimum_cpp_version && major(cpp_version) <= major(r_version) + allow_mismatch && version_valid || major(r_version) == major(cpp_version) +} + check_versions <- function(r_version, cpp_version) { r_parsed <- package_version(r_version) r_dev_version <- r_parsed[1, 4] @@ -39,20 +53,10 @@ check_versions <- function(r_version, cpp_version) { "*** > or retry with FORCE_BUNDLED_BUILD=true" ) cat(paste0(msg, "\n", collapse = "")) - } else if (r_is_patch && as.character(r_parsed[1, 1:3]) == cpp_version) { - # Patch releases we do for CRAN feedback get an extra x.y.z.1 version. 
-    # These should work with the x.y.z C++ library (which never has .1 added)
-    cat(
-      sprintf(
-        "*** > Using C++ library version %s with R package %s\n",
-        cpp_version,
-        r_version
-      )
-    )
-  } else if (r_version != cpp_version) {
+  } else if (cpp_is_dev || !release_version_supported(r_version, cpp_parsed)) {
     cat(
       sprintf(
-        "**** Not using: C++ library version (%s) does not match R package (%s)\n",
+        "**** Not using: C++ library version (%s): not supported by R package version %s\n",
         cpp_version,
         r_version
       )
@@ -61,7 +65,12 @@ check_versions <- function(r_version, cpp_version) {
   # Add ALLOW_VERSION_MISMATCH env var to override stop()? (Could be useful for debugging)
   } else {
     # OK
-    cat(sprintf("**** C++ and R library versions match: %s\n", cpp_version))
+    cat(
+      sprintf(
+        "**** C++ library version %s is supported by R version %s\n",
+        cpp_version, r_version
+      )
+    )
   }
 }
diff --git a/r/tools/test-check-versions.R b/r/tools/test-check-versions.R
index 9c284507b8801..f558648bed1e3 100644
--- a/r/tools/test-check-versions.R
+++ b/r/tools/test-check-versions.R
@@ -24,10 +24,10 @@ TESTING <- TRUE
 
 source("check-versions.R", local = TRUE)
 
-test_that("check_versions", {
+test_that("check_versions without mismatch", {
   expect_output(
     check_versions("10.0.0", "10.0.0"),
-    "**** C++ and R library versions match: 10.0.0",
+    "**** C++ library version 10.0.0 is supported by R version 10.0.0",
     fixed = TRUE
   )
   expect_output(
@@ -35,7 +35,7 @@ test_that("check_versions", {
      check_versions("10.0.0", "10.0.0-SNAPSHOT"),
      "version mismatch"
    ),
-    "**** Not using: C++ library version (10.0.0-SNAPSHOT) does not match R package (10.0.0)",
+    "**** Not using: C++ library version (10.0.0-SNAPSHOT): not supported by R package version 10.0.0",
     fixed = TRUE
   )
   expect_output(
@@ -43,20 +43,12 @@ test_that("check_versions", {
      check_versions("10.0.0.9000", "10.0.0-SNAPSHOT"),
      "version mismatch"
    ),
-    "**** Not using: C++ library version (10.0.0-SNAPSHOT) does not match R package (10.0.0.9000)",
-    fixed = TRUE
-  )
-  expect_output(
-    expect_error(
-      check_versions("10.0.0.9000", "10.0.0"),
-      "version mismatch"
-    ),
-    "**** Not using: C++ library version (10.0.0) does not match R package (10.0.0.9000)",
+    "**** Not using: C++ library version (10.0.0-SNAPSHOT): not supported by R package version 10.0.0.9000",
     fixed = TRUE
   )
   expect_output(
     check_versions("10.0.0.3", "10.0.0"),
-    "*** > Using C++ library version 10.0.0 with R package 10.0.0.3",
+    "**** C++ library version 10.0.0 is supported by R version 10.0.0.3",
     fixed = TRUE
   )
   expect_output(
@@ -65,3 +57,25 @@ test_that("check_versions", {
     fixed = TRUE
   )
 })
+
+test_that("check_versions with mismatch", {
+  withr::local_envvar(.new = c(ARROW_R_ALLOW_CPP_VERSION_MISMATCH = "false"))
+
+  expect_false(
+    release_version_supported("15.0.0", "13.0.0")
+  )
+
+  withr::local_envvar(.new = c(ARROW_R_ALLOW_CPP_VERSION_MISMATCH = "true"))
+
+  expect_true(
+    release_version_supported("15.0.0", "13.0.0")
+  )
+
+  expect_false(
+    release_version_supported("15.0.0", "16.0.0")
+  )
+
+  expect_false(
+    release_version_supported("15.0.0", "12.0.0")
+  )
+})